2023-12-08 01:53:11

by Qais Yousef

Subject: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

Due to the way the code is structured, it makes a lot of sense to trigger
cpufreq_update_util() from update_load_avg(). But this is too aggressive,
as in most cases we are iterating through entities in a loop to
update_load_avg() in the hierarchy. So we end up sending too many
requests in a loop as we're updating the hierarchy.

Combined with the rate limit in schedutil, we could end up prematurely
sending a wrong frequency update before we have actually updated all
entities appropriately.

Be smarter about it by limiting the trigger to perform frequency updates
to after all accounting logic is done. This ends up being at the
following points:

1. enqueue/dequeue_task_fair()
2. throttle/unthrottle_cfs_rq()
3. attach/detach_task_cfs_rq()
4. task_tick_fair()
5. __sched_group_set_shares()

This is still not 100% ideal due to other limitations that might be
a bit harder to handle. Namely, we can end up with a premature update
request in the following situations:

a. Simultaneous task enqueue on the CPU where the 2nd task is bigger and
requires a higher freq. The trigger of cpufreq_update_util() by the
first task will lead to dropping the 2nd request until the tick, or
until another CPU in the same policy triggers a freq update.

b. CPUs sharing a policy can end up with the same race as in (a), but the
simultaneous enqueue happens on different CPUs in the same policy.

The above, though, are limitations in the governor/hardware, and from the
scheduler's point of view at least that's the best we can do. The
governor might consider smarter logic to aggregate near-simultaneous
requests and honour the higher one.

Signed-off-by: Qais Yousef (Google) <[email protected]>
---
kernel/sched/fair.c | 55 ++++++++++++---------------------------------
1 file changed, 14 insertions(+), 41 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b83448be3f79..f99910fc6705 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3997,29 +3997,6 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
-{
- struct rq *rq = rq_of(cfs_rq);
-
- if (&rq->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util_cfs().
- */
- cpufreq_update_util(rq, flags);
- }
-}
-
#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
@@ -4648,8 +4625,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

- cfs_rq_util_change(cfs_rq, 0);
-
trace_pelt_cfs_tp(cfs_rq);
}

@@ -4678,8 +4653,6 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s

add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

- cfs_rq_util_change(cfs_rq, 0);
-
trace_pelt_cfs_tp(cfs_rq);
}

@@ -4726,11 +4699,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
- } else if (decayed) {
- cfs_rq_util_change(cfs_rq, 0);
-
- if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq);
+ } else if (decayed && (flags & UPDATE_TG)) {
+ update_tg_load_avg(cfs_rq);
}
}

@@ -5114,7 +5084,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)

static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cfs_rq_util_change(cfs_rq, 0);
}

static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -5807,6 +5776,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
sub_nr_running(rq, task_delta);

done:
+ cpufreq_update_util(rq, 0);
+
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5899,6 +5870,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

+ cpufreq_update_util(rq, 0);
+
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
@@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
util_est_enqueue(&rq->cfs, p);

- /*
- * If in_iowait is set, the code below may not trigger any cpufreq
- * utilization updates, so do it here explicitly with the IOWAIT flag
- * passed.
- */
- if (p->in_iowait)
- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
-
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

+ cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
+
hrtick_update(rq);
}

@@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
+ cpufreq_update_util(rq, 0);
hrtick_update(rq);
}

@@ -8482,6 +8450,7 @@ done: __maybe_unused;

update_misfit_status(p, rq);
sched_fair_update_stop_tick(rq, p);
+ cpufreq_update_util(rq, 0);

return p;

@@ -12615,6 +12584,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+ cpufreq_update_util(rq, 0);

task_tick_core(rq, curr);
}
@@ -12739,6 +12709,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

detach_entity_cfs_rq(se);
+ cpufreq_update_util(task_rq(p), 0);
}

static void attach_task_cfs_rq(struct task_struct *p)
@@ -12746,6 +12717,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

attach_entity_cfs_rq(se);
+ cpufreq_update_util(task_rq(p), 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12991,6 +12963,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
update_cfs_group(se);
}
+ cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}

--
2.34.1


2023-12-08 10:04:53

by Lukasz Luba

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

Hi Qais,

On 12/8/23 01:52, Qais Yousef wrote:

[snip]

> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> - /*
> - * If in_iowait is set, the code below may not trigger any cpufreq
> - * utilization updates, so do it here explicitly with the IOWAIT flag
> - * passed.
> - */
> - if (p->in_iowait)
> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> -

Why is this io wait boost considered 'aggressive' calling, as the
$subject says?

Regards,
Lukasz

2023-12-10 21:36:25

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/08/23 10:05, Lukasz Luba wrote:
> Hi Qais,
>
> On 12/8/23 01:52, Qais Yousef wrote:
>
> [snip]
>
> > @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > */
> > util_est_enqueue(&rq->cfs, p);
> > - /*
> > - * If in_iowait is set, the code below may not trigger any cpufreq
> > - * utilization updates, so do it here explicitly with the IOWAIT flag
> > - * passed.
> > - */
> > - if (p->in_iowait)
> > - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> > -
>
> Why this io wait boost is considered as the $subject says 'aggressive'
> calling?

This will trigger a frequency update along with the iowait boost. Did I miss
something?


Cheers

--
Qais Yousef

2023-12-11 07:55:24

by Lukasz Luba

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()



On 12/10/23 20:51, Qais Yousef wrote:
> On 12/08/23 10:05, Lukasz Luba wrote:
>> Hi Qais,
>>
>> On 12/8/23 01:52, Qais Yousef wrote:
>>
>> [snip]
>>
>>> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>> */
>>> util_est_enqueue(&rq->cfs, p);
>>> - /*
>>> - * If in_iowait is set, the code below may not trigger any cpufreq
>>> - * utilization updates, so do it here explicitly with the IOWAIT flag
>>> - * passed.
>>> - */
>>> - if (p->in_iowait)
>>> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>> -
>>
>> Why this io wait boost is considered as the $subject says 'aggressive'
>> calling?
>
> This will trigger a frequency update along with the iowait boost. Did I miss
> something?

Yes, it will change the CPU freq, and that was the main goal for this
code path. We have tests which check how that works on different memory
types.

Why do you want to remove it?

Did you run some tests (e.g. fio, gallery, etc) to check if you still
get decent performance out of some new ufs/nvme memories?

Beata & Dietmar presented a proposal at LPC2021 to have a per-task
io boost, with a more controllable way of trading off power vs.
performance [1]. IMO the io wait boost could evolve, not simply die.

Regards,
Lukasz

[1] https://lpc.events/event/11/contributions/1042/

2023-12-11 18:47:50

by Christian Loehle

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 08/12/2023 01:52, Qais Yousef wrote:
> Due to the way code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive
> as in most cases we are iterating through entities in a loop to
> update_load_avg() in the hierarchy. So we end up sending too many
> request in an loop as we're updating the hierarchy.

Whether this is actually less aggressive heavily depends on the workload;
I can argue the patch is more aggressive, as you call cpufreq_update_util()
at every enqueue and dequeue, instead of just at enqueue.
For an I/O workload it is definitely more aggressive, see below.

>
> Combine this with the rate limit in schedutil, we could end up
> prematurely send up a wrong frequency update before we have actually
> updated all entities appropriately.
> [SNIP]


> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> - /*
> - * If in_iowait is set, the code below may not trigger any cpufreq
> - * utilization updates, so do it here explicitly with the IOWAIT flag
> - * passed.
> - */
> - if (p->in_iowait)
> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> -
> for_each_sched_entity(se) {
> if (se->on_rq)
> break;
> @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
>
> + cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
> +
> hrtick_update(rq);
> }
>
> @@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> dequeue_throttle:
> util_est_update(&rq->cfs, p, task_sleep);
> + cpufreq_update_util(rq, 0);

This is quite critical: instead of only calling the update
at enqueue (with SCHED_CPUFREQ_IOWAIT if applicable) it is
now called at every enqueue and dequeue. The only way for
schedutil (intel_pstate too?) to build up a value of
iowait_boost > 128 is a large enough rate_limit_us, as even
for just an in_iowait task the enqueue increases the boost and
its own dequeue could already reduce it again. For just a basic
benchmark workload and 2000 rate_limit_us this doesn't seem
to be that critical; anything below 200 rate_limit_us didn't
show any iowait boosting > 128 anymore on my system.
Of course if the workload does more between enqueue and
dequeue (time until the task issues its next I/O), already larger
values of rate_limit_us will disable any significant
iowait boost benefit.
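
To make the mechanism concrete, here is a minimal sketch of the build-up/decay
behaviour I'm describing (simplified from schedutil's sugov_iowait_boost() /
sugov_iowait_apply(); the pending/reset handling and the exact boost limits
are left out, so treat the names and numbers as illustrative only):

/*
 * Simplified sketch, not the actual schedutil code: the boost is set or
 * doubled on every acted-on update carrying SCHED_CPUFREQ_IOWAIT (the
 * enqueue of an in_iowait task) and halved on every other acted-on
 * update (e.g. the dequeue). It can therefore only climb above the
 * minimum if the dequeue-side update is swallowed by rate_limit_us.
 */
static unsigned int iowait_boost_next(unsigned int boost, bool iowait,
				      unsigned int min_boost,
				      unsigned int max_boost)
{
	if (iowait) {
		if (!boost)
			return min_boost;
		boost *= 2;
		return boost > max_boost ? max_boost : boost;
	}

	return boost / 2;
}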

Just to add some numbers to the story:
fio --time_based --name=fiotest --filename=/dev/nvme0n1 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1
fio --time_based --name=fiotest --filename=/dev/mmcblk2 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1

All results are sorted; the second line of each pair is without iowait
boosting:
With this patch and rate_limit_us=2000:
/dev/nvme0n1
[3883, 3980, 3997, 4018, 4019]
[2732, 2745, 2782, 2837, 2841]
/dev/mmcblk2
[4136, 4144, 4198, 4275, 4329]
[2753, 2975, 2975, 2975, 2976]

Without this patch and rate_limit_us=2000:
/dev/nvme0n1
[3918, 4021, 4043, 4081, 4085]
[2850, 2859, 2863, 2873, 2887]
/dev/mmcblk2
[4277, 4358, 4380, 4421, 4425]
[2796, 3103, 3128, 3180, 3200]

With this patch and rate_limit_us=200:
/dev/nvme0n1
[2470, 2480, 2481, 2484, 2520]
[2473, 2510, 2517, 2534, 2572]
/dev/mmcblk2
[2286, 2338, 2440, 2504, 2535]
[2360, 2462, 2484, 2503, 2707]

Without this patch and rate_limit_us=200:
/dev/nvme0n1
[3880, 3956, 4010, 4013, 4016]
[2732, 2867, 2937, 2937, 2939]
/dev/mmcblk2
[4783, 4791, 4821, 4855, 4860]
[2653, 3091, 3095, 3166, 3202]

I'm currently working on iowait boosting and seeing where it's
actually needed and how it could be improved, so always interested
in anyone's thoughts.

(The second line here doesn't provide additional
information, I left it in to compare for reproducibility).
All with CONFIG_HZ=100 on an rk3399.

Best Regards,
Christian

> [SNIP]

2023-12-12 10:46:54

by Dietmar Eggemann

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 08/12/2023 02:52, Qais Yousef wrote:
> Due to the way code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive
> as in most cases we are iterating through entities in a loop to
> update_load_avg() in the hierarchy. So we end up sending too many
> request in an loop as we're updating the hierarchy.

But update_load_avg() calls cfs_rq_util_change() which only issues a
cpufreq_update_util() call for the root cfs_rq?

So the 'iterating through entities' should be for a task in a non-root
taskgroup which the condition (1) takes care of.

cfs_rq_util_change()

...
if (&rq->cfs == cfs_rq) (1)

cpufreq_update_util()

[...]

2023-12-12 10:47:37

by Hongyan Xia

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 08/12/2023 01:52, Qais Yousef wrote:
> Due to the way code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive
> as in most cases we are iterating through entities in a loop to
> update_load_avg() in the hierarchy. So we end up sending too many
> request in an loop as we're updating the hierarchy.

Do you mean the for_each_sched_entity(se) loop? I think we update CPU
frequency only once at the root CFS?

> Combine this with the rate limit in schedutil, we could end up
> prematurely send up a wrong frequency update before we have actually
> updated all entities appropriately.
>
> Be smarter about it by limiting the trigger to perform frequency updates
> after all accounting logic has done. This ended up being in the
> following points:
>
> 1. enqueue/dequeue_task_fair()
> 2. throttle/unthrottle_cfs_rq()
> 3. attach/detach_task_cfs_rq()
> 4. task_tick_fair()
> 5. __sched_group_set_shares()
>
> This is not 100% ideal still due to other limitations that might be
> a bit harder to handle. Namely we can end up with premature update
> request in the following situations:
>
> a. Simultaneous task enqueue on the CPU where 2nd task is bigger and
> requires higher freq. The trigger to cpufreq_update_util() by the
> first task will lead to dropping the 2nd request until tick. Or
> another CPU in the same policy trigger a freq update.
>
> b. CPUs sharing a policy can end up with the same race in a but the
> simultaneous enqueue happens on different CPUs in the same policy.
>
> The above though are limitations in the governor/hardware, and from
> scheduler point of view at least that's the best we can do. The
> governor might consider smarter logic to aggregate near simultaneous
> request and honour the higher one.
>
> Signed-off-by: Qais Yousef (Google) <[email protected]>
> ---
> kernel/sched/fair.c | 55 ++++++++++++---------------------------------
> 1 file changed, 14 insertions(+), 41 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b83448be3f79..f99910fc6705 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3997,29 +3997,6 @@ static inline void update_cfs_group(struct sched_entity *se)
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
> -{
> - struct rq *rq = rq_of(cfs_rq);
> -
> - if (&rq->cfs == cfs_rq) {

Here. I think this restricts frequency updates to the root CFS?

> - /*
> - * There are a few boundary cases this might miss but it should
> - * get called often enough that that should (hopefully) not be
> - * a real problem.
> - *
> - * It will not get called when we go idle, because the idle
> - * thread is a different class (!fair), nor will the utilization
> - * number include things like RT tasks.
> - *
> - * As is, the util number is not freq-invariant (we'd have to
> - * implement arch_scale_freq_capacity() for that).
> - *
> - * See cpu_util_cfs().
> - */
> - cpufreq_update_util(rq, flags);
> - }
> -}
> -
> #ifdef CONFIG_SMP
> static inline bool load_avg_is_decayed(struct sched_avg *sa)
> {
> [...]

2023-12-12 11:07:02

by Vincent Guittot

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On Fri, 8 Dec 2023 at 02:52, Qais Yousef <[email protected]> wrote:
>
> Due to the way code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive
> as in most cases we are iterating through entities in a loop to
> update_load_avg() in the hierarchy. So we end up sending too many
> request in an loop as we're updating the hierarchy.
>
> Combine this with the rate limit in schedutil, we could end up
> prematurely send up a wrong frequency update before we have actually
> updated all entities appropriately.
>
> Be smarter about it by limiting the trigger to perform frequency updates
> after all accounting logic has done. This ended up being in the
> following points:
>
> 1. enqueue/dequeue_task_fair()
> 2. throttle/unthrottle_cfs_rq()
> 3. attach/detach_task_cfs_rq()
> 4. task_tick_fair()
> 5. __sched_group_set_shares()
>
> This is not 100% ideal still due to other limitations that might be
> a bit harder to handle. Namely we can end up with premature update
> request in the following situations:
>
> a. Simultaneous task enqueue on the CPU where 2nd task is bigger and
> requires higher freq. The trigger to cpufreq_update_util() by the
> first task will lead to dropping the 2nd request until tick. Or
> another CPU in the same policy trigger a freq update.
>
> b. CPUs sharing a policy can end up with the same race in a but the
> simultaneous enqueue happens on different CPUs in the same policy.
>
> The above though are limitations in the governor/hardware, and from
> scheduler point of view at least that's the best we can do. The
> governor might consider smarter logic to aggregate near simultaneous
> request and honour the higher one.
>
> Signed-off-by: Qais Yousef (Google) <[email protected]>
> ---
> kernel/sched/fair.c | 55 ++++++++++++---------------------------------
> 1 file changed, 14 insertions(+), 41 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b83448be3f79..f99910fc6705 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3997,29 +3997,6 @@ static inline void update_cfs_group(struct sched_entity *se)
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> -static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
> -{
> - struct rq *rq = rq_of(cfs_rq);
> -
> - if (&rq->cfs == cfs_rq) {
> - /*
> - * There are a few boundary cases this might miss but it should
> - * get called often enough that that should (hopefully) not be
> - * a real problem.
> - *
> - * It will not get called when we go idle, because the idle
> - * thread is a different class (!fair), nor will the utilization
> - * number include things like RT tasks.
> - *
> - * As is, the util number is not freq-invariant (we'd have to
> - * implement arch_scale_freq_capacity() for that).
> - *
> - * See cpu_util_cfs().
> - */
> - cpufreq_update_util(rq, flags);
> - }
> -}
> -
> #ifdef CONFIG_SMP
> static inline bool load_avg_is_decayed(struct sched_avg *sa)
> {
> @@ -4648,8 +4625,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>
> add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
>
> - cfs_rq_util_change(cfs_rq, 0);
> -
> trace_pelt_cfs_tp(cfs_rq);
> }
>
> @@ -4678,8 +4653,6 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>
> add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
>
> - cfs_rq_util_change(cfs_rq, 0);
> -
> trace_pelt_cfs_tp(cfs_rq);
> }
>
> @@ -4726,11 +4699,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> */
> detach_entity_load_avg(cfs_rq, se);
> update_tg_load_avg(cfs_rq);
> - } else if (decayed) {
> - cfs_rq_util_change(cfs_rq, 0);
> -
> - if (flags & UPDATE_TG)
> - update_tg_load_avg(cfs_rq);
> + } else if (decayed && (flags & UPDATE_TG)) {
> + update_tg_load_avg(cfs_rq);
> }
> }
>
> @@ -5114,7 +5084,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>
> static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
> {
> - cfs_rq_util_change(cfs_rq, 0);
> }
>
> static inline void remove_entity_load_avg(struct sched_entity *se) {}
> @@ -5807,6 +5776,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> sub_nr_running(rq, task_delta);
>
> done:
> + cpufreq_update_util(rq, 0);
> +
> /*
> * Note: distribution will already see us throttled via the
> * throttled-list. rq->lock protects completion.
> @@ -5899,6 +5870,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> unthrottle_throttle:
> assert_list_leaf_cfs_rq(rq);
>
> + cpufreq_update_util(rq, 0);
> +
> /* Determine whether we need to wake up potentially idle CPU: */
> if (rq->curr == rq->idle && rq->cfs.nr_running)
> resched_curr(rq);
> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> */
> util_est_enqueue(&rq->cfs, p);
>
> - /*
> - * If in_iowait is set, the code below may not trigger any cpufreq
> - * utilization updates, so do it here explicitly with the IOWAIT flag
> - * passed.
> - */
> - if (p->in_iowait)
> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> -
> for_each_sched_entity(se) {
> if (se->on_rq)
> break;
> @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
>

Here and in the other places below, you lose:

- } else if (decayed) {

The decayed condition ensures a rate limit (~1ms) in the number of
calls to cpufreq_update_util.

enqueue/dequeue/tick don't create any sudden change in the PELT
signals that would require updating cpufreq for the change, unlike
attach/detach.
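
A rough sketch of why 'decayed' acts as that rate limit (simplified from
___update_load_sum()/accumulate_sum(); the period_contrib carry is ignored
here, so the boundary shown is approximate):

/*
 * Simplified: PELT time is tracked in ns, shifted down to ~1us units,
 * and 'decayed' is only reported once a full 1024us period has been
 * crossed since last_update_time. Gating cfs_rq_util_change() on it
 * therefore limited cpufreq_update_util() to roughly once per ~1ms.
 */
static int pelt_decayed(u64 now, u64 last_update_time)
{
	u64 delta_us = (now - last_update_time) >> 10;

	return delta_us / 1024 > 0;
}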


> + cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
> +
> hrtick_update(rq);
> }
>
> @@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> dequeue_throttle:
> util_est_update(&rq->cfs, p, task_sleep);
> + cpufreq_update_util(rq, 0);
> hrtick_update(rq);
> }
>
> @@ -8482,6 +8450,7 @@ done: __maybe_unused;
>
> update_misfit_status(p, rq);
> sched_fair_update_stop_tick(rq, p);
> + cpufreq_update_util(rq, 0);
>
> return p;
>
> @@ -12615,6 +12584,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>
> update_misfit_status(curr, rq);
> update_overutilized_status(task_rq(curr));
> + cpufreq_update_util(rq, 0);
>
> task_tick_core(rq, curr);
> }
> @@ -12739,6 +12709,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
> struct sched_entity *se = &p->se;
>
> detach_entity_cfs_rq(se);
> + cpufreq_update_util(task_rq(p), 0);
> }
>
> static void attach_task_cfs_rq(struct task_struct *p)
> @@ -12746,6 +12717,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
> struct sched_entity *se = &p->se;
>
> attach_entity_cfs_rq(se);
> + cpufreq_update_util(task_rq(p), 0);
> }
>
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> @@ -12991,6 +12963,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
> update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
> update_cfs_group(se);
> }
> + cpufreq_update_util(rq, 0);
> rq_unlock_irqrestore(rq, &rf);
> }
>
> --
> 2.34.1
>

2023-12-12 12:10:52

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/11/23 07:56, Lukasz Luba wrote:
>
>
> On 12/10/23 20:51, Qais Yousef wrote:
> > On 12/08/23 10:05, Lukasz Luba wrote:
> > > Hi Qais,
> > >
> > > On 12/8/23 01:52, Qais Yousef wrote:
> > >
> > > [snip]
> > >
> > > > @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > > */
> > > > util_est_enqueue(&rq->cfs, p);
> > > > - /*
> > > > - * If in_iowait is set, the code below may not trigger any cpufreq
> > > > - * utilization updates, so do it here explicitly with the IOWAIT flag
> > > > - * passed.
> > > > - */
> > > > - if (p->in_iowait)
> > > > - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> > > > -
> > >
> > > Why this io wait boost is considered as the $subject says 'aggressive'
> > > calling?
> >
> > This will trigger a frequency update along with the iowait boost. Did I miss
> > something?
>
> Yes, it will change CPU freq and it was the main goal for this code
> path. We have tests which check how that works on different memory
> types.
>
> Why do you want to remove it?

It seems you missed this hunk? I of course didn't remove it altogether if
that's what you mean :)

@@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

+ cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
+
hrtick_update(rq);
}

>
> Did you run some tests (e.g. fio, gallery, etc) to check if you still
> have a decent performance out some new ufs/nvme memories?

PCMark storage gives

before*: 29681
after: 30014

* no patches applied including remove-margins one


Cheers

--
Qais Yousef

>
> Beata & Dietmar have presented at LPC2021 a proposal to have a per-task
> io boost, with a bit more controllable way of the trade off power vs.
> performance [1]. IMO the io wait boost could evolve, not simply die.
>
> Regards,
> Lukasz
>
> [1] https://lpc.events/event/11/contributions/1042/

2023-12-12 12:34:18

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/11/23 18:47, Christian Loehle wrote:
> On 08/12/2023 01:52, Qais Yousef wrote:
> > Due to the way code is structured, it makes a lot of sense to trigger
> > cpufreq_update_util() from update_load_avg(). But this is too aggressive
> > as in most cases we are iterating through entities in a loop to
> > update_load_avg() in the hierarchy. So we end up sending too many
> > request in an loop as we're updating the hierarchy.
>
> If this is actually less aggressive heavily depends on the workload,
> I can argue the patch is more aggressive, as you call cpufreq_update_util
> at every enqueue and dequeue, instead of just at enqueue.
> For an I/O workload it is definitely more aggressive, see below.

I could have unwittingly broken something. Thanks for the report!

>
> >
> > Combine this with the rate limit in schedutil, we could end up
> > prematurely send up a wrong frequency update before we have actually
> > updated all entities appropriately.
> > [SNIP]
>
>
> > @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > */
> > util_est_enqueue(&rq->cfs, p);
> >
> > - /*
> > - * If in_iowait is set, the code below may not trigger any cpufreq
> > - * utilization updates, so do it here explicitly with the IOWAIT flag
> > - * passed.
> > - */
> > - if (p->in_iowait)
> > - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> > -
> > for_each_sched_entity(se) {
> > if (se->on_rq)
> > break;
> > @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > enqueue_throttle:
> > assert_list_leaf_cfs_rq(rq);
> >
> > + cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
> > +
> > hrtick_update(rq);
> > }
> >
> > @@ -6849,6 +6816,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >
> > dequeue_throttle:
> > util_est_update(&rq->cfs, p, task_sleep);
> > + cpufreq_update_util(rq, 0);
>
> This is quite critical, instead of only calling the update
> at enqueue (with SCHED_CPUFREQ_IOWAIT if applicable) it is
> now called at every enqueue and dequeue. The only way for

I think it was called at enqueue/dequeue before, but now it is done
unconditionally as I don't check for decay like before. It shouldn't change
the behavior as, if there's no frequency change, then the governor will do
nothing, including not updating last_update_time IIRC.

> schedutil (intel_pstate too?) to build up a value of
> iowait_boost > 128 is a large enough rate_limit_us, as even
> for just a in_iowait task the enqueue increases the boost and
> its own dequeue could reduce it already. For just a basic
> benchmark workload and 2000 rate_limit_us this doesn't seem
> to be that critical, anything below 200 rate_limit_us didn't

200us is too low. Does rk3399 support this? My pine64 has this SoC and
I remember it doesn't support fastswitch, and the time to wake up the sugov
thread will be comparable to this before even trying to talk to the hardware.

Not necessarily means that I don't have a bug in my code of course! :)

> show any iowait boosting > 128 anymore on my system.
> Of course if the workload does more between enqueue and
> dequeue (time until task issues next I/O) already larger
> values of rate_limit_us will disable any significant
> iowait boost benefit.

Hmm. It seems sugov_iowait_reset() is being called at the dequeue?

Tweaking the rate limit means a short-lived task's freq update at dequeue is
likely to be ignored by the governor.

The short value means it is likely to be taken into account.

Not sure if this is uncovering a bug somewhere else or I broke something.

>
> Just to add some numbers to the story:
> fio --time_based --name=fiotest --filename=/dev/nvme0n1 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1
> fio --time_based --name=fiotest --filename=/dev/mmcblk2 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1
>
> All results are sorted:
> With this patch and rate_limit_us=2000:
> (Second line is without iowait boosting, results are sorted):
> [3883, 3980, 3997, 4018, 4019]
> [2732, 2745, 2782, 2837, 2841]
> /dev/mmcblk2
> [4136, 4144, 4198, 4275, 4329]
> [2753, 2975, 2975, 2975, 2976]
>
> Without this patch and rate_limit_us=2000:
> [3918, 4021, 4043, 4081, 4085]
> [2850, 2859, 2863, 2873, 2887]
> /dev/mmcblk2
> [4277, 4358, 4380, 4421, 4425]
> [2796, 3103, 3128, 3180, 3200]
>
> With this patch and rate_limit_us=200:
> /dev/nvme0n1
> [2470, 2480, 2481, 2484, 2520]
> [2473, 2510, 2517, 2534, 2572]
> /dev/mmcblk2
> [2286, 2338, 2440, 2504, 2535]
> [2360, 2462, 2484, 2503, 2707]
>
> Without this patch and rate_limit_us=200:
> /dev/nvme0n1
> [3880, 3956, 4010, 4013, 4016]
> [2732, 2867, 2937, 2937, 2939]
> /dev/mmcblk2
> [4783, 4791, 4821, 4855, 4860]
> [2653, 3091, 3095, 3166, 3202]

Was any other patch in this series or remove margin series applied or just this
one?

>
> I'm currently working on iowait boosting and seeing where it's
> actually needed and how it could be improved, so always interested
> in anyone's thoughts.

One of the problems identified with iowait boost is that it is per-CPU,
which means tasks that are causing the iowait to happen will lose this boost
when migrated.

Arm was working on a way to help convert it to per-task. See Lukasz email.

>
> (The second line here doesn't provide additional
> information, I left it in to compare for reproducibility).
> All with CONFIG_HZ=100 on an rk3399.

Your tick is 10ms?! sugov_iowait_reset() should return false then. I see now,
we undo the boost in sugov_iowait_apply().

There's room for improvement for sure. Thanks for the feedback!


Cheers

--
Qais Yousef

>
> Best Regards,
> Christian
>
> > [SNIP]

2023-12-12 12:35:49

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/23 11:46, Dietmar Eggemann wrote:
> On 08/12/2023 02:52, Qais Yousef wrote:
> > Due to the way code is structured, it makes a lot of sense to trigger
> > cpufreq_update_util() from update_load_avg(). But this is too aggressive
> > as in most cases we are iterating through entities in a loop to
> > update_load_avg() in the hierarchy. So we end up sending too many
> > request in an loop as we're updating the hierarchy.
>
> But update_load_avg() calls cfs_rq_util_change() which only issues a
> cpufreq_update_util() call for the root cfs_rq?

Yes I've noticed that and wondered. Maybe my analysis was flawed and I was just
hitting the issue of iowait boost request conflicting with update_load_avg()
request.

Let me have another look. I think we'll still end up needing to take the update
out of util_avg to be able to combine the two calls.


Cheers

--
Qais Yousef

>
> So the 'iterating through entities' should be for a task in a non-root
> taskgroup which the condition (1) takes care of.
>
> cfs_rq_util_change()
>
> ...
> if (&rq->cfs == cfs_rq) (1)
>
> cpufreq_update_util()
>
> [...]

2023-12-12 13:09:16

by Christian Loehle

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/2023 12:34, Qais Yousef wrote:
> On 12/11/23 18:47, Christian Loehle wrote:
>> On 08/12/2023 01:52, Qais Yousef wrote:
>>> Due to the way code is structured, it makes a lot of sense to trigger
>>> cpufreq_update_util() from update_load_avg(). But this is too aggressive
>>> as in most cases we are iterating through entities in a loop to
>>> update_load_avg() in the hierarchy. So we end up sending too many
>>> request in an loop as we're updating the hierarchy.
>>
>> If this is actually less aggressive heavily depends on the workload,
>> I can argue the patch is more aggressive, as you call cpufreq_update_util
>> at every enqueue and dequeue, instead of just at enqueue.
>> For an I/O workload it is definitely more aggressive, see below.
>
> I could have unwittingly broken something. Thanks for the report!
>
> [SNIP]
>>> dequeue_throttle:
>>> util_est_update(&rq->cfs, p, task_sleep);
>>> + cpufreq_update_util(rq, 0);
>>
>> This is quite critical, instead of only calling the update
>> at enqueue (with SCHED_CPUFREQ_IOWAIT if applicable) it is
>> now called at every enqueue and dequeue. The only way for
>
> I think it was called at enqueue/dequeue before, but now it is done
> unconditionally as I don't check for decay like before. It shouldn't change the
> behavior as if there's no frequency change, then the governor will do nothing

Well, the governor will update the fields by calling sugov_iowait_apply().
What exactly are you referring to when you say
"I think it was called at dequeue before"?
From what I can see this patch calls cpufreq_update_util() much more often
on an enqueue/dequeue-heavy workload like fio.

> including not update last_update_time IIRC.

sched_avg maybe, but the iowait boost's last_update is changed;
I'll look into it, see below.

>
>> schedutil (intel_pstate too?) to build up a value of
>> iowait_boost > 128 is a large enough rate_limit_us, as even
>> for just a in_iowait task the enqueue increases the boost and
>> its own dequeue could reduce it already. For just a basic
>> benchmark workload and 2000 rate_limit_us this doesn't seem
>> to be that critical, anything below 200 rate_limit_us didn't
>
> 200us is too low. Does rk3399 support this? My pine64 has this SoC and
> I remember it doesn't support fastswitch and the time to wake up the sugov
> thread will be comparable to this before even trying to talk tot he hardware.

It's unlikely that it is actually supported, but I'm mostly concerned with
the actual iowait_boost value, so this limitation here is actually beside
my point. (The fact that tip:sched/core results are very different at 200us
from this patch gives me some confidence here.)

>
> Not necessarily means that I don't have a bug in my code of course! :)
>
>> show any iowait boosting > 128 anymore on my system.
>> Of course if the workload does more between enqueue and
>> dequeue (time until task issues next I/O) already larger
>> values of rate_limit_us will disable any significant
>> iowait boost benefit.
>
> Hmm. It seems sugov_iowait_reset() is being called at the dequeue?

Also yes, but I'm actually worried about the reduction / halving in
sugov_iowait_apply().
With a one-to-one correspondence of enqueue (increase boost if in_iowait) and
dequeue (decrease boost regardless), both calling cpufreq_update_util(), you
would expect there never to be any boost apart from the minimal one between
the enqueue and the dequeue. It does build up anyway, but that is only because
the reduction in sugov_iowait_apply() is never hit if the last-update time
delta is < rate_limit_us.
(And rate_limit_us=2000 still gives us some headroom for read->read userspace
workloads; for read->think->read this could be more problematic, see also
below.)

(Now thinking about it, the fact that last_update is updated before
determining whether the frequency should be changed could be an issue. I'll
look into it some more; anyway, at worst it's an issue with larger impact
with your patch.)
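
For reference, a minimal sketch of the gate I mean (loosely modelled on
sugov_should_update_freq(); the limits_changed and fast-switch details are
omitted, so consider it illustrative rather than the exact code):

/*
 * Loose sketch: an update arriving less than rate_limit_us after the
 * last acted-on one is dropped before the boost-halving path runs.
 * With your patch, rate_limit_us is therefore what protects the
 * built-up iowait boost from being undone by the dequeue-side update.
 */
static bool should_act_on_update(u64 now_ns, u64 last_freq_update_ns,
				 u64 rate_limit_ns)
{
	return now_ns - last_freq_update_ns >= rate_limit_ns;
}
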

>
> Tweaking the rate limit means short living tasks freq update at dequeue is
> likely to be ignored by the governor.
>
> The short value means it is likely to be taken into account.
>
> Not sure if this is uncovering a bug somewhere else or I broke something.
>>
>> Just to add some numbers to the story:
>> fio --time_based --name=fiotest --filename=/dev/nvme0n1 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1
>> fio --time_based --name=fiotest --filename=/dev/mmcblk2 --runtime=30 --rw=randread --bs=4k --ioengine=psync --iodepth=1
>>
>> All results are sorted:
>> With this patch and rate_limit_us=2000:
>> (Second line is without iowait boosting, results are sorted):
>> [3883, 3980, 3997, 4018, 4019]
>> [2732, 2745, 2782, 2837, 2841]
>> /dev/mmcblk2
>> [4136, 4144, 4198, 4275, 4329]
>> [2753, 2975, 2975, 2975, 2976]
>>
>> Without this patch and rate_limit_us=2000:
>> [3918, 4021, 4043, 4081, 4085]
>> [2850, 2859, 2863, 2873, 2887]
>> /dev/mmcblk2
>> [4277, 4358, 4380, 4421, 4425]
>> [2796, 3103, 3128, 3180, 3200]
>>
>> With this patch and rate_limit_us=200:
>> /dev/nvme0n1
>> [2470, 2480, 2481, 2484, 2520]
>> [2473, 2510, 2517, 2534, 2572]
>> /dev/mmcblk2
>> [2286, 2338, 2440, 2504, 2535]
>> [2360, 2462, 2484, 2503, 2707]
>>
>> Without this patch and rate_limit_us=200:
>> /dev/nvme0n1
>> [3880, 3956, 4010, 4013, 4016]
>> [2732, 2867, 2937, 2937, 2939]
>> /dev/mmcblk2
>> [4783, 4791, 4821, 4855, 4860]
>> [2653, 3091, 3095, 3166, 3202]
>
> Was any other patch in this series or remove margin series applied or just this
> one?

These specific numbers were just with this one.
I did a couple of tests to get a feel for the entire series (both),
but found no drastic change that would be uncovered by n=3 runs
anyway.

>>
>> I'm currently working on iowait boosting and seeing where it's
>> actually needed and how it could be improved, so always interested
>> in anyone's thoughts.
>
> One of the problems identified with iowait boost is that it is per-cpu; which
> means tasks that are causing the iowait to happen will lose this boost when
> migrated.
>
> Arm was working on a way to help convert it to per-task. See Lukasz email.

I guess that would be me now :)
Apart from considering per-task, I'd like to get a reasonable scope designed
for the feature anyway.
Like in your patch, assuming rate_limit_us=2000: on my platform and scenarios,
the context switches plus the minimal work fio does between read syscalls
took around 200us (that's where the boost benefit disappears).
If we're considering workloads that do just a little more in-between than what
fio does (maybe actually looking at the data?) and therefore take maybe >2000us
in-between, you have disabled iowait boost for that workload with this patch.
My view right now is that this might not be critical; any attempts at
recreating such workloads (realistically) basically lead to them contributing
enough util to not need the iowait boost, but of course you can also create
synthetic workloads where this is not true.
If you want to play around with this too, I have recently added a --thinkcycles
parameter to fio; you will have to build it from source though, as it hasn't
seen a release since.

>
>>
>> (The second line here doesn't provide additional
>> information, I left it in to compare for reproducibility).
>> All with CONFIG_HZ=100 on an rk3399.
>
> Your tick is 10ms?! sugov_iowait_reset() should return false then. I see now,
> we undo the boost in sugov_iowait_apply().

Again, just to emphasize, the disabling of iowait boost then does not come
from sugov_iowait_reset(), but from sugov_iowait_apply(), which will be called
at dequeue regardless in your patch; plus you're lowering rate_limit_us, which
right now acts as an 'iowait boost protection' for your patch, if that makes
sense.

Best Regards,
Christian

2023-12-12 13:21:05

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/23 12:06, Vincent Guittot wrote:

> > @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > enqueue_throttle:
> > assert_list_leaf_cfs_rq(rq);
> >
>
> Here and in the other places below, you lose :
>
> - } else if (decayed) {
>
> The decayed condition ensures a rate limit (~1ms) in the number of
> calls to cpufreq_update_util.
>
> enqueue/dequeue/tick don't create any sudden change in the PELT
> signals that would require to update cpufreq of the change unlike
> attach/detach

Okay, thanks for the clue. Let me rethink this again.


Cheers

--
Qais Yousef

2023-12-12 13:29:32

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/23 13:09, Christian Loehle wrote:

> > Arm was working on a way to help convert it to per-task. See Lukasz email.
>
> I guess that would be me now :)

Ah, sorry, I hadn't noticed the email address :-)

> Apart from considering per-task I'd like to get a reasonable scope for the
> feature designed anyway.

Besides, the iowait boost is completely ignored at migration. There's also
the desire to disable it for some tasks; not every task's io performance is
important to honour. Being able to control this via cgroup would be helpful,
so one could enable or disable it for background tasks, for example.
Generally, controlling the default behavior could be useful too.

> If you want to play around with this too, I have recently added --thinkcycles
> parameter to fio, you will have to build it from source though as it hasn't seen
> a release since.

Thanks. Might reach out if I need this.

> > Your tick is 10ms?! sugov_iowait_reset() should return false then. I see now,
> > we undo the boost in sugov_iowait_apply().
>
> Again, just to emphasize, the disabling of iowait boost then does not come from
> sugov_iowait_reset, but sugov_iowait_apply, which will be called in dequeue regardless
> in your patch, plus you're lowering the rate_limit_us, which right now acts as
> a 'iowait boost protection' for your patch, if that makes sense.

Maybe I should have re-edited my reply. I meant that I can now see how we
can end up undoing the boost in sugov_iowait_apply() under the conditions you
pointed out. So yep, I can see the problem.


Thanks!

--
Qais Yousef

2023-12-12 18:22:33

by Hongyan Xia

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/2023 12:35, Qais Yousef wrote:
> On 12/12/23 11:46, Dietmar Eggemann wrote:
>> On 08/12/2023 02:52, Qais Yousef wrote:
>>> Due to the way code is structured, it makes a lot of sense to trigger
>>> cpufreq_update_util() from update_load_avg(). But this is too aggressive
>>> as in most cases we are iterating through entities in a loop to
>>> update_load_avg() in the hierarchy. So we end up sending too many
>>> request in an loop as we're updating the hierarchy.
>>
>> But update_load_avg() calls cfs_rq_util_change() which only issues a
>> cpufreq_update_util() call for the root cfs_rq?
>
> Yes I've noticed that and wondered. Maybe my analysis was flawed and I was just
> hitting the issue of iowait boost request conflicting with update_load_avg()
> request.
>
> Let me have another look. I think we'll still end up needing to take the update
> out of util_avg to be able to combine the two calls.

I agree. Currently it does not express the intention clearly. We only
want to update the root CFS but the code was written in a misleading way
that suggests we want to update for every cfs_rq. A single update at the
end looks much nicer and makes other patches easier.

Hongyan

2023-12-14 08:18:28

by Lukasz Luba

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()



On 12/12/23 12:10, Qais Yousef wrote:
> On 12/11/23 07:56, Lukasz Luba wrote:
>>
>>
>> On 12/10/23 20:51, Qais Yousef wrote:
>>> On 12/08/23 10:05, Lukasz Luba wrote:
>>>> Hi Qais,
>>>>
>>>> On 12/8/23 01:52, Qais Yousef wrote:
>>>>
>>>> [snip]
>>>>
>>>>> @@ -6704,14 +6677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>> */
>>>>> util_est_enqueue(&rq->cfs, p);
>>>>> - /*
>>>>> - * If in_iowait is set, the code below may not trigger any cpufreq
>>>>> - * utilization updates, so do it here explicitly with the IOWAIT flag
>>>>> - * passed.
>>>>> - */
>>>>> - if (p->in_iowait)
>>>>> - cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>>> -
>>>>
>>>> Why this io wait boost is considered as the $subject says 'aggressive'
>>>> calling?
>>>
>>> This will trigger a frequency update along with the iowait boost. Did I miss
>>> something?
>>
>> Yes, it will change CPU freq and it was the main goal for this code
>> path. We have tests which check how that works on different memory
>> types.
>>
>> Why do you want to remove it?
>
> It seems you missed this hunk? I of course didn't remove it altogether if
> that's what you mean :)
>
> @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
>
> + cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
> +
> hrtick_update(rq);
> }
>

Yes, you're right, I missed that change. I will have to spend some time
to figure out this new flow in the whole patch set.


>>
>> Did you run some tests (e.g. fio, gallery, etc) to check if you still
>> have a decent performance out some new ufs/nvme memories?
>
> PCMark storage gives
>
> before*: 29681
> after: 30014

The result looks good.

Thanks,
Lukasz

2023-12-18 08:53:47

by Dietmar Eggemann

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 08/12/2023 02:52, Qais Yousef wrote:
> Due to the way code is structured, it makes a lot of sense to trigger
> cpufreq_update_util() from update_load_avg(). But this is too aggressive
> as in most cases we are iterating through entities in a loop to
> update_load_avg() in the hierarchy. So we end up sending too many
> request in an loop as we're updating the hierarchy.
>
> Combine this with the rate limit in schedutil, we could end up
> prematurely send up a wrong frequency update before we have actually
> updated all entities appropriately.
>
> Be smarter about it by limiting the trigger to perform frequency updates
> after all accounting logic has done. This ended up being in the

What are the boundaries of the 'accounting logic' here? Is this related
to the update of all sched_entities and cfs_rq's involved when a task is
attached/detached (or enqueued/dequeued)?

I can't see that there are any premature cfs_rq_util_change() in the
current code when we consider this.

And avoiding updates for a smaller task to make sure updates for a
bigger task go through is IMHO not feasible.

I wonder how much influence this patch has on the test results
presented in the patch header?

> following points:
>
> 1. enqueue/dequeue_task_fair()
> 2. throttle/unthrottle_cfs_rq()
> 3. attach/detach_task_cfs_rq()
> 4. task_tick_fair()
> 5. __sched_group_set_shares()
>
> This is not 100% ideal still due to other limitations that might be
> a bit harder to handle. Namely we can end up with premature update
> request in the following situations:
>
> a. Simultaneous task enqueue on the CPU where 2nd task is bigger and
> requires higher freq. The trigger to cpufreq_update_util() by the
> first task will lead to dropping the 2nd request until tick. Or
> another CPU in the same policy trigger a freq update.
>
> b. CPUs sharing a policy can end up with the same race in a but the
> simultaneous enqueue happens on different CPUs in the same policy.
>
> The above though are limitations in the governor/hardware, and from
> scheduler point of view at least that's the best we can do. The
> governor might consider smarter logic to aggregate near simultaneous
> request and honour the higher one.

[...]


2023-12-18 18:19:49

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/18/23 09:51, Dietmar Eggemann wrote:
> On 08/12/2023 02:52, Qais Yousef wrote:
> > Due to the way code is structured, it makes a lot of sense to trigger
> > cpufreq_update_util() from update_load_avg(). But this is too aggressive
> > as in most cases we are iterating through entities in a loop to
> > update_load_avg() in the hierarchy. So we end up sending too many
> > request in an loop as we're updating the hierarchy.
> >
> > Combine this with the rate limit in schedutil, we could end up
> > prematurely send up a wrong frequency update before we have actually
> > updated all entities appropriately.
> >
> > Be smarter about it by limiting the trigger to perform frequency updates
> > after all accounting logic has done. This ended up being in the
>
> What are the boundaries of the 'accounting logic' here? Is this related
> to the update of all sched_entities and cfs_rq's involved when a task is
> attached/detached (or enqueued/dequeued)?

Yes.

>
> I can't see that there are any premature cfs_rq_util_change() in the
> current code when we consider this.

Thanks for checking. I'll revisit the problem as indicated previously. This
patch is still needed; I'll update the rationale at least and fix the
highlighted
issues with decay.

>
> And avoiding updates for a smaller task to make sure updates for a
> bigger task go through is IMHO not feasible.

Where did this line of thought come from? This patch is about consolidating
how the scheduler requests frequency updates. And later patches require the single
update at tick to pass the new SCHED_CPUFREQ_PERF_HINTS.

If you're referring to the logic in later patches about ignore_short_tasks(),
then we only ignore the performance hints for this task.

Why not feasible? What's the rationale?

>
> I wonder how much influence does this patch has on the test results
> presented the patch header?

The only change of behavior is how we deal with decay, which I thought
wouldn't introduce a functional change, but as caught by Christian, it did.
No functional changes that could affect the test results are supposed to
happen AFAICT.


Cheers

--
Qais Yousef

2023-12-29 00:25:47

by Qais Yousef

Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 12/12/23 12:40, Qais Yousef wrote:
> On 12/12/23 12:06, Vincent Guittot wrote:
>
> > > @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > enqueue_throttle:
> > > assert_list_leaf_cfs_rq(rq);
> > >
> >
> > Here and in the other places below, you lose :
> >
> > - } else if (decayed) {
> >
> > The decayed condition ensures a rate limit (~1ms) in the number of
> > calls to cpufreq_update_util.
> >
> > enqueue/dequeue/tick don't create any sudden change in the PELT
> > signals that would require to update cpufreq of the change unlike
> > attach/detach
>
> Okay, thanks for the clue. Let me rethink this again.

Thinking more about this. Do we really need to send freq updates at
enqueue/attach etc?

I did an experiment to remove all the updates except in three places:

1. Context switch (done unconditionally)
2. Tick
3. update_blocked_averages()

I tried the below patch on a Mac mini with an M1 SoC, 6.6 kernel; Speedometer
scores were the same (against this series).

I ran perf bench sched pipe to see if the addition in the context switch path
would make things worse

before (this series applied):

# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes

Total time: 20.505 [sec]

20.505628 usecs/op
48767 ops/sec

after (proposed patch below applied on top):

# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes

Total time: 19.475 [sec]

19.475838 usecs/op
51345 ops/sec

I tried against vanilla kernel (without any patches, including some dependency
backports) using schedutil governor

# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes

Total time: 22.428 [sec]

22.428166 usecs/op
44586 ops/sec

(I got higher run to run variance against this kernel)

So things got better. I think we can actually unify all updates to be at
context switch and tick for all classes.

The one hole is for long-running CFS tasks without a context switch; with no
updates until the tick, the dvfs headroom needs to cater for that as the worst
case scenario now. I think this is the behavior today actually, but accidental
updates due to enqueue/dequeue could have helped to issue more updates. If none
of that happens, then updating load_avg at entity_tick() is what would have
triggered a frequency update.
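
To put a number on the 'dvfs headroom' here: schedutil currently applies a
~25% margin when mapping util to a frequency (the 1.25 factor applied in
get_next_freq(); exact helper names vary across versions). Reading that margin
as what must absorb util growth between two updates is my interpretation
rather than established behaviour. A rough sketch:

/*
 * Roughly what schedutil requests: freq ~= 1.25 * max_freq * util / max.
 * If frequency updates only happen at context switch and tick, that 25%
 * headroom has to absorb however much util can grow within one tick
 * (plus the hardware's transition delay) as the worst case.
 */
static unsigned long next_freq_sketch(unsigned long util, unsigned long max,
				      unsigned long max_freq)
{
	util += util >> 2;		/* +25% dvfs headroom */

	return max_freq * util / max;
}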

I'm not sure if the blocked averages one is required, to be honest. But
I feared that when the CPU goes idle we might miss reducing the frequency vote
for that CPU - which matters on shared cpufreq policies.

I haven't done comprehensive testing of course, but would love to hear any
thoughts on how we can be more strategic and reduce the number of cpufreq
updates we send. And the iowait boost state needs to be verified.

While testing this series more I did notice that short kworker context
switches can still cause the frequency to go high. I traced this back to
__balance_callbacks() in finish_lock_switch() after calling
uclamp_context_switch(). So I think we do have a problem of updates being
'accidental', and we need to improve the interaction with the governor to be
more intentional, keeping in mind all the constraints we have today in h/w
and software.

--->8---


From 6deed09be1d075afa3858ca62dd54826cdb60d44 Mon Sep 17 00:00:00 2001
From: Qais Yousef <[email protected]>
Date: Tue, 26 Dec 2023 01:23:57 +0000
Subject: [PATCH] sched/fair: Update freq on tick and context switch and
blocked avgs

Signed-off-by: Qais Yousef (Google) <[email protected]>
---
kernel/sched/cpufreq_schedutil.c | 3 ---
kernel/sched/fair.c | 13 +------------
kernel/sched/sched.h | 15 +--------------
3 files changed, 2 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c0879a985097..553a3d7f02d8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -166,9 +166,6 @@ static inline bool ignore_short_tasks(int cpu,
struct task_struct *p = cpu_rq(cpu)->curr;
unsigned long task_util;

- if (!(flags & SCHED_CPUFREQ_PERF_HINTS))
- return false;
-
if (!fair_policy(p->policy))
return false;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d63eae534cec..3a30f78b37d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5717,8 +5717,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
sub_nr_running(rq, task_delta);

done:
- cpufreq_update_util(rq, 0);
-
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5811,8 +5809,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);

- cpufreq_update_util(rq, 0);
-
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
@@ -6675,8 +6671,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

- cpufreq_update_util(rq, p->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
-
hrtick_update(rq);
}

@@ -6754,7 +6748,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
- cpufreq_update_util(rq, 0);
hrtick_update(rq);
}

@@ -8338,7 +8331,6 @@ done: __maybe_unused;

update_misfit_status(p, rq);
sched_fair_update_stop_tick(rq, p);
- cpufreq_update_util(rq, 0);

return p;

@@ -12460,7 +12452,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
- cpufreq_update_util(rq, SCHED_CPUFREQ_PERF_HINTS);
+ cpufreq_update_util(rq, current->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);

task_tick_core(rq, curr);
}
@@ -12585,7 +12577,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

detach_entity_cfs_rq(se);
- cpufreq_update_util(task_rq(p), 0);
}

static void attach_task_cfs_rq(struct task_struct *p)
@@ -12593,7 +12584,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;

attach_entity_cfs_rq(se);
- cpufreq_update_util(task_rq(p), 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12839,7 +12829,6 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
update_cfs_group(se);
}
- cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 516187ea2b81..e1622e2b82be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3076,20 +3076,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
/* Request freq update on context switch if necessary */
static inline void uclamp_context_switch(struct rq *rq)
{
- unsigned long uclamp_min;
- unsigned long uclamp_max;
- unsigned long util;
-
- /* Only RT and FAIR tasks are aware of uclamp */
- if (!rt_policy(current->policy) && !fair_policy(current->policy))
- return;
-
- uclamp_min = uclamp_eff_value(current, UCLAMP_MIN);
- uclamp_max = uclamp_eff_value(current, UCLAMP_MAX);
- util = rq->cfs.avg.util_avg;
-
- if (uclamp_min > util || uclamp_max < util)
- cpufreq_update_util(rq, SCHED_CPUFREQ_PERF_HINTS);
+ cpufreq_update_util(rq, current->in_iowait ? SCHED_CPUFREQ_IOWAIT : 0);
}
#else /* CONFIG_UCLAMP_TASK */
static inline unsigned long uclamp_eff_value(struct task_struct *p,
--
2.40.1


2024-01-03 13:43:14

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On Fri, 29 Dec 2023 at 01:25, Qais Yousef <[email protected]> wrote:
>
> On 12/12/23 12:40, Qais Yousef wrote:
> > On 12/12/23 12:06, Vincent Guittot wrote:
> >
> > > > @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > > enqueue_throttle:
> > > > assert_list_leaf_cfs_rq(rq);
> > > >
> > >
> > > Here and in the other places below, you lose :
> > >
> > > - } else if (decayed) {
> > >
> > > The decayed condition ensures a rate limit (~1ms) in the number of
> > > calls to cpufreq_update_util.
> > >
> > > enqueue/dequeue/tick don't create any sudden change in the PELT
> > > signals that would require to update cpufreq of the change unlike
> > > attach/detach
> >
> > Okay, thanks for the clue. Let me rethink this again.
>
> Thinking more about this. Do we really need to send freq updates at
> enqueue/attach etc?

Yes, attach and detach are the 2 events which can make abrupt and
significant changes in the utilization of the CPU.

>
> I did an experiment to remove all the updates except in three places:
>
> 1. Context switch (done unconditionally)
> 2. Tick
> > 3. update_blocked_averages()

From the PoV of util_avg, attach, detach, tick and
update_blocked_averages are mandatory events to report to cpufreq to
correctly follow utilization.
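
(The abrupt part being that attach folds the entity's whole contribution into
the cfs_rq in one step - abridged from attach_entity_load_avg(), from memory:

	cfs_rq->avg.util_avg += se->avg.util_avg;
	cfs_rq->avg.util_sum += se->avg.util_sum;
	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;

whereas enqueue/dequeue/tick only see the gradual per-ms PELT accrual/decay.)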


2024-01-04 19:40:41

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH 1/4] sched/fair: Be less aggressive in calling cpufreq_update_util()

On 01/03/24 14:41, Vincent Guittot wrote:
> On Fri, 29 Dec 2023 at 01:25, Qais Yousef <[email protected]> wrote:
> >
> > On 12/12/23 12:40, Qais Yousef wrote:
> > > On 12/12/23 12:06, Vincent Guittot wrote:
> > >
> > > > > @@ -6772,6 +6737,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > > > > enqueue_throttle:
> > > > > assert_list_leaf_cfs_rq(rq);
> > > > >
> > > >
> > > > Here and in the other places below, you lose :
> > > >
> > > > - } else if (decayed) {
> > > >
> > > > The decayed condition ensures a rate limit (~1ms) in the number of
> > > > calls to cpufreq_update_util.
> > > >
> > > > enqueue/dequeue/tick don't create any sudden change in the PELT
> > > > signals that would require to update cpufreq of the change unlike
> > > > attach/detach
> > >
> > > Okay, thanks for the clue. Let me rethink this again.
> >
> > Thinking more about this. Do we really need to send freq updates at
> > enqueue/attach etc?
>
> Yes, attach and detach are the 2 events which can make abrupt and
> significant changes in the utilization of the CPU.
>
> >
> > I did an experiment to remove all the updates except in three places:
> >
> > 1. Context switch (done unconditionally)
> > 2. Tick
> > 2. update_blocked_averages()
>
> From the PoV of util_avg, attach, detach, tick and
> update_blocked_averages are mandatory events to report to cpufreq to
> correctly follow utilization.

Okay, I'll re-instate the attach/detach updates.

Worth noting that calling it unconditionally is not a good idea after all.
I'll make sure the context switch updates are protected with a static key for
governors that don't register a hook, and that they are only issued when we
think they're necessary - I did notice some overhead after all against the
reverse-misfit patches. Roughly along the lines of the sketch below.
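
Untested sketch; the key and helper names are just placeholders:

	/* kernel/sched/cpufreq.c */
	DEFINE_STATIC_KEY_FALSE(sched_cpufreq_hook_registered);

	void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
			void (*func)(struct update_util_data *data, u64 time,
				     unsigned int flags))
	{
		...
		rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
		static_branch_inc(&sched_cpufreq_hook_registered);
	}

	void cpufreq_remove_update_util_hook(int cpu)
	{
		rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
		static_branch_dec(&sched_cpufreq_hook_registered);
	}

	/* context switch path (where uclamp_context_switch() sits today) */
	static inline void update_cpufreq_ctx_switch(struct rq *rq)
	{
		/* no governor registered a hook: nothing to do */
		if (!static_branch_unlikely(&sched_cpufreq_hook_registered))
			return;

		cpufreq_update_util(rq, current->in_iowait ?
					SCHED_CPUFREQ_IOWAIT : 0);
	}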


Thanks!

--
Qais Yousef