2023-10-04 09:05:42

by Hongyan Xia

Subject: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg

From: Hongyan Xia <[email protected]>

Track a uclamped version of util_avg in sched_avg, which clamps util_avg
within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
time the util_avg_uclamp of an entity gets updated, we also track the
delta and update the cfs_rq.
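
As an illustration only, here is a minimal userspace model of this
bookkeeping (not the kernel code; the helper and the task values below
are made up):

#include <stdio.h>

/* Clamp a task's util_avg into its [uclamp_min, uclamp_max] window. */
static unsigned int clamp_util(unsigned int util, unsigned int lo,
                               unsigned int hi)
{
        if (util < lo)
                return lo;
        if (util > hi)
                return hi;
        return util;
}

int main(void)
{
        /* Two hypothetical tasks on one cfs_rq, both with uclamp_max = 512. */
        unsigned int util_avg[2]  = { 300, 700 };
        unsigned int se_uclamp[2] = { 0, 0 };
        unsigned int rq_sum = 0;        /* models cfs_rq->avg.util_avg_uclamp */

        for (int i = 0; i < 2; i++) {
                unsigned int clamped = clamp_util(util_avg[i], 0, 512);
                int delta = (int)clamped - (int)se_uclamp[i];

                se_uclamp[i] = clamped; /* per-entity clamped signal */
                rq_sum += delta;        /* the rq level stays the sum */
        }

        /* Prints 812 (300 + 512); the plain util_avg sum would be 1000. */
        printf("cfs_rq util_avg_uclamp = %u\n", rq_sum);
        return 0;
}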

We can't put the update of cfs_rq util_avg_uclamp separately in
propagate_entity_load_avg(), because util_avg_uclamp of se and cfs_rq
are not tracked separately, unlike util_avg. As a result,
util_avg_uclamp of the se and the cfs_rq the se is on must be updated
at the same time.

Signed-off-by: Hongyan Xia <[email protected]>
---
include/linux/sched.h | 9 ++++++++-
kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++++++
kernel/sched/pelt.c | 43 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 177b3f3676ef..825d7b86b006 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -499,7 +499,14 @@ struct sched_avg {
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_avg;
- unsigned long util_avg;
+ unsigned int util_avg;
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * XXX: util_avg shrunk to accommodate util_avg_uclamp.
+ * What are the consequences?
+ */
+ unsigned int util_avg_uclamp;
+#endif
struct util_est util_est;
} ____cacheline_aligned;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b7445cd5af9..33e5a6e751c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1077,6 +1077,9 @@ void post_init_entity_util_avg(struct task_struct *p)
}

sa->runnable_avg = sa->util_avg;
+#ifdef CONFIG_UCLAMP_TASK
+ sa->util_avg_uclamp = sa->util_avg;
+#endif
}

#else /* !CONFIG_SMP */
@@ -5068,6 +5071,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue_fair(cfs_rq, se, flags);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ enqueue_util_avg_uclamp(cfs_rq, se);
se->on_rq = 1;

if (cfs_rq->nr_running == 1) {
@@ -5138,6 +5142,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ dequeue_util_avg_uclamp(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);

@@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
}
#endif

+void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);
+
+static void update_se_chain(struct task_struct *p)
+{
+#ifdef CONFIG_UCLAMP_TASK
+ struct sched_entity *se = &p->se;
+ struct rq *rq = task_rq(p);
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ ___update_util_avg_uclamp(&cfs_rq->avg, se);
+ }
+#endif
+}
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto enqueue_throttle;
}

+ /*
+ * Re-evaluate the se hierarchy now that on_rq is true. This is
+ * important to enforce uclamp the moment a task with uclamp is
+ * enqueued, rather than waiting a timer tick for uclamp to kick in.
+ *
+ * XXX: This duplicates some of the work already done in the above for
+ * loops.
+ */
+ update_se_chain(p);
+
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);

@@ -6612,6 +6642,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+ if (rq->cfs.h_nr_running == 0) {
+ WARN_ONCE(rq->cfs.avg.util_avg_uclamp,
+ "0 tasks on CFS of CPU %d, but util_avg_uclamp is %u\n",
+ rq->cpu, rq->cfs.avg.util_avg_uclamp);
+ WRITE_ONCE(rq->cfs.avg.util_avg_uclamp, 0);
+ }
+#endif
}

#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..c656e4dcb1d1 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,48 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}

+#ifdef CONFIG_UCLAMP_TASK
+/* avg must belong to the queue this se is on. */
+void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
+{
+ unsigned int util;
+ int delta;
+
+ if (entity_is_task(se)) {
+ unsigned int uclamp_min, uclamp_max;
+
+ if (!se->on_rq)
+ return;
+
+ util = READ_ONCE(se->avg.util_avg);
+ uclamp_min = uclamp_eff_value(task_of(se), UCLAMP_MIN);
+ uclamp_max = uclamp_eff_value(task_of(se), UCLAMP_MAX);
+ util = clamp(util, uclamp_min, uclamp_max);
+ } else {
+ util = READ_ONCE(group_cfs_rq(se)->avg.util_avg_uclamp);
+
+ if (!se->on_rq) {
+ WRITE_ONCE(se->avg.util_avg_uclamp, util);
+ return;
+ }
+ }
+
+ delta = util - READ_ONCE(se->avg.util_avg_uclamp);
+ if (delta == 0)
+ return;
+
+ WRITE_ONCE(se->avg.util_avg_uclamp, util);
+ util = READ_ONCE(avg->util_avg_uclamp);
+ util += delta;
+ WRITE_ONCE(avg->util_avg_uclamp, util);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static void
+___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se)
+{
+}
+#endif
+
/*
* sched_entity:
*
@@ -309,6 +351,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
cfs_rq->curr == se)) {

___update_load_avg(&se->avg, se_weight(se));
+ ___update_util_avg_uclamp(&cfs_rq->avg, se);
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3a01b7a2bf66..2eefcdb0c3b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3133,6 +3133,33 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
+
+static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
+ unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp);
+
+ WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, cfs_val + se_val);
+}
+
+static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+ unsigned int cfs_val = READ_ONCE(cfs_rq->avg.util_avg_uclamp);
+ unsigned int se_val = READ_ONCE(se->avg.util_avg_uclamp), new_val;
+
+ if (cfs_val > se_val)
+ new_val = cfs_val - se_val;
+ else {
+ WARN_ONCE(cfs_val < se_val,
+ "CPU %d. cfs_rq %p, cfs_val %u is even less than se_val %u before subtraction\n",
+ rq_of(cfs_rq)->cpu, cfs_rq, cfs_val, se_val);
+ new_val = 0;
+ }
+
+ WRITE_ONCE(cfs_rq->avg.util_avg_uclamp, new_val);
+}
#else /* CONFIG_UCLAMP_TASK */
static inline unsigned long uclamp_eff_value(struct task_struct *p,
enum uclamp_id clamp_id)
@@ -3175,6 +3202,16 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
{
return false;
}
+
+static inline void enqueue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+}
+
+static inline void dequeue_util_avg_uclamp(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+}
#endif /* CONFIG_UCLAMP_TASK */

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
--
2.34.1


2023-10-31 15:52:39

by Dietmar Eggemann

Subject: Re: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg

On 04/10/2023 11:04, Hongyan Xia wrote:
> From: Hongyan Xia <[email protected]>

[...]

> @@ -6445,6 +6450,21 @@ static int sched_idle_cpu(int cpu)
> }
> #endif
>
> +void ___update_util_avg_uclamp(struct sched_avg *avg, struct sched_entity *se);

IMHO, `struct sched_avg *avg` can only be the one of a cfs_rq, so
passing a cfs_rq would eliminate the question of whether it could come
from another se.

> +static void update_se_chain(struct task_struct *p)
> +{
> +#ifdef CONFIG_UCLAMP_TASK
> + struct sched_entity *se = &p->se;
> + struct rq *rq = task_rq(p);
> +
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + ___update_util_avg_uclamp(&cfs_rq->avg, se);
> + }
> +#endif
> +}
> /*
> * The enqueue_task method is called before nr_running is
> * increased. Here we update the fair scheduling stats and
> @@ -6511,6 +6531,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> goto enqueue_throttle;
> }
>
> + /*
> + * Re-evaluate the se hierarchy now that on_rq is true. This is
> + * important to enforce uclamp the moment a task with uclamp is
> + * enqueued, rather than waiting a timer tick for uclamp to kick in.
> + *
> + * XXX: This duplicates some of the work already done in the above for
> + * loops.
> + */
> + update_se_chain(p);

I try to figure out why this is necessary here:

enqueue_task_fair()
for_each_sched_entity()
enqueue_entity()
update_load_avg()
__update_load_avg_se()
___update_util_avg_uclamp() <-- if se->on_rq,
update p (se) if we
cross PELT period (1)
boundaries
___decay_util_avg_uclamp_towards() <-- decay p (se) (2)

enqueue_util_avg_uclamp() <-- enqueue p into cfs_rq (3)

se->on_rq = 1 <-- set p (se) on_rq (4)

for_each_sched_entity()
update_load_avg() <-- update all on_rq se's
in the hierarchy (5)

update_se_chain() <-- update p (se) and its
se hierarchy (6)

(1) Skip p since it is !se->on_rq.

(2) Decay p->se->avg.util_avg_uclamp to 0 since it was sleeping.

(3) Attach p to its cfs_rq

...

(6) Update all se's and cfs_rq's involved in p's task_group
hierarchy (including propagation from se (level=x+1) to cfs_rq
(level=x))

The question for me is why you can't integrate the util_avg_uclamp signals
for se's and cfs_rq's/rq's more closely into the existing PELT functions?

[...]

2023-12-04 16:07:54

by Vincent Guittot

Subject: Re: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg

On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <[email protected]> wrote:
>
> From: Hongyan Xia <[email protected]>
>
> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
> time the util_avg_uclamp of an entity gets updated, we also track the
> delta and update the cfs_rq.

No, please don't do that. Don't start to mix several different signals
into one. Typically, uclamp_min doesn't say that you want to use at least
this amount of CPU capacity.

With tasks that have a util_avg of 10 but a uclamp_min of 1024, what does
it mean when you start to sum this value?
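
(To put illustrative numbers on it: each such task would contribute
clamp(10, 1024, 1024) = 1024, so two of them alone make the cfs_rq sum
2048, even though their combined util_avg is only 20.)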




> [...]

2023-12-05 14:24:48

by Hongyan Xia

Subject: Re: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg

On 04/12/2023 16:07, Vincent Guittot wrote:
> On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <[email protected]> wrote:
>>
>> From: Hongyan Xia <[email protected]>
>>
>> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
>> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
>> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
>> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
>> time the util_avg_uclamp of an entity gets updated, we also track the
>> delta and update the cfs_rq.
>
> No, please don't do that. Don't start to mix several different signals
> into one. Typically, uclamp_min doesn't say that you want to use at least
> this amount of CPU capacity.

But I'd say that with uclamp, PELT is already broken and is a mixed signal
anyway. When uclamp is used, a util_avg value X doesn't mean X; it means
X under some rq uclamp, and the worst part is that the rq uclamp may or
may not have anything to do with this task's uclamp.

Pretending X is a true PELT value now (and not a mixed value) is why we
have so many problems today. For example, in the frequency spike problem,
if a task A has no idle time under uclamp_max, its PELT does not reflect
reality. The moment another task B comes in and uncaps the rq uclamp_max,
the current scheduler code thinks the 1024 of A means a real 1024, which
is wrong and is exactly why we see a spike when B joins. It's also why
we need to special-case 0 spare capacity with recent patches, because rq
util_avg has lost its original PELT meaning under uclamp.
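
(With made-up numbers: if A's uclamp_max is 256 and it never idles, its
raw util_avg ramps towards 1024 while its clamped signal stays at 256.
The moment B lifts the rq-wide cap, max aggregation hands the raw 1024
to the governor at once, whereas the clamped sum would still report 256
plus whatever B contributes.)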

Because X is not the true PELT, we always have to do something to bring
it back into reality. What the current max aggregation code does is
introduce corner cases, like treating 0 spare capacity as a potential
opportunity to queue more tasks (which creates further problems in my
tests), maybe introducing uclamp load balancing special cases in the
future, or introducing uclamp filtering to delay the effect of wrong
PELT values.

What this series does is not much different. We keep the somewhat wrong
value X, but we also remember under what uclamp values we got that X, to
bring things back into reality, which means we now have [X, uclamp_min
when X happens, uclamp_max when X happens]. To save space, this becomes
[X, clamped X], which is what this series does. The original PELT value
X is kept, but we use the clamped X in several places to improve our
decisions.

>
> With tasks that have a util_avg of 10 but a uclamp_min of 1024, what
> does it mean when you start to sum this value?

Like I replied in another comment, assuming a uclamp_min of 1024 is a
hint to run the task on the big CPUs, I don't think it's right to
directly use uclamp as a CPU placement indicator. A uclamp value may
come from ADPF in userspace. An ADPF uclamp_min value of little CPU
capacity + 1 certainly doesn't mean a game deliberately wants to avoid
the little cores. It simply means it wants at least this much
performance, and whether this results in placing the game thread on a
big CPU is purely the job of EAS (or CAS, etc.). We want to use little
CPU + 1 as uclamp_min because we know the SoC and know the little CPU is
bad, but uclamp should be generic and should not rely on knowing the SoC.

Basically, under sum aggregation one would not use a uclamp_min value of
1024 to place a small task on a big core. A uclamp_min of 1024 under sum
aggregation carries its ADPF meaning, which is a hint to try to run me
as fast as possible.

>
>
>> [...]

2023-12-05 16:24:28

by Vincent Guittot

Subject: Re: [RFC PATCH 1/6] sched/uclamp: Track uclamped util_avg in sched_avg

On Tue, 5 Dec 2023 at 15:24, Hongyan Xia <[email protected]> wrote:
>
> On 04/12/2023 16:07, Vincent Guittot wrote:
> > On Wed, 4 Oct 2023 at 11:05, Hongyan Xia <[email protected]> wrote:
> >>
> >> From: Hongyan Xia <[email protected]>
> >>
> >> Track a uclamped version of util_avg in sched_avg, which clamps util_avg
> >> within [uclamp[UCLAMP_MIN], uclamp[UCLAMP_MAX]] every time util_avg is
> >> updated. At the CFS rq level, cfs_rq->avg.util_avg_uclamp must always be
> >> the sum of all util_avg_uclamp of entities on this cfs_rq. So, each
> >> time the util_avg_uclamp of an entity gets updated, we also track the
> >> delta and update the cfs_rq.
> >
> > No, please don't do that. Don't start to mix several different signals
> > into one. Typically, uclamp_min doesn't say that you want to use at
> > least this amount of CPU capacity.
>
> But I'd say with uclamp, PELT is already broken and is a mixed signal

PELT has nothing to do with uclamp. You could argue that EAS makes a
wrong use or mix of PELT signals and uclamp hints to select a CPU, but
PELT itself is not impacted by uclamp and should stay out of uclamp
policy.

> anyway. When uclamp is used, a util_avg value X doesn't mean X, it means
> X under some rq uclamp, and the worst part is that the rq uclamp may or
> may not have anything to do with this task's uclamp.

I think that you are mixing up PELT and how it is (or was) used by EAS.

Have you looked at the latest tip/sched/core and the changes in
effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min,
unsigned long *max)?

We return 3 values:
- the actual utilization which is not impacted by uclamp
- a targeted min value which takes into account uclamp_min
- a targeted max value which takes into account uclamp_max

https://lore.kernel.org/lkml/[email protected]/
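
For illustration only (this is not the actual kernel implementation; the
function below is made up), one way a frequency-selection path could
combine those three values is to use the max target as a ceiling and the
min target as a floor on the requested performance:

#include <stdio.h>

/* Hypothetical helper: bound the unclamped utilization by the uclamp targets. */
static unsigned long pick_perf_target(unsigned long actual,
                                      unsigned long min, unsigned long max)
{
        if (actual > max)       /* uclamp_max acts as a ceiling */
                actual = max;
        if (actual < min)       /* uclamp_min acts as a floor */
                actual = min;
        return actual;
}

int main(void)
{
        /* A busy task capped at 256: request 256, not its raw 1024. */
        printf("%lu\n", pick_perf_target(1024, 0, 256));
        /* A light task boosted to 512: request 512, not its raw 10. */
        printf("%lu\n", pick_perf_target(10, 512, 1024));
        return 0;
}
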

>
> Pretending X is a true PELT value now (and not a mixed value) is why we
> have so many problems today. For example in the frequency spike problem,
> if a task A has no idle time under uclamp_max, its PELT does not reflect
> reality. The moment another task B comes in and uncap the rq uclamp_max,

You are mixing two things. The PELT signal of the task is correct.

> the current scheduler code thinks the 1024 of A means a real 1024, which
> is wrong and is exactly why we see a spike when B joins. It's also why

This is the true actual utilization; the actual util_avg of A really is
1024. But the user wants to give a hint to lower its needs with
uclamp_max.

> we need to special case 0 spare capacity with recent patches, because rq
> util_avg has lost its original PELT meaning under uclamp.
>
> Because X is not the true PELT, we always have to do something to bring
> it back into reality. What the current max aggregation code does is to
> introduce corner cases, like treating 0 spare capacity as potential
> opportunity to queue more tasks (which creates further problems in my
> tests), and maybe introducing uclamp load balancing special cases in the
> future, or introducing uclamp filtering to delay the effect of wrong
> PELT values.
>
> What this series does is not much different. We keep the somewhat wrong
> value X, but we also remember under what uclamp values did we get that X
> to bring things back into reality, which means now we have [X,
> uclamp_min when X happens, uclamp_max when X happens]. To save space,
> this becomes [X, clamped X], which is what this series does. The
> original PELT value X is kept, but we use the clamped X in several
> places to improve our decisions.
>
> >
> > With tasks that have a util_avg of 10 but a uclamp_min of 1024, what
> > does it mean when you start to sum this value?
>
> Like I replied in another comment, assuming a uclamp_min of 1024 is a
> hint to run the task on the big CPUs, I don't think it's right to

Not specifically to run on a big CPU, but to say that the task needs more
performance than its actual utilization suggests.

> directly use uclamp as a CPU placement indicator. A uclamp value may
> come from ADPF in userspace. An ADPF uclamp_min value of little CPU
> capacity + 1 certainly doesn't mean a game on purpose wants to avoid the
> little core. It simply means it wants at least this much performance,
> and whether this results in placing the game thread on a big CPU is
> purely the job of EAS (or CAS, etc.). We want to use little CPU + 1 as
> uclamp_min because we know the SoC and the little CPU is bad, but uclamp
> should be generic and should not rely on knowing the SoC.
>
> Basically, under sum aggregation one would not use a uclamp_min value of
> 1024 to place a small task on a big core. A uclamp_min of 1024 under sum
> aggregation has the meaning in ADPF, which is a hint to try to run me as
> fast as possible.
>
> >
> >
> >> [...]