LinuxLists.cc - [PATCH v6 1/3] sched/fair: Add EAS checks before updating overutilized

2024-03-07 08:58:50

Subject: [PATCH v6 1/3] sched/fair: Add EAS checks before updating overutilized

Overutilized field of root domain is only used for EAS(energy aware scheduler)
to decide whether to do load balance or not. It is not used if EAS
not possible.

Currently enqueue_task_fair and task_tick_fair accesses, sometime updates
this field. In update_sd_lb_stats it is updated often. This causes cache
contention due to true sharing and burns a lot of cycles. overload and
overutilized are part of the same cacheline. Updating it often invalidates
the cacheline. That causes access to overload to slow down due to
false sharing. Hence add EAS check before accessing/updating this field.
EAS check is optimized at compile time or it is a static branch.
Hence it shouldn't cost much.

With the patch, both enqueue_task_fair and newidle_balance don't show
up as hot routines in perf profile.

6.8-rc4:
7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair
6.78% s [kernel.vmlinux] [k] newidle_balance
+patch:
0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair
0.00% swapper [kernel.vmlinux] [k] newidle_balance

Minor change: trace_sched_overutilized_tp expect that second argument to
be bool. So do a int to bool conversion for that.

Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
Reviewed-by: Qais Yousef <[email protected]>
Reviewed-by: Srikar Dronamraju <[email protected]>
Signed-off-by: Shrikanth Hegde <[email protected]>
---
kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++----------------
1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..5aa6e918598c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6663,22 +6663,42 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}

-static inline void update_overutilized_status(struct rq *rq)
+static inline void set_rd_overutilized_status(struct root_domain *rd,
+ unsigned int status)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, status);
+ trace_sched_overutilized_tp(rd, !!status);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+ if (!sched_energy_enabled())
+ return;
+
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif

/* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6779,7 +6799,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* and the following generally works well enough in practice.
*/
if (!task_new)
- update_overutilized_status(rq);
+ check_update_overutilized_status(rq);

enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -10596,19 +10616,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat);

if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);

/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
+ set_rd_overutilized_status(env->dst_rq->rd,
+ sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
}

update_idle_cpu_scan(env, sum_util);
@@ -12609,7 +12624,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);

update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));

task_tick_core(rq, curr);
}
--
2.39.3

2024-03-07 16:51:07

by Vincent Guittot

[permalink] [raw]

Subject: Re: [PATCH v6 1/3] sched/fair: Add EAS checks before updating overutilized

On Thu, 7 Mar 2024 at 09:57, Shrikanth Hegde <[email protected]> wrote:
>
> Overutilized field of root domain is only used for EAS(energy aware scheduler)
> to decide whether to do load balance or not. It is not used if EAS
> not possible.
>
> Currently enqueue_task_fair and task_tick_fair accesses, sometime updates
> this field. In update_sd_lb_stats it is updated often. This causes cache
> contention due to true sharing and burns a lot of cycles. overload and
> overutilized are part of the same cacheline. Updating it often invalidates
> the cacheline. That causes access to overload to slow down due to
> false sharing. Hence add EAS check before accessing/updating this field.
> EAS check is optimized at compile time or it is a static branch.
> Hence it shouldn't cost much.
>
> With the patch, both enqueue_task_fair and newidle_balance don't show
> up as hot routines in perf profile.
>
> 6.8-rc4:
> 7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair
> 6.78% s [kernel.vmlinux] [k] newidle_balance
> +patch:
> 0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair
> 0.00% swapper [kernel.vmlinux] [k] newidle_balance
>
> Minor change: trace_sched_overutilized_tp expect that second argument to
> be bool. So do a int to bool conversion for that.
>
> Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
> Reviewed-by: Qais Yousef <[email protected]>
> Reviewed-by: Srikar Dronamraju <[email protected]>
> Signed-off-by: Shrikanth Hegde <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++----------------
> 1 file changed, 34 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a16129f9a5c..5aa6e918598c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6663,22 +6663,42 @@ static inline void hrtick_update(struct rq *rq)
> #ifdef CONFIG_SMP
> static inline bool cpu_overutilized(int cpu)
> {
> - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> + unsigned long rq_util_min, rq_util_max;
> +
> + if (!sched_energy_enabled())
> + return false;
> +
> + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
>
> /* Return true only if the utilization doesn't fit CPU's capacity */
> return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> }
>
> -static inline void update_overutilized_status(struct rq *rq)
> +static inline void set_rd_overutilized_status(struct root_domain *rd,
> + unsigned int status)
> {
> - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
> - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
> - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
> - }
> + if (!sched_energy_enabled())
> + return;
> +
> + WRITE_ONCE(rd->overutilized, status);
> + trace_sched_overutilized_tp(rd, !!status);
> +}
> +
> +static inline void check_update_overutilized_status(struct rq *rq)
> +{
> + /*
> + * overutilized field is used for load balancing decisions only
> + * if energy aware scheduler is being used
> + */
> + if (!sched_energy_enabled())
> + return;
> +
> + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
> + set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
> }
> #else
> -static inline void update_overutilized_status(struct rq *rq) { }
> +static inline void check_update_overutilized_status(struct rq *rq) { }
> #endif
>
> /* Runqueue only has SCHED_IDLE tasks enqueued */
> @@ -6779,7 +6799,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> * and the following generally works well enough in practice.
> */
> if (!task_new)
> - update_overutilized_status(rq);
> + check_update_overutilized_status(rq);
>
> enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
> @@ -10596,19 +10616,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
> if (!env->sd->parent) {
> - struct root_domain *rd = env->dst_rq->rd;
> -
> /* update overload indicator if we are at root domain */
> - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
> + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
>
> /* Update over-utilization (tipping point, U >= 0) indicator */
> - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
> - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
> + set_rd_overutilized_status(env->dst_rq->rd,
> + sg_status & SG_OVERUTILIZED);
> } else if (sg_status & SG_OVERUTILIZED) {
> - struct root_domain *rd = env->dst_rq->rd;
> -
> - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
> - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
> + set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
> }
>
> update_idle_cpu_scan(env, sum_util);
> @@ -12609,7 +12624,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> task_tick_numa(rq, curr);
>
> update_misfit_status(curr, rq);
> - update_overutilized_status(task_rq(curr));
> + check_update_overutilized_status(task_rq(curr));
>
> task_tick_core(rq, curr);
> }
> --
> 2.39.3
>

2024-03-26 08:08:00

by tip-bot2 for Alexey Makhalov

[permalink] [raw]

Subject: [tip: sched/core] sched/fair: Add EAS checks before updating root_domain::overutilized

The following commit has been merged into the sched/core branch of tip:

Commit-ID: be3a51e68f2f1b17250ce40d8872c7645b7a2991
Gitweb: https://git.kernel.org/tip/be3a51e68f2f1b17250ce40d8872c7645b7a2991
Author: Shrikanth Hegde <[email protected]>
AuthorDate: Thu, 07 Mar 2024 14:27:23 +05:30
Committer: Ingo Molnar <[email protected]>
CommitterDate: Tue, 26 Mar 2024 08:58:59 +01:00

sched/fair: Add EAS checks before updating root_domain::overutilized

root_domain::overutilized is only used for EAS(energy aware scheduler)
to decide whether to do load balance or not. It is not used if EAS
not possible.

Currently enqueue_task_fair and task_tick_fair accesses, sometime updates
this field. In update_sd_lb_stats it is updated often. This causes cache
contention due to true sharing and burns a lot of cycles. ::overload and
::overutilized are part of the same cacheline. Updating it often invalidates
the cacheline. That causes access to ::overload to slow down due to
false sharing. Hence add EAS check before accessing/updating this field.
EAS check is optimized at compile time or it is a static branch.
Hence it shouldn't cost much.

With the patch, both enqueue_task_fair and newidle_balance don't show
up as hot routines in perf profile.

6.8-rc4:
7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair
6.78% s [kernel.vmlinux] [k] newidle_balance

+patch:
0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair
0.00% swapper [kernel.vmlinux] [k] newidle_balance

While at it: trace_sched_overutilized_tp expect that second argument to
be bool. So do a int to bool conversion for that.

Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
Signed-off-by: Shrikanth Hegde <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Reviewed-by: Qais Yousef <[email protected]>
Reviewed-by: Srikar Dronamraju <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++----------------
1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbf4f1c..1afa4f8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6673,22 +6673,42 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

/* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}

-static inline void update_overutilized_status(struct rq *rq)
+static inline void set_rd_overutilized_status(struct root_domain *rd,
+ unsigned int status)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, status);
+ trace_sched_overutilized_tp(rd, !!status);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+ if (!sched_energy_enabled())
+ return;
+
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif

/* Runqueue only has SCHED_IDLE tasks enqueued */
@@ -6789,7 +6809,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* and the following generally works well enough in practice.
*/
if (!task_new)
- update_overutilized_status(rq);
+ check_update_overutilized_status(rq);

enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
@@ -10630,19 +10650,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
env->fbq_type = fbq_classify_group(&sds->busiest_stat);

if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);

/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
+ set_rd_overutilized_status(env->dst_rq->rd,
+ sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
}

update_idle_cpu_scan(env, sum_util);
@@ -12667,7 +12682,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);

update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));

task_tick_core(rq, curr);
}