2022-04-10 16:54:08

by Abel Wu

Subject: [RFC v2 2/2] sched/fair: introduce sched-idle balance

The periodic (normal/idle) balancing is regulated by intervals on each
sched-domain, and these intervals can prevent the unoccupied cpus from
promptly pulling non-idle tasks. The newly-idle balancing, on the other
hand, is triggered only when a cpu becomes fully idle, which sadly is
never the case for the sched-idle cpus. There are also some other
constraints that stand in the way of making the unoccupied cpus busier.

Given the above, sched-idle balancing is an extension to the existing
load balance mechanisms on the unoccupied cpus that lets them quickly
pull non-idle tasks from the overloaded cpus. This is achieved by:

- Quit early in periodic load balancing if the cpu is no longer
  idle. This is similar to what we do in the newly-idle case,
  where we stop balancing once we have got some work to do
  (although that is partly because newly-idle balancing can be
  very frequent, while periodic balancing is not).

- The newly-idle balancing will try harder to pull non-idle
  tasks if overloaded cpus exist.

In this way we will fill the unoccupied cpus more proactively to get
more cpu capacity for the non-idle tasks.
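
For reference, the classification of a runqueue used throughout this
patch boils down to its two cfs counters. The snippet below is a
stand-alone, user-space toy model (struct name, helper names and the
sample values are made up for illustration; it only mirrors the
cfs_rq_overloaded()/cfs_rq_busy()/need_pull_cfs_task() helpers added
in the diff below):

#include <stdbool.h>
#include <stdio.h>

/* Toy runqueue: just the two counters the helpers look at. */
struct toy_rq {
        unsigned int h_nr_running;      /* all runnable cfs tasks (hierarchical) */
        unsigned int idle_h_nr_running; /* the sched-idle subset of the above */
};

/* More than one non-idle task: a candidate source to pull from. */
static bool toy_overloaded(const struct toy_rq *rq)
{
        return rq->h_nr_running - rq->idle_h_nr_running > 1;
}

/* Exactly one non-idle task: should not lose it to another cpu. */
static bool toy_busy(const struct toy_rq *rq)
{
        return rq->h_nr_running - rq->idle_h_nr_running == 1;
}

/* No non-idle task at all (fully idle or sched-idle): wants to pull. */
static bool toy_unoccupied(const struct toy_rq *rq)
{
        return rq->h_nr_running == rq->idle_h_nr_running;
}

int main(void)
{
        struct toy_rq a = { 3, 1 };     /* two non-idle tasks: overloaded */
        struct toy_rq b = { 4, 4 };     /* only idle tasks: unoccupied */

        printf("A: overloaded=%d busy=%d unoccupied=%d\n",
               toy_overloaded(&a), toy_busy(&a), toy_unoccupied(&a));
        printf("B: overloaded=%d busy=%d unoccupied=%d\n",
               toy_overloaded(&b), toy_busy(&b), toy_unoccupied(&b));
        return 0;
}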

Signed-off-by: Abel Wu <[email protected]>
---
include/linux/sched/idle.h | 1 +
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 145 ++++++++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 2 +
4 files changed, 142 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index d73d314d59c6..50ec5c770f85 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -8,6 +8,7 @@ enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
+ CPU_SCHED_IDLE,
CPU_MAX_IDLE_TYPES
};

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a372881f8eaf..c05c39541c4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9495,6 +9495,7 @@ void __init sched_init(void)
rq->wake_stamp = jiffies;
rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+ rq->sched_idle_balance = 0;
rq->overloaded = 0;

INIT_LIST_HEAD(&rq->cfs_tasks);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbeb05321615..5fca3bb98273 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -456,6 +456,21 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}

+/* Is this an idle task */
+static int task_h_idle(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ if (task_has_idle_policy(p))
+ return 1;
+
+ for_each_sched_entity(se)
+ if (cfs_rq_is_idle(cfs_rq_of(se)))
+ return 1;
+
+ return 0;
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
@@ -508,6 +523,11 @@ static int se_is_idle(struct sched_entity *se)
return 0;
}

+static inline int task_h_idle(struct task_struct *p)
+{
+ return task_has_idle_policy(p);
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
@@ -7039,6 +7059,16 @@ static inline bool cfs_rq_overloaded(struct rq *rq)
return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running > 1;
}

+static inline bool cfs_rq_busy(struct rq *rq)
+{
+ return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running == 1;
+}
+
+static inline bool need_pull_cfs_task(struct rq *rq)
+{
+ return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
+}
+
/*
* Use locality-friendly rq->overloaded to cache the status of the rq
* to minimize the heavy cost on LLC shared data.
@@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;

+ if (unlikely(task_h_idle(p))) {
+ /*
+ * Disregard hierarchically idle tasks during sched-idle
+ * load balancing.
+ */
+ if (env->idle == CPU_SCHED_IDLE)
+ return 0;
+ } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
+ /*
+ * It's not gonna help if stacking non-idle tasks on one
+ * cpu while leaving some idle.
+ */
+ if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
+ return 0;
+ }
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;

@@ -10337,6 +10383,68 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
}

/*
+ * The sched-idle balancing tries to make full use of cpu capacity
+ * for non-idle tasks by pulling them for the unoccupied cpus from
+ * the overloaded ones.
+ *
+ * Return 1 if pulled successfully, 0 otherwise.
+ */
+static int sched_idle_balance(struct rq *dst_rq)
+{
+ struct sched_domain *sd;
+ struct task_struct *p = NULL;
+ int dst_cpu = cpu_of(dst_rq), cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
+ if (unlikely(!sd))
+ return 0;
+
+ if (!atomic_read(&sd->shared->nr_overloaded))
+ return 0;
+
+ for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct lb_env env;
+
+ if (cpu == dst_cpu || !cfs_rq_overloaded(rq) ||
+ READ_ONCE(rq->sched_idle_balance))
+ continue;
+
+ WRITE_ONCE(rq->sched_idle_balance, 1);
+ rq_lock_irqsave(rq, &rf);
+
+ env = (struct lb_env) {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = cpu,
+ .src_rq = rq,
+ .idle = CPU_SCHED_IDLE, /* non-idle only */
+ .flags = LBF_DST_PINNED, /* pin dst_cpu */
+ };
+
+ update_rq_clock(rq);
+ p = detach_one_task(&env);
+ if (p)
+ update_overload_status(rq);
+
+ rq_unlock(rq, &rf);
+ WRITE_ONCE(rq->sched_idle_balance, 0);
+
+ if (p) {
+ attach_one_task(dst_rq, p);
+ local_irq_restore(rf.flags);
+ return 1;
+ }
+
+ local_irq_restore(rf.flags);
+ }
+
+ return 0;
+}
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
@@ -10356,6 +10464,15 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
u64 max_cost = 0;

rcu_read_lock();
+
+ /*
+ * Quit early if this cpu is no longer idle. This should not be a
+ * problem since we have already made some contribution to fixing
+ * the imbalance.
+ */
+ if (need_pull_cfs_task(rq) && sched_idle_balance(rq))
+ continue_balancing = 0;
+
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
@@ -10934,7 +11051,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int this_cpu = this_rq->cpu;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
- int pulled_task = 0;
+ struct sched_domain_shared *sds;
+ int pulled_task = 0, has_overloaded_cpus = 0;

update_misfit_status(NULL, this_rq);

@@ -10985,6 +11103,11 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_blocked_averages(this_cpu);

rcu_read_lock();
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
+ if (likely(sds))
+ has_overloaded_cpus = atomic_read(&sds->nr_overloaded);
+
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 domain_cost;
@@ -10996,9 +11119,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

if (sd->flags & SD_BALANCE_NEWIDLE) {

- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
+ pulled_task |= load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);

t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
@@ -11006,13 +11129,21 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq, given that no
+ * overloaded cpu can be found on this LLC.
+ */
+ if (pulled_task && !has_overloaded_cpus)
+ break;
}

/*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * Try harder to pull non-idle tasks so they can use as much
+ * cpu capacity as possible.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
+ if (this_rq->nr_running > this_rq->cfs.idle_h_nr_running ||
this_rq->ttwu_pending)
break;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index afa1bb68c3ec..dcceaec8d8b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1012,6 +1012,8 @@ struct rq {

unsigned char nohz_idle_balance;
unsigned char idle_balance;
+
+ unsigned char sched_idle_balance;
unsigned char overloaded;

unsigned long misfit_task_load;
--
2.11.0


2022-04-12 22:18:59

by Abel Wu

Subject: Re: [RFC v2 2/2] sched/fair: introduce sched-idle balance

Hi Josh,

On 4/12/22 9:59 AM, Josh Don Wrote:
> Hi Abel,
>
>>
>> +static inline bool cfs_rq_busy(struct rq *rq)
>> +{
>> + return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running == 1;
>> +}
>> +
>> +static inline bool need_pull_cfs_task(struct rq *rq)
>> +{
>> + return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
>> +}
>
> Note that this will also return true if there are 0 tasks, which I
> don't think is the semantics you intend for its use in
> rebalance_domains() below.

That is intended, to also cover the idle-balance case. My v1 patchset
wanted to leave the idle balance out because of the high cpu wakeup
latency, but after some benchmarking that exclusion seems unnecessary.

>
>> /*
>> * Use locality-friendly rq->overloaded to cache the status of the rq
>> * to minimize the heavy cost on LLC shared data.
>> @@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>> if (kthread_is_per_cpu(p))
>> return 0;
>>
>> + if (unlikely(task_h_idle(p))) {
>> + /*
>> + * Disregard hierarchically idle tasks during sched-idle
>> + * load balancing.
>> + */
>> + if (env->idle == CPU_SCHED_IDLE)
>> + return 0;
>> + } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
>> + /*
>> + * It's not gonna help if stacking non-idle tasks on one
>> + * cpu while leaving some idle.
>> + */
>> + if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
>> + return 0;
>
> These checks don't involve the task at all, so this kind of check
> should be pushed into the more general load balance function. But, I'm
> not totally clear on the motivation here. If we have cpu A with 1
> non-idle task and 100 idle tasks, and cpu B with 1 non-idle task, we
> should definitely try to load balance some of the idle tasks from A to
> B. idle tasks _do_ get time to run (although little), and this can add
> up and cause antagonism to the non-idle task if there are a lot of
> idle threads.

CPU_SCHED_IDLE means the balance is triggered by sched_idle_balance(),
which pulls a non-idle task to the unoccupied cpu from the overloaded
ones, so idle tasks are not the target and should be skipped.

The second part is: if we have cpu A with 1 non-idle task and 100 idle
tasks, and B with >=1 non-idle task, we don't migrate the last non-idle
task on A to B.
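
To make the second part concrete, here is a tiny stand-alone sketch
(plain C with made-up numbers; it just mirrors the cfs_rq_busy() and
need_pull_cfs_task() helpers from the patch):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* cpu A: 1 non-idle + 100 idle tasks */
        unsigned int a_h = 101, a_idle_h = 100;
        /* cpu B: 1 non-idle task, no idle tasks */
        unsigned int b_h = 1, b_idle_h = 0;

        bool src_busy  = (a_h - a_idle_h == 1); /* cfs_rq_busy(A) */
        bool dst_needs = (b_h == b_idle_h);     /* need_pull_cfs_task(B) */

        /* can_migrate_task() rejects moving A's last non-idle task to B */
        printf("reject = %d\n", src_busy && !dst_needs);        /* prints 1 */
        return 0;
}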

>
>>
>> /*
>> + * The sched-idle balancing tries to make full use of cpu capacity
>> + * for non-idle tasks by pulling them for the unoccupied cpus from
>> + * the overloaded ones.
>> + *
>> + * Return 1 if pulled successfully, 0 otherwise.
>> + */
>> +static int sched_idle_balance(struct rq *dst_rq)
>> +{
>> + struct sched_domain *sd;
>> + struct task_struct *p = NULL;
>> + int dst_cpu = cpu_of(dst_rq), cpu;
>> +
>> + sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
>> + if (unlikely(!sd))
>> + return 0;
>> +
>> + if (!atomic_read(&sd->shared->nr_overloaded))
>> + return 0;
>> +
>> + for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
>> + struct rq *rq = cpu_rq(cpu);
>> + struct rq_flags rf;
>> + struct lb_env env;
>> +
>> + if (cpu == dst_cpu || !cfs_rq_overloaded(rq) ||
>> + READ_ONCE(rq->sched_idle_balance))
>> + continue;
>> +
>> + WRITE_ONCE(rq->sched_idle_balance, 1);
>> + rq_lock_irqsave(rq, &rf);
>> +
>> + env = (struct lb_env) {
>> + .sd = sd,
>> + .dst_cpu = dst_cpu,
>> + .dst_rq = dst_rq,
>> + .src_cpu = cpu,
>> + .src_rq = rq,
>> + .idle = CPU_SCHED_IDLE, /* non-idle only */
>> + .flags = LBF_DST_PINNED, /* pin dst_cpu */
>> + };
>> +
>> + update_rq_clock(rq);
>> + p = detach_one_task(&env);
>> + if (p)
>> + update_overload_status(rq);
>> +
>> + rq_unlock(rq, &rf);
>> + WRITE_ONCE(rq->sched_idle_balance, 0);
>> +
>> + if (p) {
>> + attach_one_task(dst_rq, p);
>> + local_irq_restore(rf.flags);
>> + return 1;
>> + }
>> +
>> + local_irq_restore(rf.flags);
>> + }
>> +
>> + return 0;
>> +}
>
> I think this could probably be integrated with the load balancing
> function. Your goal is to ignore idle tasks for the purpose of pulling
> from a remote rq. And I think the above isn't exactly what you want
> anyway; detach_tasks/detach_one_task are just going to iterate the
> task list in order. You want to actually look for the non-idle tasks
> explicitly.

I have tried a simple version like below (and sched_idle_balance() is
not needed anymore):

@@ -10338,6 +10343,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int prev_busy = busy;
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -10394,6 +10400,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
+
+ if (!prev_busy && !need_pull_cfs_task(rq))
+ break;
}
if (need_decay) {
/*

But the benchmark results are not as good as with the RFC v2 patchset.
I will dig deeper into this, thanks.

>
>> @@ -10996,9 +11119,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>>
>> if (sd->flags & SD_BALANCE_NEWIDLE) {
>>
>> - pulled_task = load_balance(this_cpu, this_rq,
>> - sd, CPU_NEWLY_IDLE,
>> - &continue_balancing);
>> + pulled_task |= load_balance(this_cpu, this_rq,
>> + sd, CPU_NEWLY_IDLE,
>> + &continue_balancing);
>
> Why |= ?

This is because I changed the behavior of newidle balance a bit. The
vanilla kernel quits newidle balance once we have got a task to run on
this rq, no matter whether that task is non-idle or not. After this
patch, if there are overloaded cpus in this LLC, we will try harder and
keep balancing until we have got a non-idle task, which means the
balancing can continue even if the cpu is now sched-idle.
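
As a stand-alone sketch of the resulting loop-exit logic (plain C, not
the kernel code; the function and parameter names are made up):

#include <stdbool.h>

/*
 * When does the patched newidle loop keep iterating over the sched
 * domains?  Stop once something non-idle is runnable locally (or a
 * wakeup is pending), or once a task has been pulled and no overloaded
 * cpu remains in the LLC; otherwise keep looking for a non-idle task.
 */
bool keep_balancing(unsigned int nr_running,
                    unsigned int idle_h_nr_running,
                    bool pulled_task,
                    bool has_overloaded_cpus,
                    bool ttwu_pending)
{
        if (nr_running > idle_h_nr_running || ttwu_pending)
                return false;   /* got non-idle work (or a pending wakeup) */
        if (pulled_task && !has_overloaded_cpus)
                return false;   /* pulled a task, nothing left to strip */
        return true;            /* still sched-idle: try the next domain */
}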

Thanks & BR,
Abel

2022-04-12 23:04:12

by Josh Don

Subject: Re: [RFC v2 2/2] sched/fair: introduce sched-idle balance

Hi Abel,

>
> +static inline bool cfs_rq_busy(struct rq *rq)
> +{
> + return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running == 1;
> +}
> +
> +static inline bool need_pull_cfs_task(struct rq *rq)
> +{
> + return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
> +}

Note that this will also return true if there are 0 tasks, which I
don't think is the semantics you intend for its use in
rebalance_domains() below.

> /*
> * Use locality-friendly rq->overloaded to cache the status of the rq
> * to minimize the heavy cost on LLC shared data.
> @@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
> if (kthread_is_per_cpu(p))
> return 0;
>
> + if (unlikely(task_h_idle(p))) {
> + /*
> + * Disregard hierarchically idle tasks during sched-idle
> + * load balancing.
> + */
> + if (env->idle == CPU_SCHED_IDLE)
> + return 0;
> + } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
> + /*
> + * It's not gonna help if stacking non-idle tasks on one
> + * cpu while leaving some idle.
> + */
> + if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
> + return 0;

These checks don't involve the task at all, so this kind of check
should be pushed into the more general load balance function. But, I'm
not totally clear on the motivation here. If we have cpu A with 1
non-idle task and 100 idle tasks, and cpu B with 1 non-idle task, we
should definitely try to load balance some of the idle tasks from A to
B. idle tasks _do_ get time to run (although little), and this can add
up and cause antagonism to the non-idle task if there are a lot of
idle threads.

>
> /*
> + * The sched-idle balancing tries to make full use of cpu capacity
> + * for non-idle tasks by pulling them for the unoccupied cpus from
> + * the overloaded ones.
> + *
> + * Return 1 if pulled successfully, 0 otherwise.
> + */
> +static int sched_idle_balance(struct rq *dst_rq)
> +{
> + struct sched_domain *sd;
> + struct task_struct *p = NULL;
> + int dst_cpu = cpu_of(dst_rq), cpu;
> +
> + sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
> + if (unlikely(!sd))
> + return 0;
> +
> + if (!atomic_read(&sd->shared->nr_overloaded))
> + return 0;
> +
> + for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
> + struct rq *rq = cpu_rq(cpu);
> + struct rq_flags rf;
> + struct lb_env env;
> +
> + if (cpu == dst_cpu || !cfs_rq_overloaded(rq) ||
> + READ_ONCE(rq->sched_idle_balance))
> + continue;
> +
> + WRITE_ONCE(rq->sched_idle_balance, 1);
> + rq_lock_irqsave(rq, &rf);
> +
> + env = (struct lb_env) {
> + .sd = sd,
> + .dst_cpu = dst_cpu,
> + .dst_rq = dst_rq,
> + .src_cpu = cpu,
> + .src_rq = rq,
> + .idle = CPU_SCHED_IDLE, /* non-idle only */
> + .flags = LBF_DST_PINNED, /* pin dst_cpu */
> + };
> +
> + update_rq_clock(rq);
> + p = detach_one_task(&env);
> + if (p)
> + update_overload_status(rq);
> +
> + rq_unlock(rq, &rf);
> + WRITE_ONCE(rq->sched_idle_balance, 0);
> +
> + if (p) {
> + attach_one_task(dst_rq, p);
> + local_irq_restore(rf.flags);
> + return 1;
> + }
> +
> + local_irq_restore(rf.flags);
> + }
> +
> + return 0;
> +}

I think this could probably be integrated with the load balancing
function. Your goal is to ignore idle tasks for the purpose of pulling
from a remote rq. And I think the above isn't exactly what you want
anyway; detach_tasks/detach_one_task are just going to iterate the
task list in order. You want to actually look for the non-idle tasks
explicitly.

> @@ -10996,9 +11119,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>
> if (sd->flags & SD_BALANCE_NEWIDLE) {
>
> - pulled_task = load_balance(this_cpu, this_rq,
> - sd, CPU_NEWLY_IDLE,
> - &continue_balancing);
> + pulled_task |= load_balance(this_cpu, this_rq,
> + sd, CPU_NEWLY_IDLE,
> + &continue_balancing);

Why |= ?

Thanks,
Josh

2022-04-14 12:01:34

by Josh Don

Subject: Re: [RFC v2 2/2] sched/fair: introduce sched-idle balance

> >> /*
> >> * Use locality-friendly rq->overloaded to cache the status of the rq
> >> * to minimize the heavy cost on LLC shared data.
> >> @@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
> >> if (kthread_is_per_cpu(p))
> >> return 0;
> >>
> >> + if (unlikely(task_h_idle(p))) {
> >> + /*
> >> + * Disregard hierarchically idle tasks during sched-idle
> >> + * load balancing.
> >> + */
> >> + if (env->idle == CPU_SCHED_IDLE)
> >> + return 0;
> >> + } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
> >> + /*
> >> + * It's not gonna help if stacking non-idle tasks on one
> >> + * cpu while leaving some idle.
> >> + */
> >> + if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
> >> + return 0;
> >
> > These checks don't involve the task at all, so this kind of check
> > should be pushed into the more general load balance function. But, I'm
> > not totally clear on the motivation here. If we have cpu A with 1
> > non-idle task and 100 idle tasks, and cpu B with 1 non-idle task, we
> > should definitely try to load balance some of the idle tasks from A to
> > B. idle tasks _do_ get time to run (although little), and this can add
> > up and cause antagonism to the non-idle task if there are a lot of
> > idle threads.
>
> CPU_SCHED_IDLE means triggered by sched_idle_balance() in which pulls
> a non-idle task for the unoccupied cpu from the overloaded ones, so
> idle tasks are not the target and should be skipped.
>
> The second part is: if we have cpu A with 1 non-idle task and 100 idle
> tasks, and B with >=1 non-idle task, we don't migrate the last non-idle
> task on A to B.

It could be possible that we do want to migrate the last non-idle task
from A to B, if the weight sum of idle tasks on A is very high (easily
possible with affinity restrictions). So I think we should leave
regular load balance alone here if it really wants to move the
non-idle task, and wrap this entire block in an if (env->idle ==
CPU_SCHED_IDLE).
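
For illustration, the block in can_migrate_task() might then look
roughly like this (only a sketch of the suggestion, not a tested
patch):

        if (env->idle == CPU_SCHED_IDLE) {
                /* sched-idle balancing only wants non-idle tasks... */
                if (task_h_idle(p))
                        return 0;
                /* ...and should not strip a cpu of its only non-idle task. */
                if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
                    cfs_rq_busy(env->src_rq) &&
                    !need_pull_cfs_task(env->dst_rq))
                        return 0;
        }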

Thanks,
Josh

2022-04-16 00:15:29

by Abel Wu

Subject: Re: [RFC v2 2/2] sched/fair: introduce sched-idle balance

On 4/14/22 8:08 AM, Josh Don Wrote:
>>>> /*
>>>> * Use locality-friendly rq->overloaded to cache the status of the rq
>>>> * to minimize the heavy cost on LLC shared data.
>>>> @@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>>>> if (kthread_is_per_cpu(p))
>>>> return 0;
>>>>
>>>> + if (unlikely(task_h_idle(p))) {
>>>> + /*
>>>> + * Disregard hierarchically idle tasks during sched-idle
>>>> + * load balancing.
>>>> + */
>>>> + if (env->idle == CPU_SCHED_IDLE)
>>>> + return 0;
>>>> + } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
>>>> + /*
>>>> + * It's not gonna help if stacking non-idle tasks on one
>>>> + * cpu while leaving some idle.
>>>> + */
>>>> + if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
>>>> + return 0;
>>>
>>> These checks don't involve the task at all, so this kind of check
>>> should be pushed into the more general load balance function. But, I'm
>>> not totally clear on the motivation here. If we have cpu A with 1
>>> non-idle task and 100 idle tasks, and cpu B with 1 non-idle task, we
>>> should definitely try to load balance some of the idle tasks from A to
>>> B. idle tasks _do_ get time to run (although little), and this can add
>>> up and cause antagonism to the non-idle task if there are a lot of
>>> idle threads.
>>
>> CPU_SCHED_IDLE means triggered by sched_idle_balance() in which pulls
>> a non-idle task for the unoccupied cpu from the overloaded ones, so
>> idle tasks are not the target and should be skipped.
>>
>> The second part is: if we have cpu A with 1 non-idle task and 100 idle
>> tasks, and B with >=1 non-idle task, we don't migrate the last non-idle
>> task on A to B.
>
> It could be possible that we do want to migrate the last non-idle task
> from A to B, if the weight sum of idle tasks on A is very high (easily
> possible with affinity restrictions). So I think we should leave
> regular load balance alone here if it really wants to move the
> non-idle task, and wrap this entire block in an if (env->idle ==
> CPU_SCHED_IDLE).

Makes sense. I will fix it in next version.

Thanks & BR,
Abel

2022-04-27 13:46:23

by kernel test robot

Subject: [sched/fair] ae44f2177f: reaim.jobs_per_min 2.3% improvement



Greeting,

FYI, we noticed a 2.3% improvement of reaim.jobs_per_min due to commit:


commit: ae44f2177fa75c271734d5963972faecc3686c12 ("[RFC v2 2/2] sched/fair: introduce sched-idle balance")
url: https://github.com/intel-lab-lkp/linux/commits/Abel-Wu/introduece-sched-idle-balance/20220409-215303
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 089c02ae2771a14af2928c59c56abfb9b885a8d7
patch link: https://lore.kernel.org/lkml/[email protected]

in testcase: reaim
on test machine: 16 threads 1 sockets Intel(R) Xeon(R) E-2278G CPU @ 3.40GHz with 32G memory
with following parameters:

runtime: 300s
nr_task: 100%
test: new_fserver
cpufreq_governor: performance
ucode: 0xec

test-description: REAIM is an updated and improved version of AIM 7 benchmark.
test-url: https://sourceforge.net/projects/re-aim-7/


Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file

# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.

=========================================================================================
compiler/cpufreq_governor/kconfig/nr_task/rootfs/runtime/tbox_group/test/testcase/ucode:
gcc-11/performance/x86_64-rhel-8.3/100%/debian-10.4-x86_64-20200603.cgz/300s/lkp-cfl-e1/new_fserver/reaim/0xec

commit:
6b433275e3 ("sched/fair: filter out overloaded cpus in SIS")
ae44f2177f ("sched/fair: introduce sched-idle balance")

6b433275e3a3cf18 ae44f2177fa75c271734d596397
---------------- ---------------------------
%stddev %change %stddev
\ | \
356517 +2.3% 364606 reaim.jobs_per_min
22282 +2.3% 22787 reaim.jobs_per_min_child
0.26 -2.2% 0.26 reaim.parent_time
1.87 ? 3% -33.1% 1.25 ? 4% reaim.std_dev_percent
0.00 ? 11% -83.0% 0.00 ? 22% reaim.std_dev_time
71394 +5.5% 75296 reaim.time.involuntary_context_switches
412752 +2.9% 424628 reaim.time.voluntary_context_switches
7647401 ? 24% -39.7% 4609658 cpuidle..usage
27840 ? 21% -34.3% 18280 vmstat.system.in
3736 +4.2% 3894 proc-vmstat.nr_active_anon
6959 +2.2% 7111 proc-vmstat.nr_shmem
3736 +4.2% 3894 proc-vmstat.nr_zone_active_anon
0.33 ? 18% -23.8% 0.25 ? 8% sched_debug.cfs_rq:/.h_nr_running.avg
0.31 ? 14% -20.7% 0.24 ? 7% sched_debug.cfs_rq:/.nr_running.avg
0.00 ? 25% -31.1% 0.00 ? 36% sched_debug.cpu.next_balance.stddev
3.20 ? 4% +0.5 3.71 ? 8% perf-stat.i.cache-miss-rate%
1818 ? 3% -10.2% 1632 ? 6% perf-stat.i.cycles-between-cache-misses
987044 ? 2% -3.8% 949746 perf-stat.i.iTLB-load-misses
442185 ? 8% -10.4% 395984 perf-stat.i.iTLB-loads
5920 ? 3% +3.8% 6144 perf-stat.i.instructions-per-iTLB-miss
0.05 ? 13% -0.0 0.05 perf-stat.overall.dTLB-load-miss-rate%
8510 ? 2% +3.6% 8814 perf-stat.overall.instructions-per-iTLB-miss
983682 ? 2% -3.8% 946402 perf-stat.ps.iTLB-load-misses
440691 ? 8% -10.5% 394589 perf-stat.ps.iTLB-loads
345730 ? 80% -92.5% 25933 ? 7% turbostat.C3
1.73 ? 86% -1.7 0.07 ? 7% turbostat.C3%
5033041 ? 31% -19.9% 4033568 turbostat.C6
59141 ?141% +395.4% 292968 ? 12% turbostat.C8
1.16 ?141% +4.9 6.03 ? 13% turbostat.C8%
24.98 ? 22% -29.1% 17.71 turbostat.CPU%c1
1.14 ? 84% -95.9% 0.05 ? 9% turbostat.CPU%c3
0.66 ?141% +540.9% 4.25 ? 15% turbostat.CPU%c7
20.56 ? 5% -8.4% 18.84 turbostat.CorWatt
8504754 ? 21% -34.5% 5573672 turbostat.IRQ
36.52 ? 29% +31.0% 47.86 turbostat.Pkg%pc3
21.00 ? 5% -8.3% 19.24 turbostat.PkgWatt
13.92 ? 7% -4.9 9.06 ? 2% perf-profile.calltrace.cycles-pp.secondary_startup_64_no_verify
13.08 ? 5% -4.7 8.36 ? 3% perf-profile.calltrace.cycles-pp.do_idle.cpu_startup_entry.secondary_startup_64_no_verify
13.09 ? 5% -4.7 8.37 ? 3% perf-profile.calltrace.cycles-pp.cpu_startup_entry.secondary_startup_64_no_verify
12.92 ? 5% -4.7 8.24 ? 3% perf-profile.calltrace.cycles-pp.cpuidle_idle_call.do_idle.cpu_startup_entry.secondary_startup_64_no_verify
12.45 ? 6% -4.4 8.01 ? 5% perf-profile.calltrace.cycles-pp.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry
11.98 ? 4% -4.3 7.72 ? 3% perf-profile.calltrace.cycles-pp.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry.secondary_startup_64_no_verify
9.04 ? 9% -3.5 5.53 ? 5% perf-profile.calltrace.cycles-pp.intel_idle.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
9.00 ? 9% -3.5 5.52 ? 5% perf-profile.calltrace.cycles-pp.mwait_idle_with_hints.intel_idle.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call
2.68 ? 4% -0.8 1.88 ? 6% perf-profile.calltrace.cycles-pp.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle
2.40 ? 2% -0.7 1.74 ? 6% perf-profile.calltrace.cycles-pp.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call
2.34 ? 21% -0.6 1.72 ? 17% perf-profile.calltrace.cycles-pp.call_console_drivers.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write.cold
2.34 ? 21% -0.6 1.72 ? 17% perf-profile.calltrace.cycles-pp.console_unlock.vprintk_emit.devkmsg_emit.devkmsg_write.cold.new_sync_write
2.49 ? 11% -0.6 1.90 ? 11% perf-profile.calltrace.cycles-pp.drm_fb_helper_damage_blit_real.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread
2.50 ? 11% -0.6 1.90 ? 11% perf-profile.calltrace.cycles-pp.drm_fb_helper_damage_work.process_one_work.worker_thread.kthread.ret_from_fork
2.52 ? 11% -0.6 1.93 ? 10% perf-profile.calltrace.cycles-pp.process_one_work.worker_thread.kthread.ret_from_fork
2.44 ? 11% -0.6 1.85 ? 11% perf-profile.calltrace.cycles-pp.memcpy_toio.drm_fb_helper_damage_blit_real.drm_fb_helper_damage_work.process_one_work.worker_thread
2.52 ? 11% -0.6 1.94 ? 10% perf-profile.calltrace.cycles-pp.worker_thread.kthread.ret_from_fork
2.97 ? 11% -0.6 2.40 ? 9% perf-profile.calltrace.cycles-pp.ret_from_fork
2.96 ? 11% -0.6 2.40 ? 9% perf-profile.calltrace.cycles-pp.kthread.ret_from_fork
1.61 ? 3% -0.5 1.11 ? 5% perf-profile.calltrace.cycles-pp.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter
1.49 ? 3% -0.4 1.05 ? 6% perf-profile.calltrace.cycles-pp.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state
1.14 ? 5% -0.3 0.87 ? 4% perf-profile.calltrace.cycles-pp.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt
4.69 ? 3% +0.3 5.00 perf-profile.calltrace.cycles-pp.div_long
0.90 ? 29% +0.4 1.34 ? 7% perf-profile.calltrace.cycles-pp.copy_process.kernel_clone.__do_sys_clone.do_syscall_64.entry_SYSCALL_64_after_hwframe
12.40 ? 5% +1.4 13.85 ? 2% perf-profile.calltrace.cycles-pp.string_rtns_1
13.92 ? 7% -4.9 9.06 ? 2% perf-profile.children.cycles-pp.do_idle
13.92 ? 7% -4.9 9.06 ? 2% perf-profile.children.cycles-pp.secondary_startup_64_no_verify
13.92 ? 7% -4.9 9.06 ? 2% perf-profile.children.cycles-pp.cpu_startup_entry
13.76 ? 7% -4.8 8.94 ? 2% perf-profile.children.cycles-pp.cpuidle_idle_call
12.74 ? 6% -4.4 8.37 ? 3% perf-profile.children.cycles-pp.cpuidle_enter_state
12.74 ? 6% -4.4 8.37 ? 3% perf-profile.children.cycles-pp.cpuidle_enter
9.14 ? 7% -3.4 5.73 ? 3% perf-profile.children.cycles-pp.intel_idle
9.10 ? 7% -3.4 5.71 ? 3% perf-profile.children.cycles-pp.mwait_idle_with_hints
3.42 ? 5% -0.8 2.58 ? 6% perf-profile.children.cycles-pp.asm_sysvec_apic_timer_interrupt
3.02 ? 3% -0.7 2.34 ? 6% perf-profile.children.cycles-pp.sysvec_apic_timer_interrupt
2.49 ? 11% -0.6 1.90 ? 11% perf-profile.children.cycles-pp.memcpy_toio
2.49 ? 11% -0.6 1.90 ? 11% perf-profile.children.cycles-pp.drm_fb_helper_damage_blit_real
2.50 ? 11% -0.6 1.90 ? 11% perf-profile.children.cycles-pp.drm_fb_helper_damage_work
2.52 ? 11% -0.6 1.93 ? 10% perf-profile.children.cycles-pp.process_one_work
2.52 ? 11% -0.6 1.94 ? 10% perf-profile.children.cycles-pp.worker_thread
2.96 ? 11% -0.6 2.40 ? 9% perf-profile.children.cycles-pp.kthread
3.00 ? 11% -0.6 2.44 ? 9% perf-profile.children.cycles-pp.ret_from_fork
1.99 ? 2% -0.5 1.45 ? 5% perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
1.85 -0.5 1.38 ? 5% perf-profile.children.cycles-pp.hrtimer_interrupt
0.81 ? 19% -0.4 0.44 ? 7% perf-profile.children.cycles-pp.menu_select
1.44 ? 3% -0.3 1.14 ? 5% perf-profile.children.cycles-pp.__hrtimer_run_queues
0.41 ? 5% -0.1 0.32 ? 14% perf-profile.children.cycles-pp.io_serial_out
0.12 ? 27% -0.1 0.06 ? 36% perf-profile.children.cycles-pp.rcu_idle_exit
0.16 ? 16% -0.1 0.10 ? 17% perf-profile.children.cycles-pp.clockevents_program_event
0.09 ? 25% -0.1 0.04 ? 69% perf-profile.children.cycles-pp.native_apic_msr_eoi_write
0.14 ? 22% -0.0 0.09 ? 13% perf-profile.children.cycles-pp.enqueue_hrtimer
0.15 ? 10% -0.0 0.11 ? 19% perf-profile.children.cycles-pp.sched_clock_cpu
0.08 ? 14% -0.0 0.04 ? 69% perf-profile.children.cycles-pp.cpuidle_governor_latency_req
0.12 ? 23% -0.0 0.08 ? 15% perf-profile.children.cycles-pp.timerqueue_add
4.70 ? 3% +0.3 5.00 perf-profile.children.cycles-pp.div_long
12.46 ? 5% +1.5 13.91 ? 2% perf-profile.children.cycles-pp.string_rtns_1
9.10 ? 7% -3.4 5.71 ? 3% perf-profile.self.cycles-pp.mwait_idle_with_hints
2.46 ? 11% -0.6 1.84 ? 10% perf-profile.self.cycles-pp.memcpy_toio
0.42 ? 29% -0.2 0.20 ? 13% perf-profile.self.cycles-pp.menu_select
0.40 ? 23% -0.2 0.24 ? 15% perf-profile.self.cycles-pp.cpuidle_enter_state
0.41 ? 5% -0.1 0.32 ? 14% perf-profile.self.cycles-pp.io_serial_out
0.09 ? 25% -0.1 0.04 ? 69% perf-profile.self.cycles-pp.native_apic_msr_eoi_write
0.08 ? 19% -0.0 0.04 ? 52% perf-profile.self.cycles-pp.cpuidle_idle_call
4.67 ? 3% +0.3 4.98 perf-profile.self.cycles-pp.div_long
12.36 ? 5% +1.4 13.78 ? 2% perf-profile.self.cycles-pp.string_rtns_1



Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


--
0-DAY CI Kernel Test Service
https://01.org/lkp



Attachments:
config-5.18.0-rc1-00005-gae44f2177fa7 (165.11 kB)
job-script (7.84 kB)
job.yaml (5.32 kB)
reproduce (13.96 kB)