newidle_balance runs with both preempt and irq disabled, which prevents
local irqs from being handled during this period. The time needed to update the
blocked load of CPUs varies according to the number of cgroups and
extends this critical period to an uncontrolled level.
Remove the update from newidle_balance and trigger a normal ILB that
will take care of the update instead.
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 33 +++++----------------------------
1 file changed, 5 insertions(+), 28 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59b645e3c4fd..bfe1e235fe01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
- env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
-#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
- env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +8971,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
-#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
- WRITE_ONCE(nohz.next_blocked,
- jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
- }
-#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
/*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
+ * Blocked load of idle CPUs need to be updated.
+ * Kick an ILB to update statistics.
*/
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ kick_ilb(NOHZ_STATS_KICK);
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
- nohz_newidle_balance(this_rq);
-
goto out;
}
@@ -10654,6 +10629,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task)
this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
--
2.17.1
On 05/02/21 12:48, Vincent Guittot wrote:
> @@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
> time_before(jiffies, READ_ONCE(nohz.next_blocked)))
> return;
>
I was wondering whether all the conditions above were still relevant. I
think they are, but this one:
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
should have its comment updated to something like:
/*
* Will wake up very soon. Blocked load will be updated
* periodically, no need to wake an idle CPU.
*/
given kick_ilb() isn't the costliest of things.
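For reference, with the patch applied and the comment reworded as above, the
function would read roughly as below. This is only a sketch pieced together
from the hunk context in this thread: any other early bailouts in the actual
function are elided, and the comment wording outside the quoted parts is
illustrative rather than verbatim.

        static void nohz_newidle_balance(struct rq *this_rq)
        {
        	/* ... other early bailouts in the real function elided ... */

        	/*
        	 * Will wake up very soon. Blocked load will be updated
        	 * periodically, no need to wake an idle CPU.
        	 */
        	if (this_rq->avg_idle < sysctl_sched_migration_cost)
        		return;

        	/* No blocked load pending, or the periodic update isn't due yet */
        	if (!READ_ONCE(nohz.has_blocked) ||
        	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
        		return;

        	/*
        	 * Blocked load of idle CPUs need to be updated.
        	 * Kick an ILB to update statistics.
        	 */
        	kick_ilb(NOHZ_STATS_KICK);
        }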
> - raw_spin_unlock(&this_rq->lock);
> /*
> - * This CPU is going to be idle and blocked load of idle CPUs
> - * need to be updated. Run the ilb locally as it is a good
> - * candidate for ilb instead of waking up another idle CPU.
> - * Kick an normal ilb if we failed to do the update.
> + * Blocked load of idle CPUs need to be updated.
> + * Kick an ILB to update statistics.
> */
> - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> - kick_ilb(NOHZ_STATS_KICK);
> - raw_spin_lock(&this_rq->lock);
With this change, the return value of _nohz_idle_balance() is no longer
used. This means we could get rid of the tracking of whether it iterated
over all nohz CPUs or not.
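Purely as an illustration of that cleanup (only the parameter list, visible
from the call being removed above, is taken from the patch; the body is
elided and this is not the actual follow-up change):

        static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
        			       enum cpu_idle_type idle)
        {
        	/*
        	 * Walk nohz.idle_cpus_mask and update blocked load / run the
        	 * ILB as before. The bookkeeping that recorded whether every
        	 * nohz CPU was visited, and the bool return value built from
        	 * it, can be dropped since no caller checks it anymore.
        	 */
        }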
> + kick_ilb(NOHZ_STATS_KICK);
> }
>
> #else /* !CONFIG_NO_HZ_COMMON */
On Tue, 9 Feb 2021 at 14:09, Valentin Schneider
<[email protected]> wrote:
>
> On 05/02/21 12:48, Vincent Guittot wrote:
> > @@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
> > time_before(jiffies, READ_ONCE(nohz.next_blocked)))
> > return;
> >
>
> I was wondering whether all the conditions above were still relevant. I
> think they are, but this one:
>
> /* Will wake up very soon. No time for doing anything else*/
> if (this_rq->avg_idle < sysctl_sched_migration_cost)
> return;
>
> should have its comment updated to something like:
>
> /*
> * Will wake up very soon. Blocked load will be updated
> * periodically, no need to wake an idle CPU.
> */
>
> given kick_ilb() isn't the costliest of things.
>
> > - raw_spin_unlock(&this_rq->lock);
> > /*
> > - * This CPU is going to be idle and blocked load of idle CPUs
> > - * need to be updated. Run the ilb locally as it is a good
> > - * candidate for ilb instead of waking up another idle CPU.
> > - * Kick an normal ilb if we failed to do the update.
> > + * Blocked load of idle CPUs need to be updated.
> > + * Kick an ILB to update statistics.
> > */
> > - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> > - kick_ilb(NOHZ_STATS_KICK);
> > - raw_spin_lock(&this_rq->lock);
>
> With this change, the return value of _nohz_idle_balance() is no longer
> used. This means we could get rid of the tracking of whether it iterated
> over all nohz CPUs or not.
Yeah, the return is useless now
>
> > + kick_ilb(NOHZ_STATS_KICK);
> > }
> >
> > #else /* !CONFIG_NO_HZ_COMMON */
On 05/02/2021 12:48, Vincent Guittot wrote:
> newidle_balance runs with both preempt and irq disabled, which prevents
> local irqs from being handled during this period. The time needed to update the
> blocked load of CPUs varies according to the number of cgroups and
Maybe s/number of cgroups/number of CPU cgroups with non-decayed
cfs_rq's (i.e. cfs_rq within the leaf cfs_rq list)
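To illustrate why the duration scales that way: the blocked load update walks
the rq's leaf cfs_rq list, one entry per CPU cgroup that still has non-decayed
load, so each such cgroup adds one more PELT update to the critical section.
A simplified sketch of that loop, modelled on __update_blocked_fair() but not
the exact code:

        static bool __update_blocked_fair(struct rq *rq, bool *done)
        {
        	struct cfs_rq *cfs_rq, *pos;
        	bool decayed = false;

        	/* One iteration per cfs_rq on this rq's leaf list */
        	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
        		/* Decay this cfs_rq's blocked load/util averages */
        		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
        			decayed = true;

        		/* Fully decayed cfs_rq's don't need to stay on the list */
        		if (cfs_rq_is_decayed(cfs_rq))
        			list_del_leaf_cfs_rq(cfs_rq);
        		else
        			*done = false;	/* blocked load still left to decay */
        	}

        	return decayed;
        }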
> extends this critical period to an uncontrolled level.
>
> Remove the update from newidle_balance and trigger a normal ILB that
> will take care of the update instead.
>
> Signed-off-by: Vincent Guittot <[email protected]>
otherwise, LGTM.
[...]