2021-08-23 11:18:00

by Valentin Schneider

Subject: [PATCH v3 0/2] sched/fair: nohz.next_balance vs newly-idle CPUs

Hi folks,

This was caught by our testing on an arm64 RB5 board - that's an 8-CPU
DynamIQ SoC with 4 littles, 3 mediums and 1 big. It seems to rely more on
NOHZ balancing than the other boards we test, which highlighted that not
folding a newly-idle CPU's rq->next_balance into nohz.next_balance can
cause issues (especially when the other CPUs have had their
balance_interval inflated by pinned tasks).

As suggested by Vincent, the approach here is to mimic what was done for
nohz.has_blocked, which gives us sane(ish) ordering guarantees.
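
For reference, the ordering pattern this mimics boils down to roughly the
below (heavily simplified from the hunks in patch 2; the real code also
handles has_blocked and the blocked load pass):

  /* Idle entry, nohz_balance_enter_idle(), simplified: */
  cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
  smp_mb__after_atomic();            /* mask store before flag store */
  WRITE_ONCE(nohz.needs_update, 1);

  /* ILB side, _nohz_idle_balance(), simplified: */
  if (flags & NOHZ_NEXT_KICK)
          WRITE_ONCE(nohz.needs_update, 0);
  smp_mb();                          /* flag clear before mask scan */
  for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
          /* collate cpu_rq(balance_cpu)->next_balance */
  }

i.e. either the ILB pass sees the new CPU in idle_cpus_mask, or that CPU's
needs_update store survives the clear and triggers a later kick.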

Revisions
=========

v2 -> v3
++++++++

o Rebased against latest tip/sched/core: 234b8ab6476c ("sched: Introduce
dl_task_check_affinity() to check proposed affinity")

o Kept NOHZ_NEXT_KICK in NOHZ_KICK_MASK, but changed nohz_balancer_kick() to
issue kicks with NOHZ_STATS_KICK | NOHZ_BALANCE_KICK instead (Dietmar)
o Added missing NOHZ_STATS_KICK gate for nohz.next_blocked update (Vincent)

v1 -> v2
++++++++

o Ditched the extra cpumasks and went with a sibling of nohz.has_blocked
(Vincent)

Cheers,
Valentin

Valentin Schneider (2):
sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
sched/fair: Trigger nohz.next_balance updates when a CPU goes
NOHZ-idle

kernel/sched/fair.c | 39 +++++++++++++++++++++++++++------------
kernel/sched/sched.h | 8 +++++++-
2 files changed, 34 insertions(+), 13 deletions(-)

--
2.25.1


2021-08-23 11:18:00

by Valentin Schneider

Subject: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
---
kernel/sched/fair.c | 24 ++++++++++++++----------
kernel/sched/sched.h | 8 +++++++-
2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6cd05f1d77ef..4a91f3027c92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;

if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}

@@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
* on.
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}

@@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
goto abort;
}

rq = cpu_rq(balance_cpu);

- has_blocked_load |= update_nohz_stats(rq);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);

/*
* If time for next balance is due,
@@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;

- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));

abort:
/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e7e2bba5b520..30b7bd2ef25d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3

+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)

-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)

--
2.25.1

2021-08-23 11:18:10

by Valentin Schneider

Subject: [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

Consider a system with some NOHZ-idle CPUs, such that

nohz.idle_cpus_mask = S
nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

nohz.idle_cpus_mask = S \U {k}
nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.
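
For context, nohz_balancer_kick() bails out early when nohz.next_balance
is still in the future, before it ever evaluates the conditions that would
raise NOHZ_BALANCE_KICK - roughly (simplified sketch of the current code):

  if (likely(!atomic_read(&nohz.nr_cpus)))
          return;

  if (READ_ONCE(nohz.has_blocked) &&
      time_after(now, READ_ONCE(nohz.next_blocked)))
          flags = NOHZ_STATS_KICK;

  if (time_before(now, nohz.next_balance))
          goto out;                /* stale value => no balance kick */

so a stale nohz.next_balance is self-perpetuating; hence the needs_update
check below is hooked at the out: label, past that early bail.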

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <[email protected]>
---
kernel/sched/fair.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a91f3027c92..081a9e54058a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5754,6 +5754,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();

set_cpu_sd_state_idle(cpu);

+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}

--
2.25.1

2021-08-23 13:00:08

by Valentin Schneider

Subject: Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

On 23/08/21 13:59, Peter Zijlstra wrote:
> On Mon, Aug 23, 2021 at 12:16:59PM +0100, Valentin Schneider wrote:
>
>> Gate NOHZ blocked load
>> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
>> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
>> expected.
>
>> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> * setting the flag, we are sure to not clear the state and not
>> * check the load of an idle cpu.
>> */
>> - WRITE_ONCE(nohz.has_blocked, 0);
>> + if (flags & NOHZ_STATS_KICK)
>> + WRITE_ONCE(nohz.has_blocked, 0);
>>
>> /*
>> * Ensures that if we miss the CPU, we must see the has_blocked
>> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> * balancing owner will pick it up.
>> */
>> if (need_resched()) {
>> - has_blocked_load = true;
>> + if (flags & NOHZ_STATS_KICK)
>> + has_blocked_load = true;
>> goto abort;
>> }
>>
>> rq = cpu_rq(balance_cpu);
>>
>> - has_blocked_load |= update_nohz_stats(rq);
>> + if (flags & NOHZ_STATS_KICK)
>> + has_blocked_load |= update_nohz_stats(rq);
>>
>> /*
>> * If time for next balance is due,
>> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> if (likely(update_next_balance))
>> nohz.next_balance = next_balance;
>>
>> - WRITE_ONCE(nohz.next_blocked,
>> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> + if (flags & NOHZ_STATS_KICK)
>> + WRITE_ONCE(nohz.next_blocked,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>
>> abort:
>> /* There is still blocked load, enable periodic update */
>
> I'm a bit puzzled by this; that function has:
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> Which:
>
> - isn't updated
> - implies STATS must be set when BALANCE

Yup

>
> the latter gives rise to my confusion; why add that gate on STATS? It
> just doesn't make sense to do a BALANCE and not update STATS.

AFAIA that warning was only there to catch BALANCE && !STATS, so I didn't
tweak it.

Now, you could still end up with

flags == NOHZ_NEXT_KICK

(e.g. nohz.next_balance is in the future, but a new CPU entered NOHZ-idle
and needs its own rq.next_balance collated into the nohz struct)

in which case you don't do any blocked load update, hence the
gate. In v1 I had that piggyback on NOHZ_STATS_KICK, but Vincent noted
that might not be the best idea given blocked load updates can be
time-consuming - hence the separate flag.
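
IOW with a NOHZ_NEXT_KICK-only kick, the ILB loop boils down to just
collating next_balance values - a rough sketch (abort/rebalance paths
omitted):

  for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
          rq = cpu_rq(balance_cpu);

          /* !NOHZ_STATS_KICK: update_nohz_stats() skipped */
          /* !NOHZ_BALANCE_KICK: rebalance_domains() skipped */

          if (time_after(next_balance, rq->next_balance)) {
                  next_balance = rq->next_balance;
                  update_next_balance = 1;
          }
  }

  if (likely(update_next_balance))
          nohz.next_balance = next_balance;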

2021-08-23 13:54:33

by Dietmar Eggemann

Subject: Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

On 23/08/2021 14:57, Valentin Schneider wrote:
> On 23/08/21 13:59, Peter Zijlstra wrote:
>> On Mon, Aug 23, 2021 at 12:16:59PM +0100, Valentin Schneider wrote:
>>
>>> Gate NOHZ blocked load
>>> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
>>> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
>>> expected.
>>
>>> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> * setting the flag, we are sure to not clear the state and not
>>> * check the load of an idle cpu.
>>> */
>>> - WRITE_ONCE(nohz.has_blocked, 0);
>>> + if (flags & NOHZ_STATS_KICK)
>>> + WRITE_ONCE(nohz.has_blocked, 0);
>>>
>>> /*
>>> * Ensures that if we miss the CPU, we must see the has_blocked
>>> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> * balancing owner will pick it up.
>>> */
>>> if (need_resched()) {
>>> - has_blocked_load = true;
>>> + if (flags & NOHZ_STATS_KICK)
>>> + has_blocked_load = true;
>>> goto abort;
>>> }
>>>
>>> rq = cpu_rq(balance_cpu);
>>>
>>> - has_blocked_load |= update_nohz_stats(rq);
>>> + if (flags & NOHZ_STATS_KICK)
>>> + has_blocked_load |= update_nohz_stats(rq);
>>>
>>> /*
>>> * If time for next balance is due,
>>> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> if (likely(update_next_balance))
>>> nohz.next_balance = next_balance;
>>>
>>> - WRITE_ONCE(nohz.next_blocked,
>>> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>> + if (flags & NOHZ_STATS_KICK)
>>> + WRITE_ONCE(nohz.next_blocked,
>>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>
>>> abort:
>>> /* There is still blocked load, enable periodic update */
>>
>> I'm a bit puzzled by this; that function has:
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> Which:
>>
>> - isn't updated
>> - implies STATS must be set when BALANCE
>
> Yup
>
>>
>> the latter gives rise to my confusion; why add that gate on STATS? It
>> just doesn't make sense to do a BALANCE and not update STATS.
>
> AFAIA that warning was only there to catch BALANCE && !STATS, so I didn't
> tweak it.
>
> Now, you could still end up with
>
> flags == NOHZ_NEXT_KICK
>
> (e.g. nohz.next_balance is in the future, but a new CPU entered NOHZ-idle
> and needs its own rq.next_balance collated into the nohz struct)
>
> in which case you don't do any blocked load update, hence the
> gate. In v1 I had that piggyback on NOHZ_STATS_KICK, but Vincent noted
> that might not be the best given blocked load updates can be time
> consuming - hence the separate flag.

Maybe the confusion stems from the fact that the changes which actually
set NOHZ_NEXT_KICK are only introduced in 2/2?

@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+

@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)

...

+ WRITE_ONCE(nohz.needs_update, 1);

2021-08-24 09:12:51

by Vincent Guittot

Subject: Re: [PATCH v3 1/2] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

On Mon, 23 Aug 2021 at 13:17, Valentin Schneider
<[email protected]> wrote:
>
> A following patch will trigger NOHZ idle balances as a means to update
> nohz.next_balance. Vincent noted that blocked load updates can have
> non-negligible overhead, which should be avoided if the intent is to only
> update nohz.next_balance.
>
> Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
> expected.
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Valentin Schneider <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/fair.c | 24 ++++++++++++++----------
> kernel/sched/sched.h | 8 +++++++-
> 2 files changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6cd05f1d77ef..4a91f3027c92 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
> goto out;
>
> if (rq->nr_running >= 2) {
> - flags = NOHZ_KICK_MASK;
> + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> goto out;
> }
>
> @@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
> * on.
> */
> if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
> - flags = NOHZ_KICK_MASK;
> + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> goto unlock;
> }
> }
> @@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
> */
> for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
> if (sched_asym_prefer(i, cpu)) {
> - flags = NOHZ_KICK_MASK;
> + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> goto unlock;
> }
> }
> @@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
> * to run the misfit task on.
> */
> if (check_misfit_status(rq, sd)) {
> - flags = NOHZ_KICK_MASK;
> + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> goto unlock;
> }
>
> @@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
> */
> nr_busy = atomic_read(&sds->nr_busy_cpus);
> if (nr_busy > 1) {
> - flags = NOHZ_KICK_MASK;
> + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> goto unlock;
> }
> }
> @@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * setting the flag, we are sure to not clear the state and not
> * check the load of an idle cpu.
> */
> - WRITE_ONCE(nohz.has_blocked, 0);
> + if (flags & NOHZ_STATS_KICK)
> + WRITE_ONCE(nohz.has_blocked, 0);
>
> /*
> * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * balancing owner will pick it up.
> */
> if (need_resched()) {
> - has_blocked_load = true;
> + if (flags & NOHZ_STATS_KICK)
> + has_blocked_load = true;
> goto abort;
> }
>
> rq = cpu_rq(balance_cpu);
>
> - has_blocked_load |= update_nohz_stats(rq);
> + if (flags & NOHZ_STATS_KICK)
> + has_blocked_load |= update_nohz_stats(rq);
>
> /*
> * If time for next balance is due,
> @@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> if (likely(update_next_balance))
> nohz.next_balance = next_balance;
>
> - WRITE_ONCE(nohz.next_blocked,
> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> + if (flags & NOHZ_STATS_KICK)
> + WRITE_ONCE(nohz.next_blocked,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
> abort:
> /* There is still blocked load, enable periodic update */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e7e2bba5b520..30b7bd2ef25d 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
> #define NOHZ_BALANCE_KICK_BIT 0
> #define NOHZ_STATS_KICK_BIT 1
> #define NOHZ_NEWILB_KICK_BIT 2
> +#define NOHZ_NEXT_KICK_BIT 3
>
> +/* Run rebalance_domains() */
> #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
> +/* Update blocked load */
> #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
> +/* Update blocked load when entering idle */
> #define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
> +/* Update nohz.next_balance */
> +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
>
> -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
>
> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
>
> --
> 2.25.1
>

2021-08-24 09:13:54

by Vincent Guittot

Subject: Re: [PATCH v3 2/2] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

On Mon, 23 Aug 2021 at 13:17, Valentin Schneider
<[email protected]> wrote:
>
> Consider a system with some NOHZ-idle CPUs, such that
>
> nohz.idle_cpus_mask = S
> nohz.next_balance = T
>
> When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
> with:
>
> nohz.idle_cpus_mask = S \U {k}
> nohz.next_balance = T
>
> Note that the nohz.next_balance hasn't changed - it won't be updated until
> a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
> has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:
>
> cpu_rq(k).next_balance < nohz.next_balance
>
> In such scenarios, the existing nohz.next_balance will prevent any NOHZ
> balance from happening, which itself will prevent nohz.next_balance from
> being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
> delays of over 12ms caused by this were observed on an arm64 RB5 board.
>
> Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
> that need their rq->next_balance to be collated into
> nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.
>
> Signed-off-by: Valentin Schneider <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/fair.c | 15 +++++++++++++--
> 1 file changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4a91f3027c92..081a9e54058a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5754,6 +5754,7 @@ static struct {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> int has_blocked; /* Idle CPUS has blocked load */
> + int needs_update; /* Newly idle CPUs need their next_balance collated */
> unsigned long next_balance; /* in jiffy units */
> unsigned long next_blocked; /* Next update of blocked load in jiffies */
> } nohz ____cacheline_aligned;
> @@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
> unlock:
> rcu_read_unlock();
> out:
> + if (READ_ONCE(nohz.needs_update))
> + flags |= NOHZ_NEXT_KICK;
> +
> if (flags)
> kick_ilb(flags);
> }
> @@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
> /*
> * Ensures that if nohz_idle_balance() fails to observe our
> * @idle_cpus_mask store, it must observe the @has_blocked
> - * store.
> + * and @needs_update stores.
> */
> smp_mb__after_atomic();
>
> set_cpu_sd_state_idle(cpu);
>
> + WRITE_ONCE(nohz.needs_update, 1);
> out:
> /*
> * Each time a cpu enter idle, we assume that it has blocked load and
> @@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> /*
> * We assume there will be no idle load after this update and clear
> * the has_blocked flag. If a cpu enters idle in the mean time, it will
> - * set the has_blocked flag and trig another update of idle load.
> + * set the has_blocked flag and trigger another update of idle load.
> * Because a cpu that becomes idle, is added to idle_cpus_mask before
> * setting the flag, we are sure to not clear the state and not
> * check the load of an idle cpu.
> + *
> + * Same applies to idle_cpus_mask vs needs_update.
> */
> if (flags & NOHZ_STATS_KICK)
> WRITE_ONCE(nohz.has_blocked, 0);
> + if (flags & NOHZ_NEXT_KICK)
> + WRITE_ONCE(nohz.needs_update, 0);
>
> /*
> * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> if (need_resched()) {
> if (flags & NOHZ_STATS_KICK)
> has_blocked_load = true;
> + if (flags & NOHZ_NEXT_KICK)
> + WRITE_ONCE(nohz.needs_update, 1);
> goto abort;
> }
>
> --
> 2.25.1
>

Subject: [tip: sched/core] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 013ce5ed58f799a2f035b732f904f6ebd8e8d881
Gitweb: https://git.kernel.org/tip/013ce5ed58f799a2f035b732f904f6ebd8e8d881
Author: Valentin Schneider <[email protected]>
AuthorDate: Mon, 23 Aug 2021 12:16:59 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 09 Sep 2021 11:27:29 +02:00

sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 24 ++++++++++++++----------
kernel/sched/sched.h | 8 +++++++-
2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b3e859..48ce754 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10342,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;

if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}

@@ -10356,7 +10356,7 @@ static void nohz_balancer_kick(struct rq *rq)
* on.
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10370,7 +10370,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10383,7 +10383,7 @@ static void nohz_balancer_kick(struct rq *rq)
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}

@@ -10410,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10572,7 +10572,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10594,13 +10595,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
goto abort;
}

rq = cpu_rq(balance_cpu);

- has_blocked_load |= update_nohz_stats(rq);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);

/*
* If time for next balance is due,
@@ -10631,8 +10634,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;

- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));

abort:
/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e7e2bba..30b7bd2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2706,12 +2706,18 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3

+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)

-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)

Subject: [tip: sched/core] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

The following commit has been merged into the sched/core branch of tip:

Commit-ID: df100a6682d3d9d4b7cbb531a3f783035732ba92
Gitweb: https://git.kernel.org/tip/df100a6682d3d9d4b7cbb531a3f783035732ba92
Author: Valentin Schneider <[email protected]>
AuthorDate: Mon, 23 Aug 2021 12:17:00 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 09 Sep 2021 11:27:30 +02:00

sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

Consider a system with some NOHZ-idle CPUs, such that

nohz.idle_cpus_mask = S
nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

nohz.idle_cpus_mask = S \U {k}
nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48ce754..2a5efde 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5754,6 +5754,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -10417,6 +10418,9 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10513,12 +10517,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();

set_cpu_sd_state_idle(cpu);

+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10567,13 +10572,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10597,6 +10606,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}

Subject: [tip: sched/core] sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 7fd7a9e0caba10829b4f8db1aa7711b558681fd4
Gitweb: https://git.kernel.org/tip/7fd7a9e0caba10829b4f8db1aa7711b558681fd4
Author: Valentin Schneider <[email protected]>
AuthorDate: Mon, 23 Aug 2021 12:17:00 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 05 Oct 2021 15:51:31 +02:00

sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle

Consider a system with some NOHZ-idle CPUs, such that

nohz.idle_cpus_mask = S
nohz.next_balance = T

When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:

nohz.idle_cpus_mask = S \U {k}
nohz.next_balance = T

Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:

cpu_rq(k).next_balance < nohz.next_balance

In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.

Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.

Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f4de7f5..6cc958e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5787,6 +5787,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -10450,6 +10451,9 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10546,12 +10550,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();

set_cpu_sd_state_idle(cpu);

+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10600,13 +10605,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10630,6 +10639,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}

Subject: [tip: sched/core] sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

The following commit has been merged into the sched/core branch of tip:

Commit-ID: efd984c481abb516fab8bafb25bf41fd9397a43c
Gitweb: https://git.kernel.org/tip/efd984c481abb516fab8bafb25bf41fd9397a43c
Author: Valentin Schneider <[email protected]>
AuthorDate: Mon, 23 Aug 2021 12:16:59 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 05 Oct 2021 15:51:30 +02:00

sched/fair: Add NOHZ balancer flag for nohz.next_balance updates

A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.

Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.

Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 24 ++++++++++++++----------
kernel/sched/sched.h | 8 +++++++-
2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6a05d9..f4de7f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10375,7 +10375,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;

if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}

@@ -10389,7 +10389,7 @@ static void nohz_balancer_kick(struct rq *rq)
* on.
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10403,7 +10403,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10416,7 +10416,7 @@ static void nohz_balancer_kick(struct rq *rq)
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}

@@ -10443,7 +10443,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10605,7 +10605,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10627,13 +10628,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
goto abort;
}

rq = cpu_rq(balance_cpu);

- has_blocked_load |= update_nohz_stats(rq);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);

/*
* If time for next balance is due,
@@ -10664,8 +10667,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;

- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));

abort:
/* There is still blocked load, enable periodic update */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e579..1fec313 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2709,12 +2709,18 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3

+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)

-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)