Hi folks,
The single patch has grown a sibling, and a cover letter along with it.
This was caught up by our testing on an arm64 RB5 board - that's an 8 CPUs
DynamIQ SoC with 4 littles, 3 mediums and 1 big. It seems to rely more on NOHZ
balancing than our other boards being tested, which highlighted that not
including a newly-idle CPU into nohz.next_balance can cause issues (especially
when the other CPUs have had their balance_interval inflated by pinned tasks).
As suggested by Vincent, the approach here is to mimic what was done for
nohz.has_blocked, which gives us sane(ish) ordering guarantees.
Revisions
=========
v1 -> v2
++++++++
o Ditched the extra cpumasks and went with a sibling of nohz.has_blocked
(Vincent)
Cheers,
Valentin
Valentin Schneider (2):
sched/fair: Add NOHZ balancer flag for nohz.next_balance updates
sched/fair: Trigger nohz.next_balance updates when a CPU goes
NOHZ-idle
kernel/sched/fair.c | 24 +++++++++++++++++++-----
kernel/sched/sched.h | 8 +++++++-
2 files changed, 26 insertions(+), 6 deletions(-)
--
2.25.1
A following patch will trigger NOHZ idle balances as a means to update
nohz.next_balance. Vincent noted that blocked load updates can have
non-negligible overhead, which should be avoided if the intent is to only
update nohz.next_balance.
Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
expected.
Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
---
kernel/sched/fair.c | 9 ++++++---
kernel/sched/sched.h | 8 +++++++-
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11d22943753f..5c88698c3664 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10506,7 +10506,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10528,13 +10529,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
goto abort;
}
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a1c6aeb9165..b0f38b5d2550 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2695,12 +2695,18 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3
+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
--
2.25.1
Consider a system with some NOHZ-idle CPUs, such that
nohz.idle_cpus_mask = S
nohz.next_balance = T
When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up
with:
nohz.idle_cpus_mask = S \U {k}
nohz.next_balance = T
Note that the nohz.next_balance hasn't changed - it won't be updated until
a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU
has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if:
cpu_rq(k).next_balance < nohz.next_balance
In such scenarios, the existing nohz.next_balance will prevent any NOHZ
balance from happening, which itself will prevent nohz.next_balance from
being updated to this new cpu_rq(k).next_balance. Unnecessary load balance
delays of over 12ms caused by this were observed on an arm64 RB5 board.
Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs
that need their rq->next_balance to be collated into
nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set.
Signed-off-by: Valentin Schneider <[email protected]>
---
kernel/sched/fair.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c88698c3664..b5a4ea7715b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5698,6 +5698,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -10351,6 +10352,9 @@ static void nohz_balancer_kick(struct rq *rq)
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10447,12 +10451,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10501,13 +10506,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10531,6 +10540,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}
--
2.25.1
On 19/07/2021 12:31, Valentin Schneider wrote:
[...]
> @@ -10351,6 +10352,9 @@ static void nohz_balancer_kick(struct rq *rq)
> unlock:
> rcu_read_unlock();
> out:
> + if (READ_ONCE(nohz.needs_update))
> + flags |= NOHZ_NEXT_KICK;
> +
Since NOHZ_NEXT_KICK is part of NOHZ_KICK_MASK, some conditions above
will already set it in flags. Is this intended?
> if (flags)
> kick_ilb(flags);
> }
> @@ -10447,12 +10451,13 @@ void nohz_balance_enter_idle(int cpu)
> /*
> * Ensures that if nohz_idle_balance() fails to observe our
> * @idle_cpus_mask store, it must observe the @has_blocked
> - * store.
> + * and @needs_update stores.
> */
> smp_mb__after_atomic();
>
> set_cpu_sd_state_idle(cpu);
>
> + WRITE_ONCE(nohz.needs_update, 1);
> out:
> /*
> * Each time a cpu enter idle, we assume that it has blocked load and
> @@ -10501,13 +10506,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
function header would need update to incorporate the new 'update
nohz.next_balance' functionality. It only talks about 'update of blocked
load' and 'complete load balance' so far.
> /*
> * We assume there will be no idle load after this update and clear
> * the has_blocked flag. If a cpu enters idle in the mean time, it will
> - * set the has_blocked flag and trig another update of idle load.
> + * set the has_blocked flag and trigger another update of idle load.
> * Because a cpu that becomes idle, is added to idle_cpus_mask before
> * setting the flag, we are sure to not clear the state and not
> * check the load of an idle cpu.
> + *
> + * Same applies to idle_cpus_mask vs needs_update.
> */
> if (flags & NOHZ_STATS_KICK)
> WRITE_ONCE(nohz.has_blocked, 0);
> + if (flags & NOHZ_NEXT_KICK)
> + WRITE_ONCE(nohz.needs_update, 0);
>
> /*
> * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10531,6 +10540,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> if (need_resched()) {
> if (flags & NOHZ_STATS_KICK)
> has_blocked_load = true;
This looks weird now? 'has_blocked_load = true' vs
'WRITE_ONCE(nohz.needs_update, 1)'.
> + if (flags & NOHZ_NEXT_KICK)
> + WRITE_ONCE(nohz.needs_update, 1);
> goto abort;
> }
>
>
On 19/07/21 17:24, Dietmar Eggemann wrote:
> On 19/07/2021 12:31, Valentin Schneider wrote:
>
> [...]
>
>> @@ -10351,6 +10352,9 @@ static void nohz_balancer_kick(struct rq *rq)
>> unlock:
>> rcu_read_unlock();
>> out:
>> + if (READ_ONCE(nohz.needs_update))
>> + flags |= NOHZ_NEXT_KICK;
>> +
>
> Since NOHZ_NEXT_KICK is part of NOHZ_KICK_MASK, some conditions above
> will already set it in flags. Is this intended?
So if no kick would be issued (e.g. flags == 0 because nohz.next_balance is
later in the future), then this does the right thing and issues a
NOHZ_NEXT_KICK one.
However you're right to point out that even if nohz.needs_update is false,
we can set NOHZ_NEXT_KICK into the ilb rq's NOHZ flags due to it being
included in NOHZ_KICK_MASK, which I think is a mistake. Looking at it now,
it shouldn't be part of NOHZ_KICK_MASK.
>
>> if (flags)
>> kick_ilb(flags);
>> }
>> @@ -10447,12 +10451,13 @@ void nohz_balance_enter_idle(int cpu)
>> /*
>> * Ensures that if nohz_idle_balance() fails to observe our
>> * @idle_cpus_mask store, it must observe the @has_blocked
>> - * store.
>> + * and @needs_update stores.
>> */
>> smp_mb__after_atomic();
>>
>> set_cpu_sd_state_idle(cpu);
>>
>> + WRITE_ONCE(nohz.needs_update, 1);
>> out:
>> /*
>> * Each time a cpu enter idle, we assume that it has blocked load and
>> @@ -10501,13 +10506,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>
> function header would need update to incorporate the new 'update
> nohz.next_balance' functionality. It only talks about 'update of blocked
> load' and 'complete load balance' so far.
>
>> /*
>> * We assume there will be no idle load after this update and clear
>> * the has_blocked flag. If a cpu enters idle in the mean time, it will
>> - * set the has_blocked flag and trig another update of idle load.
>> + * set the has_blocked flag and trigger another update of idle load.
>> * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> * setting the flag, we are sure to not clear the state and not
>> * check the load of an idle cpu.
>> + *
>> + * Same applies to idle_cpus_mask vs needs_update.
>> */
>> if (flags & NOHZ_STATS_KICK)
>> WRITE_ONCE(nohz.has_blocked, 0);
>> + if (flags & NOHZ_NEXT_KICK)
>> + WRITE_ONCE(nohz.needs_update, 0);
>>
>> /*
>> * Ensures that if we miss the CPU, we must see the has_blocked
>> @@ -10531,6 +10540,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> if (need_resched()) {
>> if (flags & NOHZ_STATS_KICK)
>> has_blocked_load = true;
>
> This looks weird now? 'has_blocked_load = true' vs
> 'WRITE_ONCE(nohz.needs_update, 1)'.
>
Well, has_blocked_load lets us factorize the nohz.has_blocked write
(one is needed either when aborting or at the tail of the cpumask
iteration), whereas there is just a single write for nohz.needs_update
(when aborting).
>> + if (flags & NOHZ_NEXT_KICK)
>> + WRITE_ONCE(nohz.needs_update, 1);
>> goto abort;
>> }
>>
>>
On 19/07/2021 18:28, Valentin Schneider wrote:
> On 19/07/21 17:24, Dietmar Eggemann wrote:
>> On 19/07/2021 12:31, Valentin Schneider wrote:
[...]
>>> * Ensures that if we miss the CPU, we must see the has_blocked
>>> @@ -10531,6 +10540,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> if (need_resched()) {
>>> if (flags & NOHZ_STATS_KICK)
>>> has_blocked_load = true;
>>
>> This looks weird now? 'has_blocked_load = true' vs
>> 'WRITE_ONCE(nohz.needs_update, 1)'.
>>
>
> Well, has_blocked_load lets us factorize the nohz.has_blocked write
> (one is needed either when aborting or at the tail of the cpumask
> iteration), whereas there is just a single write for nohz.needs_update
> (when aborting).
You're right. Looks good then.
[...]
On Mon, 19 Jul 2021 at 12:31, Valentin Schneider
<[email protected]> wrote:
>
> A following patch will trigger NOHZ idle balances as a means to update
> nohz.next_balance. Vincent noted that blocked load updates can have
> non-negligible overhead, which should be avoided if the intent is to only
> update nohz.next_balance.
>
> Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load
> update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance
> kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is
> expected.
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Valentin Schneider <[email protected]>
> ---
> kernel/sched/fair.c | 9 ++++++---
> kernel/sched/sched.h | 8 +++++++-
> 2 files changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 11d22943753f..5c88698c3664 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10506,7 +10506,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * setting the flag, we are sure to not clear the state and not
> * check the load of an idle cpu.
> */
> - WRITE_ONCE(nohz.has_blocked, 0);
> + if (flags & NOHZ_STATS_KICK)
> + WRITE_ONCE(nohz.has_blocked, 0);
>
> /*
> * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10528,13 +10529,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * balancing owner will pick it up.
> */
> if (need_resched()) {
> - has_blocked_load = true;
> + if (flags & NOHZ_STATS_KICK)
> + has_blocked_load = true;
> goto abort;
> }
>
> rq = cpu_rq(balance_cpu);
>
> - has_blocked_load |= update_nohz_stats(rq);
> + if (flags & NOHZ_STATS_KICK)
> + has_blocked_load |= update_nohz_stats(rq);
>
> /*
> * If time for next balance is due,
You forgot to skip the update of nohz.next_blocked if NOHZ_STATS_KICK
is not set:
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9a1c6aeb9165..b0f38b5d2550 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2695,12 +2695,18 @@ extern void cfs_bandwidth_usage_dec(void);
> #define NOHZ_BALANCE_KICK_BIT 0
> #define NOHZ_STATS_KICK_BIT 1
> #define NOHZ_NEWILB_KICK_BIT 2
> +#define NOHZ_NEXT_KICK_BIT 3
>
> +/* Run rebalance_domains() */
> #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
> +/* Update blocked load */
> #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
> +/* Update blocked load when entering idle */
> #define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
> +/* Update nohz.next_balance */
> +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
>
> -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
>
> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
>
> --
> 2.25.1
>