2017-12-21 10:23:39

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK


Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 4 +--
kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++-----------------
kernel/sched/sched.h | 4 +++
3 files changed, 41 insertions(+), 19 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
{
int cpu = smp_processor_id();

- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
+ if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
return false;

if (idle_cpu(cpu) && !need_resched())
@@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
* We can't run Idle Load Balance on this CPU for this time so we
* cancel it and clear NOHZ_BALANCE_KICK
*/
- atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
return false;
}

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
if (ilb_cpu >= nr_cpu_ids)
return;

- flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
- if (flags & NOHZ_BALANCE_KICK)
+ flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
+ if (flags & NOHZ_KICK_MASK)
return;
/*
* Use smp_send_reschedule() instead of resched_cpu().
@@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
int need_serialize, need_decay = 0;
u64 max_cost = 0;

- update_blocked_averages(cpu);
-
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
@@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- int this_cpu = this_rq->cpu;
- struct rq *rq;
- int balance_cpu;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+ int balance_cpu;
+ struct rq *rq;

- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
- return;
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;

- if (idle != CPU_IDLE)
- goto end;
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+
+ SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
@@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
cpu_load_update_idle(rq);
rq_unlock_irq(rq, &rf);

- rebalance_domains(rq, CPU_IDLE);
+ update_blocked_averages(rq->cpu);
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(rq, CPU_IDLE);
}

if (time_after(next_balance, rq->next_balance)) {
@@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
}
}

+ update_blocked_averages(this_cpu);
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(this_rq, CPU_IDLE);
+
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
@@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
-end:
- atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+
+ return true;
}

/*
@@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
return kick;
}
#else
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ return false;
+}
#endif

/*
@@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
- nohz_idle_balance(this_rq, idle);
+ if (nohz_idle_balance(this_rq, idle))
+ return;
+
+ /* normal load balance */
+ update_blocked_averages(this_rq->cpu);
rebalance_domains(this_rq, idle);
}

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_TICK_STOPPED_BIT 0
#define NOHZ_BALANCE_KICK_BIT 1
+#define NOHZ_STATS_KICK_BIT 2

#define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT)
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)




2017-12-21 16:23:50

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

Hi Peter,

I think that part of the proposal is missing.

One goal of the patchset was to kick an update of the stats of idle
cpu when a task wake up on a cpu but the statistic has not been
updated for a while.

That's why there where a call to nohz_kick_needed in the proposal to
kick ilb but only for updating blocked load and not a full idle load
balance

I can't find this call any more in your patchset

On 21 December 2017 at 11:21, Peter Zijlstra <[email protected]> wrote:
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 4 +--
> kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++-----------------
> kernel/sched/sched.h | 4 +++
> 3 files changed, 41 insertions(+), 19 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
> {
> int cpu = smp_processor_id();
>
> - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
> + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
> return false;
>
> if (idle_cpu(cpu) && !need_resched())
> @@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
> * We can't run Idle Load Balance on this CPU for this time so we
> * cancel it and clear NOHZ_BALANCE_KICK
> */
> - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
> return false;
> }
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
> if (ilb_cpu >= nr_cpu_ids)
> return;
>
> - flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
> - if (flags & NOHZ_BALANCE_KICK)
> + flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
> + if (flags & NOHZ_KICK_MASK)
> return;
> /*
> * Use smp_send_reschedule() instead of resched_cpu().
> @@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
> int need_serialize, need_decay = 0;
> u64 max_cost = 0;
>
> - update_blocked_averages(cpu);
> -
> rcu_read_lock();
> for_each_domain(cpu, sd) {
> /*
> @@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
> * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> * rebalancing for all the cpus for whom scheduler ticks are stopped.
> */
> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> {
> - int this_cpu = this_rq->cpu;
> - struct rq *rq;
> - int balance_cpu;
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> int update_next_balance = 0;
> + int this_cpu = this_rq->cpu;
> + unsigned int flags;
> + int balance_cpu;
> + struct rq *rq;
>
> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
> - return;
> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> + return false;
>
> - if (idle != CPU_IDLE)
> - goto end;
> + if (idle != CPU_IDLE) {
> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + return false;
> + }
> +
> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> +
> + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> @@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - rebalance_domains(rq, CPU_IDLE);
> + update_blocked_averages(rq->cpu);
> + if (flags & NOHZ_BALANCE_KICK)
> + rebalance_domains(rq, CPU_IDLE);
> }
>
> if (time_after(next_balance, rq->next_balance)) {
> @@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
> }
> }
>
> + update_blocked_averages(this_cpu);
> + if (flags & NOHZ_BALANCE_KICK)
> + rebalance_domains(this_rq, CPU_IDLE);
> +
> /*
> * next_balance will be updated only when there is a need.
> * When the CPU is attached to null domain for ex, it will not be
> @@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
> */
> if (likely(update_next_balance))
> nohz.next_balance = next_balance;
> -end:
> - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
> +
> + return true;
> }
>
> /*
> @@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
> return kick;
> }
> #else
> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> + return false;
> +}
> #endif
>
> /*
> @@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
> * load balance only within the local sched_domain hierarchy
> * and abort nohz_idle_balance altogether if we pull some load.
> */
> - nohz_idle_balance(this_rq, idle);
> + if (nohz_idle_balance(this_rq, idle))
> + return;
> +
> + /* normal load balance */
> + update_blocked_averages(this_rq->cpu);
> rebalance_domains(this_rq, idle);
> }
>
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
> #ifdef CONFIG_NO_HZ_COMMON
> #define NOHZ_TICK_STOPPED_BIT 0
> #define NOHZ_BALANCE_KICK_BIT 1
> +#define NOHZ_STATS_KICK_BIT 2
>
> #define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT)
> #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
> +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
> +
> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
>
> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
>
>
>

2017-12-21 16:56:58

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 21 December 2017 at 17:23, Vincent Guittot
<[email protected]> wrote:
> Hi Peter,
>
> I think that part of the proposal is missing.
>
> One goal of the patchset was to kick an update of the stats of idle
> cpu when a task wake up on a cpu but the statistic has not been
> updated for a while.
>
> That's why there where a call to nohz_kick_needed in the proposal to
> kick ilb but only for updating blocked load and not a full idle load

sorry a call to nohz_kick_needed (which becomes nohz_balancer_kick in
yours patchset) in select_task_fair_rq_fair

> balance
>
> I can't find this call any more in your patchset

In fact, we can't only rely on the tick and newly_idle load balance to
ensure a period update of the blocked load because they can never
happen. So we need to find another place to kick for a periodic update
which is when a task wake up


>
> On 21 December 2017 at 11:21, Peter Zijlstra <[email protected]> wrote:
>>
>> Suggested-by: Vincent Guittot <[email protected]>
>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>> ---
>> kernel/sched/core.c | 4 +--
>> kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++-----------------
>> kernel/sched/sched.h | 4 +++
>> 3 files changed, 41 insertions(+), 19 deletions(-)
>>
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -608,7 +608,7 @@ static inline bool got_nohz_idle_kick(vo
>> {
>> int cpu = smp_processor_id();
>>
>> - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_BALANCE_KICK))
>> + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
>> return false;
>>
>> if (idle_cpu(cpu) && !need_resched())
>> @@ -618,7 +618,7 @@ static inline bool got_nohz_idle_kick(vo
>> * We can't run Idle Load Balance on this CPU for this time so we
>> * cancel it and clear NOHZ_BALANCE_KICK
>> */
>> - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(cpu));
>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
>> return false;
>> }
>>
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9001,8 +9001,8 @@ static void nohz_balancer_kick(void)
>> if (ilb_cpu >= nr_cpu_ids)
>> return;
>>
>> - flags = atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu));
>> - if (flags & NOHZ_BALANCE_KICK)
>> + flags = atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(ilb_cpu));
>> + if (flags & NOHZ_KICK_MASK)
>> return;
>> /*
>> * Use smp_send_reschedule() instead of resched_cpu().
>> @@ -9125,8 +9125,6 @@ static void rebalance_domains(struct rq
>> int need_serialize, need_decay = 0;
>> u64 max_cost = 0;
>>
>> - update_blocked_averages(cpu);
>> -
>> rcu_read_lock();
>> for_each_domain(cpu, sd) {
>> /*
>> @@ -9221,20 +9219,27 @@ static void rebalance_domains(struct rq
>> * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> */
>> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> {
>> - int this_cpu = this_rq->cpu;
>> - struct rq *rq;
>> - int balance_cpu;
>> /* Earliest time when we have to do rebalance again */
>> unsigned long next_balance = jiffies + 60*HZ;
>> int update_next_balance = 0;
>> + int this_cpu = this_rq->cpu;
>> + unsigned int flags;
>> + int balance_cpu;
>> + struct rq *rq;
>>
>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_BALANCE_KICK))
>> - return;
>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> + return false;
>>
>> - if (idle != CPU_IDLE)
>> - goto end;
>> + if (idle != CPU_IDLE) {
>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + return false;
>> + }
>> +
>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> +
>> + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> @@ -9262,7 +9267,9 @@ static void nohz_idle_balance(struct rq
>> cpu_load_update_idle(rq);
>> rq_unlock_irq(rq, &rf);
>>
>> - rebalance_domains(rq, CPU_IDLE);
>> + update_blocked_averages(rq->cpu);
>> + if (flags & NOHZ_BALANCE_KICK)
>> + rebalance_domains(rq, CPU_IDLE);
>> }
>>
>> if (time_after(next_balance, rq->next_balance)) {
>> @@ -9271,6 +9278,10 @@ static void nohz_idle_balance(struct rq
>> }
>> }
>>
>> + update_blocked_averages(this_cpu);
>> + if (flags & NOHZ_BALANCE_KICK)
>> + rebalance_domains(this_rq, CPU_IDLE);
>> +
>> /*
>> * next_balance will be updated only when there is a need.
>> * When the CPU is attached to null domain for ex, it will not be
>> @@ -9278,8 +9289,8 @@ static void nohz_idle_balance(struct rq
>> */
>> if (likely(update_next_balance))
>> nohz.next_balance = next_balance;
>> -end:
>> - atomic_andnot(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
>> +
>> + return true;
>> }
>>
>> /*
>> @@ -9366,7 +9377,10 @@ static inline bool nohz_kick_needed(stru
>> return kick;
>> }
>> #else
>> -static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +{
>> + return false;
>> +}
>> #endif
>>
>> /*
>> @@ -9387,7 +9401,11 @@ static __latent_entropy void run_rebalan
>> * load balance only within the local sched_domain hierarchy
>> * and abort nohz_idle_balance altogether if we pull some load.
>> */
>> - nohz_idle_balance(this_rq, idle);
>> + if (nohz_idle_balance(this_rq, idle))
>> + return;
>> +
>> + /* normal load balance */
>> + update_blocked_averages(this_rq->cpu);
>> rebalance_domains(this_rq, idle);
>> }
>>
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2005,9 +2005,13 @@ extern void cfs_bandwidth_usage_dec(void
>> #ifdef CONFIG_NO_HZ_COMMON
>> #define NOHZ_TICK_STOPPED_BIT 0
>> #define NOHZ_BALANCE_KICK_BIT 1
>> +#define NOHZ_STATS_KICK_BIT 2
>>
>> #define NOHZ_TICK_STOPPED BIT(NOHZ_TICK_STOPPED_BIT)
>> #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
>> +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
>> +
>> +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
>>
>> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
>>
>>
>>

2017-12-22 07:56:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Thu, Dec 21, 2017 at 05:23:27PM +0100, Vincent Guittot wrote:
> Hi Peter,
>
> I think that part of the proposal is missing.
>
> One goal of the patchset was to kick an update of the stats of idle
> cpu when a task wake up on a cpu but the statistic has not been
> updated for a while.
>
> That's why there where a call to nohz_kick_needed in the proposal to
> kick ilb but only for updating blocked load and not a full idle load
> balance
>
> I can't find this call any more in your patchset

Yeah, I took it out because it didn't make sense to me. If you're waking
to a stale CPU, you're too late.

2017-12-22 07:59:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> In fact, we can't only rely on the tick and newly_idle load balance to
> ensure a period update of the blocked load because they can never
> happen.

I'm confused, why would the ilb not happen?

2017-12-22 08:04:57

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 22 December 2017 at 08:56, Peter Zijlstra <[email protected]> wrote:
> On Thu, Dec 21, 2017 at 05:23:27PM +0100, Vincent Guittot wrote:
>> Hi Peter,
>>
>> I think that part of the proposal is missing.
>>
>> One goal of the patchset was to kick an update of the stats of idle
>> cpu when a task wake up on a cpu but the statistic has not been
>> updated for a while.
>>
>> That's why there where a call to nohz_kick_needed in the proposal to
>> kick ilb but only for updating blocked load and not a full idle load
>> balance
>>
>> I can't find this call any more in your patchset
>
> Yeah, I took it out because it didn't make sense to me. If you're waking
> to a stale CPU, you're too late.

I agree that's it's too late for the current wake up but that's the
trade off with delaying the wake up until all blocked load has been
updated.

>

2017-12-22 08:06:07

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 22 December 2017 at 08:59, Peter Zijlstra <[email protected]> wrote:
> On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
>> In fact, we can't only rely on the tick and newly_idle load balance to
>> ensure a period update of the blocked load because they can never
>> happen.
>
> I'm confused, why would the ilb not happen?

the ilb will be kick only if tick fires which might not be the case
for task that runs less than a tick

2017-12-22 08:29:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
> On 22 December 2017 at 08:59, Peter Zijlstra <[email protected]> wrote:
> > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> >> In fact, we can't only rely on the tick and newly_idle load balance to
> >> ensure a period update of the blocked load because they can never
> >> happen.
> >
> > I'm confused, why would the ilb not happen?
>
> the ilb will be kick only if tick fires which might not be the case
> for task that runs less than a tick

Oh, urgh, you're talking about when the entire system is idle. Yes
indeed.

Lemme have a think, surely we can do something saner there.

2017-12-22 09:12:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 09:29:15AM +0100, Peter Zijlstra wrote:
> On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
> > On 22 December 2017 at 08:59, Peter Zijlstra <[email protected]> wrote:
> > > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
> > >> In fact, we can't only rely on the tick and newly_idle load balance to
> > >> ensure a period update of the blocked load because they can never
> > >> happen.
> > >
> > > I'm confused, why would the ilb not happen?
> >
> > the ilb will be kick only if tick fires which might not be the case
> > for task that runs less than a tick
>
> Oh, urgh, you're talking about when the entire system is idle. Yes
> indeed.
>
> Lemme have a think, surely we can do something saner there.

The only thing I could come up with is running a timer for this :/ That
would keep the ILB thing running until all load is decayed (have a patch
for that somewhere).

2017-12-22 14:31:51

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 10:12:21AM +0100, Peter Zijlstra wrote:

> The only thing I could come up with is running a timer for this :/ That
> would keep the ILB thing running until all load is decayed (have a patch
> for that somewhere).

Implemented that; pushed it out, should all be at:

git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/testing

but given how today is going, it'll eat your nan and set your cat on
fire.

2017-12-22 14:33:20

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 22 December 2017 at 10:12, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 22, 2017 at 09:29:15AM +0100, Peter Zijlstra wrote:
>> On Fri, Dec 22, 2017 at 09:05:45AM +0100, Vincent Guittot wrote:
>> > On 22 December 2017 at 08:59, Peter Zijlstra <[email protected]> wrote:
>> > > On Thu, Dec 21, 2017 at 05:56:32PM +0100, Vincent Guittot wrote:
>> > >> In fact, we can't only rely on the tick and newly_idle load balance to
>> > >> ensure a period update of the blocked load because they can never
>> > >> happen.
>> > >
>> > > I'm confused, why would the ilb not happen?
>> >
>> > the ilb will be kick only if tick fires which might not be the case
>> > for task that runs less than a tick
>>
>> Oh, urgh, you're talking about when the entire system is idle. Yes
>> indeed.
>>
>> Lemme have a think, surely we can do something saner there.
>
> The only thing I could come up with is running a timer for this :/ That
> would keep the ILB thing running until all load is decayed (have a patch
> for that somewhere).

IMHO running a timer doesn't sound really great
When we have enough activity on the system, the tick and the periodic
load balance will ensure the update of load of all cpus (including the
idle cpus) at the load balance period pace But if we don't have enough
activity to trig the periodic update through ilb or because the system
is not overloaded or even almost idle, we don't have these periodic
update anymore. The goal is to do a lazy update of the blocked load to
not hurt too much power consumption of idle CPUs. When a task wakes up
and the blocked idle load have not been updated for a while, we trig
the update of these blocked loads in parallel to the wake up so the
data will be more accurate for the next events. It's already too late
for the current wake up but that's not a big deal because the wake up
path of a light loaded system is mainly choosing between previous and
current cpu and the load_avg_contrib and the utilization will have
been updated for next events.



>

2017-12-22 14:34:54

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 22 December 2017 at 15:31, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 22, 2017 at 10:12:21AM +0100, Peter Zijlstra wrote:
>
>> The only thing I could come up with is running a timer for this :/ That
>> would keep the ILB thing running until all load is decayed (have a patch
>> for that somewhere).
>
> Implemented that; pushed it out, should all be at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/testing
>
> but given how today is going, it'll eat your nan and set your cat on
> fire.

Our emails crossed. i'm going to have a look at your branch

2017-12-22 18:56:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 03:32:53PM +0100, Vincent Guittot wrote:
> > The only thing I could come up with is running a timer for this :/ That
> > would keep the ILB thing running until all load is decayed (have a patch
> > for that somewhere).
>
> IMHO running a timer doesn't sound really great

I tend to agree..

> When we have enough activity on the system, the tick and the periodic
> load balance will ensure the update of load of all cpus (including the
> idle cpus) at the load balance period pace.

> But if we don't have enough activity to trig the periodic update
> through ilb or because the system is not overloaded or even almost
> idle, we don't have these periodic update anymore.

> The goal is to do a lazy update of the blocked load to not hurt too
> much power consumption of idle CPUs. When a task wakes up and the
> blocked idle load have not been updated for a while, we trig the
> update of these blocked loads in parallel to the wake up so the data
> will be more accurate for the next events.

> It's already too late for the current wake up but that's not a big
> deal because the wake up path of a light loaded system is mainly
> choosing between previous and current cpu and the load_avg_contrib and
> the utilization will have been updated for next events.

Right; but I figured we'd try and do it 'right' and see how horrible it
is before we try and do funny things.


2017-12-22 20:42:54

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> Right; but I figured we'd try and do it 'right' and see how horrible it
> is before we try and do funny things.

So now it should have a 32ms tick for up to .5s when the system goes
completely idle.

No idea how bad that is..

2018-01-02 15:45:02

by Morten Rasmussen

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Fri, Dec 22, 2017 at 09:42:47PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > Right; but I figured we'd try and do it 'right' and see how horrible it
> > is before we try and do funny things.
>
> So now it should have a 32ms tick for up to .5s when the system goes
> completely idle.
>
> No idea how bad that is..

Does it mean that the 32ms tick will keep going forever if the system
doesn't go completely idle? Some tiny background task or a slightly
bigger one with a longer period?

Do we actually care about stale values if the system is completely idle?

Instead of hacking select_task_rq_fair() to kick off a stats update as
Vincent already proposed, why can't we just modify Brendan's
CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
32ms regardless of whether we need to load-balance?

This way we should get updates if there is anything running, we don't
touch the wake-up path, we don't cause any additional wake-ups, and we
don't need a timer. What am I missing?

2018-01-03 09:16:23

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

Hi Peter,

On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>> Right; but I figured we'd try and do it 'right' and see how horrible it
>> is before we try and do funny things.
>
> So now it should have a 32ms tick for up to .5s when the system goes
> completely idle.
>
> No idea how bad that is..

I have tested your branch but the timer doesn't seem to fire correctly
because i can still see blocked load in the use case i have run.
I haven't found the reason yet

2018-01-15 08:26:16

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

Le Wednesday 03 Jan 2018 ? 10:16:00 (+0100), Vincent Guittot a ?crit :
> Hi Peter,
>
> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
> > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> >> Right; but I figured we'd try and do it 'right' and see how horrible it
> >> is before we try and do funny things.
> >
> > So now it should have a 32ms tick for up to .5s when the system goes
> > completely idle.
> >
> > No idea how bad that is..
>
> I have tested your branch but the timer doesn't seem to fire correctly
> because i can still see blocked load in the use case i have run.
> I haven't found the reason yet

Hi Peter,

With the patch below on top of your branch, the blocked loads are updated and
decayed regularly. The main differences are:
- It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
The main drawback of this solution is that the load is blocked when the
system is fully idle with the advantage of not waking up a fully idle
system. We have to wait for the next tick or newly idle event for updating
blocked load when the system leaves idle stat which can be up to a tick long.
If this is too long, we can check for kicking ilb when task wakes up so the
blocked load will be updated as soon as the system leaves idle state.
The main advantage is that we don't wake up a fully idle system every 32ms to
update blocked load that will be not used.
- I'm working on one more improvement to use nohz_idle_balance in the newly
idle case when the system is not overloaded and
(this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
of an idle cpus.

---
kernel/sched/fair.c | 72 +++++++++++++++++++++++++----------------------------
1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52114c6..898785d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5386,7 +5386,6 @@ static struct {
atomic_t stats_state;
unsigned long next_balance; /* in jiffy units */
unsigned long next_stats;
- struct timer_list timer;
} nohz ____cacheline_aligned;

#endif /* CONFIG_NO_HZ_COMMON */
@@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
prefer_sibling = 1;

#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
env->flags |= LBF_NOHZ_STATS;
+ }
#endif

load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
*next_balance = next;
}

+static void kick_ilb(unsigned int flags);
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)

if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
+ unsigned long next = READ_ONCE(nohz.next_stats);
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();

+ if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
+ kick_ilb(NOHZ_STATS_KICK);
+
goto out;
}

@@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
smp_send_reschedule(ilb_cpu);
}

-void nohz_balance_timer(struct timer_list *timer)
-{
- unsigned long next = READ_ONCE(nohz.next_stats);
-
- if (time_before(jiffies, next)) {
- mod_timer(timer, next);
- return;
- }
-
- kick_ilb(NOHZ_STATS_KICK);
-}
-
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
@@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;

+ if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
+ flags = NOHZ_STATS_KICK;
+
if (time_before(now, nohz.next_balance))
goto out;

@@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned int val, new;

SCHED_WARN_ON(cpu != smp_processor_id());

@@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
return;

rq->nohz_tick_stopped = 1;
+ rq->has_blocked_load = 1;

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
@@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
set_cpu_sd_state_idle(cpu);

/*
- * implies a barrier such that if the stats_state update is observed
- * the above updates are also visible. Pairs with stuff in
- * update_sd_lb_stats() and nohz_idle_balance().
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
*/
- val = atomic_read(&nohz.stats_state);
- do {
- new = val + 2;
- new |= 1;
- } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
+ atomic_set(&nohz.stats_state, 1);

- /*
- * If the timer was stopped, restart the thing.
- */
- if (!(val & 1))
- mod_timer(&nohz.timer, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int stats_seq;
unsigned int flags;
int balance_cpu;
struct rq *rq;
@@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return false;
}

- stats_seq = atomic_read(&nohz.stats_state);
/*
* barrier, pairs with nohz_balance_enter_idle(), ensures ...
*/
@@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

+ /*
+ * We assume there will be no idle load after this update and clear
+ * the stats state. If a cpu enters idle in the mean time, it will
+ * set the stats state and trig another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the stats state, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ */
+ atomic_set(&nohz.stats_state, 0);
+
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched())
+ if (need_resched()) {
+ has_blocked_load = true;
break;
+ }

rq = cpu_rq(balance_cpu);

@@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);

- if (has_blocked_load ||
- !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
- WRITE_ONCE(nohz.next_stats,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
- mod_timer(&nohz.timer, nohz.next_stats);
- }
+ WRITE_ONCE(nohz.next_stats,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ atomic_set(&nohz.stats_state, 1);

/*
* next_balance will be updated only when there is a need.
@@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
nohz.next_balance = jiffies;
nohz.next_stats = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- timer_setup(&nohz.timer, nohz_balance_timer, 0);
#endif
#endif /* SMP */

--
2.7.4

2018-01-15 09:43:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Tue, Jan 02, 2018 at 03:44:57PM +0000, Morten Rasmussen wrote:

> Vincent already proposed, why can't we just modify Brendan's
> CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
> 32ms regardless of whether we need to load-balance?

I think that code is there, no?

Subject: sched: Update blocked load from NEWIDLE

2018-01-18 10:33:43

by Morten Rasmussen

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Mon, Jan 15, 2018 at 10:43:18AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 02, 2018 at 03:44:57PM +0000, Morten Rasmussen wrote:
>
> > Vincent already proposed, why can't we just modify Brendan's
> > CPU_NEWLY_IDLE proposal to do a stats update from idle_balance() every
> > 32ms regardless of whether we need to load-balance?
>
> I think that code is there, no?
>
> Subject: sched: Update blocked load from NEWIDLE

The mechanics are there, but I think the problem is that the
idle_balance() bails out before we get to it in some cases. If we only
have a few small periodic tasks running rd->overload won't be set and
idle_balance() returns before doing anything.

We would need some sort of check to see if a PELT update is due and make
sure it happens, even if idle_balance() has nothing to do.

2018-01-18 10:39:50

by Morten Rasmussen

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
> Le Wednesday 03 Jan 2018 ? 10:16:00 (+0100), Vincent Guittot a ?crit :
> > Hi Peter,
> >
> > On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
> > > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > >> Right; but I figured we'd try and do it 'right' and see how horrible it
> > >> is before we try and do funny things.
> > >
> > > So now it should have a 32ms tick for up to .5s when the system goes
> > > completely idle.
> > >
> > > No idea how bad that is..
> >
> > I have tested your branch but the timer doesn't seem to fire correctly
> > because i can still see blocked load in the use case i have run.
> > I haven't found the reason yet
>
> Hi Peter,
>
> With the patch below on top of your branch, the blocked loads are updated and
> decayed regularly. The main differences are:
> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
> The main drawback of this solution is that the load is blocked when the
> system is fully idle with the advantage of not waking up a fully idle
> system. We have to wait for the next tick or newly idle event for updating
> blocked load when the system leaves idle stat which can be up to a tick long.
> If this is too long, we can check for kicking ilb when task wakes up so the
> blocked load will be updated as soon as the system leaves idle state.
> The main advantage is that we don't wake up a fully idle system every 32ms to
> update blocked load that will be not used.
> - I'm working on one more improvement to use nohz_idle_balance in the newly
> idle case when the system is not overloaded and
> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
> this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
> of an idle cpus.

This sound like what I meant in my other reply :-)

It seems pointless to have a timer to update PELT if the system is
completely idle, and when it isn't we can piggy back other events to
make the updates happen.

2018-01-22 09:41:46

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 01/15/2018 08:26 AM, Vincent Guittot wrote:
> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>> Hi Peter,
>>
>> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>> is before we try and do funny things.
>>>
>>> So now it should have a 32ms tick for up to .5s when the system goes
>>> completely idle.
>>>
>>> No idea how bad that is..
>>
>> I have tested your branch but the timer doesn't seem to fire correctly
>> because i can still see blocked load in the use case i have run.
>> I haven't found the reason yet

Isn't the issue with the timer based implementation that
rq->has_blocked_load is never set to 1 ?

Something you changed in your implementation by adding a
rq->has_blocked_load = 1 into nohz_balance_enter_idle().

>
> Hi Peter,
>
> With the patch below on top of your branch, the blocked loads are updated and
> decayed regularly. The main differences are:
> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
> The main drawback of this solution is that the load is blocked when the
> system is fully idle with the advantage of not waking up a fully idle
> system. We have to wait for the next tick or newly idle event for updating
> blocked load when the system leaves idle stat which can be up to a tick long.
> If this is too long, we can check for kicking ilb when task wakes up so the
> blocked load will be updated as soon as the system leaves idle state.
> The main advantage is that we don't wake up a fully idle system every 32ms to
> update blocked load that will be not used.
> - I'm working on one more improvement to use nohz_idle_balance in the newly
> idle case when the system is not overloaded and
> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
> this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
> of an idle cpus.
>
> ---
> kernel/sched/fair.c | 72 +++++++++++++++++++++++++----------------------------
> 1 file changed, 34 insertions(+), 38 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 52114c6..898785d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5386,7 +5386,6 @@ static struct {
> atomic_t stats_state;
> unsigned long next_balance; /* in jiffy units */
> unsigned long next_stats;
> - struct timer_list timer;
> } nohz ____cacheline_aligned;
>
> #endif /* CONFIG_NO_HZ_COMMON */
> @@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> prefer_sibling = 1;
>
> #ifdef CONFIG_NO_HZ_COMMON
> - if (env->idle == CPU_NEWLY_IDLE)
> + if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
> env->flags |= LBF_NOHZ_STATS;
> + }
> #endif
>
> load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
> *next_balance = next;
> }
>
> +static void kick_ilb(unsigned int flags);
> +
> /*
> * idle_balance is called by schedule() if this_cpu is about to become
> * idle. Attempts to pull tasks from other CPUs.
> @@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> !this_rq->rd->overload) {
> + unsigned long next = READ_ONCE(nohz.next_stats);
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> if (sd)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
>
> + if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> + kick_ilb(NOHZ_STATS_KICK);
> +
> goto out;
> }
>
> @@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
> smp_send_reschedule(ilb_cpu);
> }
>
> -void nohz_balance_timer(struct timer_list *timer)
> -{
> - unsigned long next = READ_ONCE(nohz.next_stats);
> -
> - if (time_before(jiffies, next)) {
> - mod_timer(timer, next);
> - return;
> - }
> -
> - kick_ilb(NOHZ_STATS_KICK);
> -}
> -
> /*
> * Current heuristic for kicking the idle load balancer in the presence
> * of an idle cpu in the system.
> @@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return;
>
> + if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
> + flags = NOHZ_STATS_KICK;
> +
> if (time_before(now, nohz.next_balance))
> goto out;
>
> @@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
> void nohz_balance_enter_idle(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> - unsigned int val, new;
>
> SCHED_WARN_ON(cpu != smp_processor_id());
>
> @@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
> return;
>
> rq->nohz_tick_stopped = 1;
> + rq->has_blocked_load = 1;
>
> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
> atomic_inc(&nohz.nr_cpus);
> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
> set_cpu_sd_state_idle(cpu);
>
> /*
> - * implies a barrier such that if the stats_state update is observed
> - * the above updates are also visible. Pairs with stuff in
> - * update_sd_lb_stats() and nohz_idle_balance().
> + * Each time a cpu enter idle, we assume that it has blocked load and
> + * enable the periodic update of the load of idle cpus
> */
> - val = atomic_read(&nohz.stats_state);
> - do {
> - new = val + 2;
> - new |= 1;
> - } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
> + atomic_set(&nohz.stats_state, 1);
>
> - /*
> - * If the timer was stopped, restart the thing.
> - */
> - if (!(val & 1))
> - mod_timer(&nohz.timer, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> }
> #else
> static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> bool has_blocked_load = false;
> int update_next_balance = 0;
> int this_cpu = this_rq->cpu;
> - unsigned int stats_seq;
> unsigned int flags;
> int balance_cpu;
> struct rq *rq;
> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> return false;
> }
>
> - stats_seq = atomic_read(&nohz.stats_state);
> /*
> * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> */
> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the stats state. If a cpu enters idle in the mean time, it will
> + * set the stats state and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the stats state, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + atomic_set(&nohz.stats_state, 0);
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> + if (need_resched()) {
> + has_blocked_load = true;
> break;
> + }
>
> rq = cpu_rq(balance_cpu);
>
> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - if (has_blocked_load ||
> - !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
> - WRITE_ONCE(nohz.next_stats,
> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> - mod_timer(&nohz.timer, nohz.next_stats);
> - }
> + WRITE_ONCE(nohz.next_stats,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + atomic_set(&nohz.stats_state, 1);
>
> /*
> * next_balance will be updated only when there is a need.
> @@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
> nohz.next_balance = jiffies;
> nohz.next_stats = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> - timer_setup(&nohz.timer, nohz_balance_timer, 0);
> #endif
> #endif /* SMP */
>
>


2018-01-22 10:24:52

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 22 January 2018 at 10:40, Dietmar Eggemann <[email protected]> wrote:
> On 01/15/2018 08:26 AM, Vincent Guittot wrote:
>>
>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>
>>> Hi Peter,
>>>
>>> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]>
>>> wrote:
>>>>
>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>
>>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>>> is before we try and do funny things.
>>>>
>>>>
>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>> completely idle.
>>>>
>>>> No idea how bad that is..
>>>
>>>
>>> I have tested your branch but the timer doesn't seem to fire correctly
>>> because i can still see blocked load in the use case i have run.
>>> I haven't found the reason yet
>
>
> Isn't the issue with the timer based implementation that
> rq->has_blocked_load is never set to 1 ?

Yes that's what I suggested to Peter on IRC

>
> Something you changed in your implementation by adding a
> rq->has_blocked_load = 1 into nohz_balance_enter_idle().
>
>
>>
>> Hi Peter,
>>
>> With the patch below on top of your branch, the blocked loads are updated
>> and
>> decayed regularly. The main differences are:
>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>> idle.
>> The main drawback of this solution is that the load is blocked when the
>> system is fully idle with the advantage of not waking up a fully idle
>> system. We have to wait for the next tick or newly idle event for
>> updating
>> blocked load when the system leaves idle stat which can be up to a tick
>> long.
>> If this is too long, we can check for kicking ilb when task wakes up so
>> the
>> blocked load will be updated as soon as the system leaves idle state.
>> The main advantage is that we don't wake up a fully idle system every
>> 32ms to
>> update blocked load that will be not used.
>> - I'm working on one more improvement to use nohz_idle_balance in the
>> newly
>> idle case when the system is not overloaded and
>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can
>> try to
>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>> exceed
>> this_rq->avg_idle. This will remove some calls to kick_ilb and some
>> wake up
>> of an idle cpus.
>>
>> ---
>> kernel/sched/fair.c | 72
>> +++++++++++++++++++++++++----------------------------
>> 1 file changed, 34 insertions(+), 38 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 52114c6..898785d 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5386,7 +5386,6 @@ static struct {
>> atomic_t stats_state;
>> unsigned long next_balance; /* in jiffy units */
>> unsigned long next_stats;
>> - struct timer_list timer;
>> } nohz ____cacheline_aligned;
>> #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -8004,8 +8003,9 @@ static inline void update_sd_lb_stats(struct lb_env
>> *env, struct sd_lb_stats *sd
>> prefer_sibling = 1;
>> #ifdef CONFIG_NO_HZ_COMMON
>> - if (env->idle == CPU_NEWLY_IDLE)
>> + if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state))
>> {
>> env->flags |= LBF_NOHZ_STATS;
>> + }
>> #endif
>> load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8818,6 +8818,8 @@ update_next_balance(struct sched_domain *sd,
>> unsigned long *next_balance)
>> *next_balance = next;
>> }
>> +static void kick_ilb(unsigned int flags);
>> +
>> /*
>> * idle_balance is called by schedule() if this_cpu is about to become
>> * idle. Attempts to pull tasks from other CPUs.
>> @@ -8852,12 +8854,16 @@ static int idle_balance(struct rq *this_rq, struct
>> rq_flags *rf)
>> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> !this_rq->rd->overload) {
>> + unsigned long next = READ_ONCE(nohz.next_stats);
>> rcu_read_lock();
>> sd = rcu_dereference_check_sched_domain(this_rq->sd);
>> if (sd)
>> update_next_balance(sd, &next_balance);
>> rcu_read_unlock();
>> + if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state))
>> + kick_ilb(NOHZ_STATS_KICK);
>> +
>> goto out;
>> }
>> @@ -9075,18 +9081,6 @@ static void kick_ilb(unsigned int flags)
>> smp_send_reschedule(ilb_cpu);
>> }
>> -void nohz_balance_timer(struct timer_list *timer)
>> -{
>> - unsigned long next = READ_ONCE(nohz.next_stats);
>> -
>> - if (time_before(jiffies, next)) {
>> - mod_timer(timer, next);
>> - return;
>> - }
>> -
>> - kick_ilb(NOHZ_STATS_KICK);
>> -}
>> -
>> /*
>> * Current heuristic for kicking the idle load balancer in the presence
>> * of an idle cpu in the system.
>> @@ -9122,6 +9116,9 @@ static void nohz_balancer_kick(struct rq *rq)
>> if (likely(!atomic_read(&nohz.nr_cpus)))
>> return;
>> + if (time_after(now, nohz.next_stats) &&
>> atomic_read(&nohz.stats_state))
>> + flags = NOHZ_STATS_KICK;
>> +
>> if (time_before(now, nohz.next_balance))
>> goto out;
>> @@ -9227,7 +9224,6 @@ static void set_cpu_sd_state_idle(int cpu)
>> void nohz_balance_enter_idle(int cpu)
>> {
>> struct rq *rq = cpu_rq(cpu);
>> - unsigned int val, new;
>> SCHED_WARN_ON(cpu != smp_processor_id());
>> @@ -9251,6 +9247,7 @@ void nohz_balance_enter_idle(int cpu)
>> return;
>> rq->nohz_tick_stopped = 1;
>> + rq->has_blocked_load = 1;
>> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>> atomic_inc(&nohz.nr_cpus);
>> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>> set_cpu_sd_state_idle(cpu);
>> /*
>> - * implies a barrier such that if the stats_state update is
>> observed
>> - * the above updates are also visible. Pairs with stuff in
>> - * update_sd_lb_stats() and nohz_idle_balance().
>> + * Each time a cpu enter idle, we assume that it has blocked load
>> and
>> + * enable the periodic update of the load of idle cpus
>> */
>> - val = atomic_read(&nohz.stats_state);
>> - do {
>> - new = val + 2;
>> - new |= 1;
>> - } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
>> + atomic_set(&nohz.stats_state, 1);
>> - /*
>> - * If the timer was stopped, restart the thing.
>> - */
>> - if (!(val & 1))
>> - mod_timer(&nohz.timer, jiffies +
>> msecs_to_jiffies(LOAD_AVG_PERIOD));
>> }
>> #else
>> static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9409,7 +9396,6 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> bool has_blocked_load = false;
>> int update_next_balance = 0;
>> int this_cpu = this_rq->cpu;
>> - unsigned int stats_seq;
>> unsigned int flags;
>> int balance_cpu;
>> struct rq *rq;
>> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> return false;
>> }
>> - stats_seq = atomic_read(&nohz.stats_state);
>> /*
>> * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> */
>> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>> + /*
>> + * We assume there will be no idle load after this update and
>> clear
>> + * the stats state. If a cpu enters idle in the mean time, it will
>> + * set the stats state and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask
>> before
>> + * setting the stats state, we are sure to not clear the state and
>> not
>> + * check the load of an idle cpu.
>> + */
>> + atomic_set(&nohz.stats_state, 0);
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> break;
>> + }
>> rq = cpu_rq(balance_cpu);
>> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq
>> *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>> - if (has_blocked_load ||
>> - !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
>> - WRITE_ONCE(nohz.next_stats,
>> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> - mod_timer(&nohz.timer, nohz.next_stats);
>> - }
>> + WRITE_ONCE(nohz.next_stats,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + atomic_set(&nohz.stats_state, 1);
>> /*
>> * next_balance will be updated only when there is a need.
>> @@ -10115,7 +10112,6 @@ __init void init_sched_fair_class(void)
>> nohz.next_balance = jiffies;
>> nohz.next_stats = jiffies;
>> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>> - timer_setup(&nohz.timer, nohz_balance_timer, 0);
>> #endif
>> #endif /* SMP */
>>
>
>

2018-01-24 08:26:24

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

Hi,

Le Thursday 18 Jan 2018 ? 10:38:07 (+0000), Morten Rasmussen a ?crit :
> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
> > Le Wednesday 03 Jan 2018 ? 10:16:00 (+0100), Vincent Guittot a ?crit :
> > > Hi Peter,
> > >
> > > On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
> > > > On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
> > > >> Right; but I figured we'd try and do it 'right' and see how horrible it
> > > >> is before we try and do funny things.
> > > >
> > > > So now it should have a 32ms tick for up to .5s when the system goes
> > > > completely idle.
> > > >
> > > > No idea how bad that is..
> > >
> > > I have tested your branch but the timer doesn't seem to fire correctly
> > > because i can still see blocked load in the use case i have run.
> > > I haven't found the reason yet
> >
> > Hi Peter,
> >
> > With the patch below on top of your branch, the blocked loads are updated and
> > decayed regularly. The main differences are:
> > - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
> > The main drawback of this solution is that the load is blocked when the
> > system is fully idle with the advantage of not waking up a fully idle
> > system. We have to wait for the next tick or newly idle event for updating
> > blocked load when the system leaves idle stat which can be up to a tick long.
> > If this is too long, we can check for kicking ilb when task wakes up so the
> > blocked load will be updated as soon as the system leaves idle state.
> > The main advantage is that we don't wake up a fully idle system every 32ms to
> > update blocked load that will be not used.
> > - I'm working on one more improvement to use nohz_idle_balance in the newly
> > idle case when the system is not overloaded and
> > (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
> > use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
> > this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
> > of an idle cpus.
>
> This sound like what I meant in my other reply :-)
>
> It seems pointless to have a timer to update PELT if the system is
> completely idle, and when it isn't we can piggy back other events to
> make the updates happen.

The patch below implements what has been described above. It calls part of
nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
time. This removes part of ilb that are kicked on an idle cpu for updating
the blocked load but the ratio really depends on when the tick happens compared
to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
enables to update the blocked loads when a cpu becomes idle 1 period before
kicking an ilb and there is far less ilb because we give more chance to the
newly idle case (time_after is replaced by time_after_eq in idle_balance()).

The patch also uses a function cfs_rq_has_blocked, which only checks the
util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
reduce significantly the number of update of blocked load. the *_avg will be
fully decayed in around 300~400ms but it's far longer for the *_sum which have
a higher resolution and we can easily reach almost seconds. But only the *_avg
are used to make decision so keeping some blocked *_sum is acceptable.

---
kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 92 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 898785d..ed90303 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}

+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED

static void update_blocked_averages(int cpu)
@@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
- else
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
done = false;
}

@@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
- if (cfs_rq_is_decayed(cfs_rq))
+ if (cfs_rq_has_blocked(cfs_rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
@@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
*next_balance = next;
}

+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
static void kick_ilb(unsigned int flags);

/*
@@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();

- if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
+ /*
+ * Update blocked idle load if it has not been done for a
+ * while. Try to do it locally before entering idle but kick a
+ * ilb if it takes too much time and might delay next local
+ * wake up
+ */
+ if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
+ !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);

goto out;
@@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;

+ rq->has_blocked_load = 1;
if (rq->nohz_tick_stopped)
return;

@@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
return;

rq->nohz_tick_stopped = 1;
- rq->has_blocked_load = 1;

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
@@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
* enable the periodic update of the load of idle cpus
*/
atomic_set(&nohz.stats_state, 1);
-
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * For newly idle mode, we abort the loop if it takes too much time and return
+ * false to notify that the loop has not be completed and a ilb shoud be kick.
*/
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int flags;
int balance_cpu;
+ int ret = false;
struct rq *rq;
-
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- /*
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
- */
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- if (!(flags & NOHZ_KICK_MASK))
- return false;
+ u64 curr_cost = 0;

SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

@@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
atomic_set(&nohz.stats_state, 0);

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ u64 t0, domain_cost;
+
+ t0 = sched_clock_cpu(this_cpu);
+
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;

@@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
*/
if (need_resched()) {
has_blocked_load = true;
- break;
+ goto abort;
+ }
+
+ /*
+ * If the update is done while CPU becomes idle, we abort
+ * the update when its cost is higher than the average idle
+ * time in orde to not delay a possible wake up.
+ */
+ if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
+ has_blocked_load = true;
+ goto abort;
}

rq = cpu_rq(balance_cpu);
@@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;

- rq_lock_irq(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);

if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
@@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
next_balance = rq->next_balance;
update_next_balance = 1;
}
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ curr_cost += domain_cost;
+
}

- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }

if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);
@@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
WRITE_ONCE(nohz.next_stats,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));

+ /* The full idle balance loop has been done */
+ ret = true;
+
+abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
atomic_set(&nohz.stats_state, 1);
@@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (likely(update_next_balance))
nohz.next_balance = next_balance;

+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
return true;
}
#else
--
2.7.4


2018-01-29 18:45:37

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 01/24/2018 09:25 AM, Vincent Guittot wrote:
> Hi,
>
> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :

[...]

>>>
>>> Hi Peter,
>>>
>>> With the patch below on top of your branch, the blocked loads are updated and
>>> decayed regularly. The main differences are:
>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>>> The main drawback of this solution is that the load is blocked when the
>>> system is fully idle with the advantage of not waking up a fully idle
>>> system. We have to wait for the next tick or newly idle event for updating
>>> blocked load when the system leaves idle stat which can be up to a tick long.
>>> If this is too long, we can check for kicking ilb when task wakes up so the
>>> blocked load will be updated as soon as the system leaves idle state.
>>> The main advantage is that we don't wake up a fully idle system every 32ms to
>>> update blocked load that will be not used.
>>> - I'm working on one more improvement to use nohz_idle_balance in the newly
>>> idle case when the system is not overloaded and
>>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>>> this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>>> of an idle cpus.
>>
>> This sound like what I meant in my other reply :-)
>>
>> It seems pointless to have a timer to update PELT if the system is
>> completely idle, and when it isn't we can piggy back other events to
>> make the updates happen.
>
> The patch below implements what has been described above. It calls part of
> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
> time. This removes part of ilb that are kicked on an idle cpu for updating
> the blocked load but the ratio really depends on when the tick happens compared
> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
> enables to update the blocked loads when a cpu becomes idle 1 period before
> kicking an ilb and there is far less ilb because we give more chance to the
> newly idle case (time_after is replaced by time_after_eq in idle_balance()).
>
> The patch also uses a function cfs_rq_has_blocked, which only checks the
> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
> reduce significantly the number of update of blocked load. the *_avg will be
> fully decayed in around 300~400ms but it's far longer for the *_sum which have
> a higher resolution and we can easily reach almost seconds. But only the *_avg
> are used to make decision so keeping some blocked *_sum is acceptable.
>
> ---
> kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 92 insertions(+), 29 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> + if (cfs_rq->avg.load_avg)
> + return true;
> +
> + if (cfs_rq->avg.util_avg)
> + return true;
> +
> + return false;
> +}
> +

Can we not change cfs_rq_is_decayed() to use avg.foo_avg instead of
avg.foo_sum ?

> #ifdef CONFIG_FAIR_GROUP_SCHED
>
> static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> - else
> +
> + /* Don't need periodic decay once load/util_avg are null */
> + if (cfs_rq_has_blocked(cfs_rq))
> done = false;
> }
>
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> - if (cfs_rq_is_decayed(cfs_rq))
> + if (cfs_rq_has_blocked(cfs_rq))

Schouldn't this be !cfs_rq_has_blocked(cfs_rq) ?

> rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);

[...]

> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> */
> if (need_resched()) {
> has_blocked_load = true;
> - break;
> + goto abort;
> + }
> +
> + /*
> + * If the update is done while CPU becomes idle, we abort
> + * the update when its cost is higher than the average idle
> + * time in orde to not delay a possible wake up.
> + */
> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> + has_blocked_load = true;
> + goto abort;
> }
>
> rq = cpu_rq(balance_cpu);
> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (time_after_eq(jiffies, rq->next_balance)) {
> struct rq_flags rf;
>
> - rq_lock_irq(rq, &rf);
> + rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> cpu_load_update_idle(rq);
> - rq_unlock_irq(rq, &rf);
> + rq_unlock_irqrestore(rq, &rf);
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> next_balance = rq->next_balance;
> update_next_balance = 1;
> }

Why do you do this cpu_load_update_idle(rq) even this was called with
CPU_NEWLY_IDLE? Wouldn't it be sufficient to jump to the curr_cost
calculation in this case?

> +
> + domain_cost = sched_clock_cpu(this_cpu) - t0;
> + curr_cost += domain_cost;
> +
> }
>
> - update_blocked_averages(this_cpu);
> - has_blocked_load |= this_rq->has_blocked_load;
> + /* Newly idle CPU doesn't need an update */
> + if (idle != CPU_NEWLY_IDLE) {
> + update_blocked_averages(this_cpu);
> + has_blocked_load |= this_rq->has_blocked_load;
> + }
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);

[...]

2018-01-29 19:31:56

by Valentin Schneider

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

Hi Vincent, Peter,

I've been running some tests on your patches (Peter's base + the 2 from
Vincent). The results themselves are hosted at [1].
The base of those tests is the same: a task ("accumulator") is ran for 5
seconds (arbitrary value) to accumulate some load, then goes to sleep
for .5 seconds.

I've set up 3 test scenarios:

Update by nohz_balance_kick()
-----------------------------
Right before the "accumulator" task goes to sleep, a CPU-hogging task
(100% utilization) is spawned on another CPU. It won't go idle so the
only way to update the blocked load generated by "accumulator" is to
kick an ILB (NOHZ_STATS_KICK).

The test shows that this is behaving nicely - we keep kicking an ILB
every ~36ms (see next test for comments on that) until there is no more
blocked load. I did however notice some interesting scenarios: after the
load has been fully decayed, a tiny background task can spawn and end in
less than a scheduling period. However, it still goes through
nohz_balance_enter_idle(), and thus sets nohz.stats_state, which will
later cause an ILB kick.

This makes me wonder if it's worth kicking ILBs for such tiny load
values - perhaps it could be worth having a margin to set
rq->has_blocked_load ?

Furthermore, this tiny task will cause the ILB to iterate over all of
the idle CPUs, although only one has stale load. For load update via
NEWLY_IDLE load_balance() we use:

static bool update_nohz_stats(struct rq *rq)
{
    if (!rq->has_blocked_load)
     return false;
    [...]
}

But for load update via _nohz_idle_balance(), we iterate through all of
the nohz CPUS and unconditionally call update_blocked_averages(). This
could be avoided by remembering which CPUs have stale load before going
idle. Initially I thought that was what nohz.stats_state was for, but it
isn't.
With Vincent's patches it's only ever set to either 0 or 1, but we could
use it as a CPU mask, and use it to skip nohz CPUs that don't have stale
load in _nohz_idle_balance() (when NOHZ_STATS_KICK).

Update by idle_balance()
------------------------
Right before the "accumulator" task goes to sleep, a tiny periodic
(period=32ms) task is spawned on another CPU. It's expected that it will
update the blocked load in idle_balance(), either by running
_nohz_idle_balance() locally or kicking an ILB (The overload flag
shouldn't be set in this test case, so we shouldn't go through the
NEWLY_IDLE load_balance()).

This also seems to be working fine, but I'm noticing a delay between
load updates that is closer to 64ms than 32ms. After digging into it I
found out that the time checks done in idle_balance() and
nohz_balancer_kick() are time_after(jiffies, next_stats), but IMHO they
should be time_after_eq(jiffies, next_stats) to have 32ms-based updates.
This also explains the 36ms periodicity of the updates in the test above.


No update (idle system)
-----------------------
Nothing special here, just making sure nothing happens when the system
is fully idle. On a sidenote, that's relatively hard to achieve - I had
to switch over to Juno because my HiKey960 gets interrupts every 16ms.
The Juno still gets woken up every now and then but it's a bit quieter.


[1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0


On 01/24/2018 08:25 AM, Vincent Guittot wrote:
> Hi,
>
> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>> Hi Peter,
>>>>
>>>> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]> wrote:
>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible it
>>>>>> is before we try and do funny things.
>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>> completely idle.
>>>>>
>>>>> No idea how bad that is..
>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>> because i can still see blocked load in the use case i have run.
>>>> I haven't found the reason yet
>>> Hi Peter,
>>>
>>> With the patch below on top of your branch, the blocked loads are updated and
>>> decayed regularly. The main differences are:
>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes idle.
>>> The main drawback of this solution is that the load is blocked when the
>>> system is fully idle with the advantage of not waking up a fully idle
>>> system. We have to wait for the next tick or newly idle event for updating
>>> blocked load when the system leaves idle stat which can be up to a tick long.
>>> If this is too long, we can check for kicking ilb when task wakes up so the
>>> blocked load will be updated as soon as the system leaves idle state.
>>> The main advantage is that we don't wake up a fully idle system every 32ms to
>>> update blocked load that will be not used.
>>> - I'm working on one more improvement to use nohz_idle_balance in the newly
>>> idle case when the system is not overloaded and
>>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we can try to
>>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it exceed
>>> this_rq->avg_idle. This will remove some calls to kick_ilb and some wake up
>>> of an idle cpus.
>> This sound like what I meant in my other reply :-)
>>
>> It seems pointless to have a timer to update PELT if the system is
>> completely idle, and when it isn't we can piggy back other events to
>> make the updates happen.
> The patch below implements what has been described above. It calls part of
> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too much
> time. This removes part of ilb that are kicked on an idle cpu for updating
> the blocked load but the ratio really depends on when the tick happens compared
> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch that
> enables to update the blocked loads when a cpu becomes idle 1 period before
> kicking an ilb and there is far less ilb because we give more chance to the
> newly idle case (time_after is replaced by time_after_eq in idle_balance()).
>
> The patch also uses a function cfs_rq_has_blocked, which only checks the
> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too. This
> reduce significantly the number of update of blocked load. the *_avg will be
> fully decayed in around 300~400ms but it's far longer for the *_sum which have
> a higher resolution and we can easily reach almost seconds. But only the *_avg
> are used to make decision so keeping some blocked *_sum is acceptable.
>
> ---
> kernel/sched/fair.c | 121 +++++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 92 insertions(+), 29 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> + if (cfs_rq->avg.load_avg)
> + return true;
> +
> + if (cfs_rq->avg.util_avg)
> + return true;
> +
> + return false;
> +}
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
>
> static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> - else
> +
> + /* Don't need periodic decay once load/util_avg are null */
> + if (cfs_rq_has_blocked(cfs_rq))
> done = false;
> }
>
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> - if (cfs_rq_is_decayed(cfs_rq))
> + if (cfs_rq_has_blocked(cfs_rq))
> rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
> *next_balance = next;
> }
>
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
> static void kick_ilb(unsigned int flags);
>
> /*
> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
>
> - if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> + /*
> + * Update blocked idle load if it has not been done for a
> + * while. Try to do it locally before entering idle but kick a
> + * ilb if it takes too much time and might delay next local
> + * wake up
> + */
> + if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> kick_ilb(NOHZ_STATS_KICK);
>
> goto out;
> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
> return;
>
> + rq->has_blocked_load = 1;
> if (rq->nohz_tick_stopped)
> return;
>
> @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
> return;
>
> rq->nohz_tick_stopped = 1;
> - rq->has_blocked_load = 1;
>
> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
> atomic_inc(&nohz.nr_cpus);
> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
> * enable the periodic update of the load of idle cpus
> */
> atomic_set(&nohz.stats_state, 1);
> -
> }
> #else
> static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>
> #ifdef CONFIG_NO_HZ_COMMON
> /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and return
> + * false to notify that the loop has not be completed and a ilb shoud be kick.
> */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
> {
> /* Earliest time when we have to do rebalance again */
> unsigned long now = jiffies;
> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> bool has_blocked_load = false;
> int update_next_balance = 0;
> int this_cpu = this_rq->cpu;
> - unsigned int flags;
> int balance_cpu;
> + int ret = false;
> struct rq *rq;
> -
> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> - return false;
> -
> - if (idle != CPU_IDLE) {
> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> - return false;
> - }
> -
> - /*
> - * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> - */
> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> - if (!(flags & NOHZ_KICK_MASK))
> - return false;
> + u64 curr_cost = 0;
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> atomic_set(&nohz.stats_state, 0);
>
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> + u64 t0, domain_cost;
> +
> + t0 = sched_clock_cpu(this_cpu);
> +
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
>
> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> */
> if (need_resched()) {
> has_blocked_load = true;
> - break;
> + goto abort;
> + }
> +
> + /*
> + * If the update is done while CPU becomes idle, we abort
> + * the update when its cost is higher than the average idle
> + * time in orde to not delay a possible wake up.
> + */
> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> + has_blocked_load = true;
> + goto abort;
> }
>
> rq = cpu_rq(balance_cpu);
> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (time_after_eq(jiffies, rq->next_balance)) {
> struct rq_flags rf;
>
> - rq_lock_irq(rq, &rf);
> + rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> cpu_load_update_idle(rq);
> - rq_unlock_irq(rq, &rf);
> + rq_unlock_irqrestore(rq, &rf);
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> next_balance = rq->next_balance;
> update_next_balance = 1;
> }
> +
> + domain_cost = sched_clock_cpu(this_cpu) - t0;
> + curr_cost += domain_cost;
> +
> }
>
> - update_blocked_averages(this_cpu);
> - has_blocked_load |= this_rq->has_blocked_load;
> + /* Newly idle CPU doesn't need an update */
> + if (idle != CPU_NEWLY_IDLE) {
> + update_blocked_averages(this_cpu);
> + has_blocked_load |= this_rq->has_blocked_load;
> + }
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> WRITE_ONCE(nohz.next_stats,
> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
> + /* The full idle balance loop has been done */
> + ret = true;
> +
> +abort:
> /* There is still blocked load, enable periodic update */
> if (has_blocked_load)
> atomic_set(&nohz.stats_state, 1);
> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (likely(update_next_balance))
> nohz.next_balance = next_balance;
>
> + return ret;
> +}
> +
> +/*
> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + */
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> + int this_cpu = this_rq->cpu;
> + unsigned int flags;
> +
> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> + return false;
> +
> + if (idle != CPU_IDLE) {
> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + return false;
> + }
> +
> + /*
> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> + */
> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + if (!(flags & NOHZ_KICK_MASK))
> + return false;
> +
> + _nohz_idle_balance(this_rq, flags, idle);
> +
> return true;
> }
> #else


2018-01-30 08:01:40

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 29 January 2018 at 19:43, Dietmar Eggemann <[email protected]> wrote:
> On 01/24/2018 09:25 AM, Vincent Guittot wrote:
>>
>> Hi,
>>
>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>
>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>
>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>
>
> [...]
>
>
>>>>
>>>> Hi Peter,
>>>>
>>>> With the patch below on top of your branch, the blocked loads are
>>>> updated and
>>>> decayed regularly. The main differences are:
>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>> idle.
>>>> The main drawback of this solution is that the load is blocked when
>>>> the
>>>> system is fully idle with the advantage of not waking up a fully idle
>>>> system. We have to wait for the next tick or newly idle event for
>>>> updating
>>>> blocked load when the system leaves idle stat which can be up to a
>>>> tick long.
>>>> If this is too long, we can check for kicking ilb when task wakes up
>>>> so the
>>>> blocked load will be updated as soon as the system leaves idle state.
>>>> The main advantage is that we don't wake up a fully idle system every
>>>> 32ms to
>>>> update blocked load that will be not used.
>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>> newly
>>>> idle case when the system is not overloaded and
>>>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>> can try to
>>>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>> exceed
>>>> this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>> wake up
>>>> of an idle cpus.
>>>
>>>
>>> This sound like what I meant in my other reply :-)
>>>
>>> It seems pointless to have a timer to update PELT if the system is
>>> completely idle, and when it isn't we can piggy back other events to
>>> make the updates happen.
>>
>>
>> The patch below implements what has been described above. It calls part of
>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>> much
>> time. This removes part of ilb that are kicked on an idle cpu for updating
>> the blocked load but the ratio really depends on when the tick happens
>> compared
>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>> that
>> enables to update the blocked loads when a cpu becomes idle 1 period
>> before
>> kicking an ilb and there is far less ilb because we give more chance to
>> the
>> newly idle case (time_after is replaced by time_after_eq in
>> idle_balance()).
>>
>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>> This
>> reduce significantly the number of update of blocked load. the *_avg will
>> be
>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>> have
>> a higher resolution and we can easily reach almost seconds. But only the
>> *_avg
>> are used to make decision so keeping some blocked *_sum is acceptable.
>>
>> ---
>> kernel/sched/fair.c | 121
>> +++++++++++++++++++++++++++++++++++++++-------------
>> 1 file changed, 92 insertions(+), 29 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>> *cfs_rq)
>> return true;
>> }
>> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> + if (cfs_rq->avg.load_avg)
>> + return true;
>> +
>> + if (cfs_rq->avg.util_avg)
>> + return true;
>> +
>> + return false;
>> +}
>> +
>
>
> Can we not change cfs_rq_is_decayed() to use avg.foo_avg instead of
> avg.foo_sum ?

I don't think so because the *_sum are used to keep coherency bewteen
cfs_rq and cgroups when task migrates and are enqueued/dequeued so wwe
can't remove it until the *_sum are null otherwise the all cfs_rq and
group will be out of sync

>
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>> static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> - else
>> +
>> + /* Don't need periodic decay once load/util_avg are null
>> */
>> + if (cfs_rq_has_blocked(cfs_rq))
>> done = false;
>> }
>> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>> cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> - if (cfs_rq_is_decayed(cfs_rq))
>> + if (cfs_rq_has_blocked(cfs_rq))
>
>
> Schouldn't this be !cfs_rq_has_blocked(cfs_rq) ?

yes. I copy/pasted too quickly from sched_group_fair to not sched_group_fair

>
>> rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>
>
> [...]
>
>
>> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> */
>> if (need_resched()) {
>> has_blocked_load = true;
>> - break;
>> + goto abort;
>> + }
>> +
>> + /*
>> + * If the update is done while CPU becomes idle, we abort
>> + * the update when its cost is higher than the average
>> idle
>> + * time in orde to not delay a possible wake up.
>> + */
>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>> curr_cost) {
>> + has_blocked_load = true;
>> + goto abort;
>> }
>> rq = cpu_rq(balance_cpu);
>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> if (time_after_eq(jiffies, rq->next_balance)) {
>> struct rq_flags rf;
>> - rq_lock_irq(rq, &rf);
>> + rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> cpu_load_update_idle(rq);
>> - rq_unlock_irq(rq, &rf);
>> + rq_unlock_irqrestore(rq, &rf);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> next_balance = rq->next_balance;
>> update_next_balance = 1;
>> }
>
>
> Why do you do this cpu_load_update_idle(rq) even this was called with
> CPU_NEWLY_IDLE? Wouldn't it be sufficient to jump to the curr_cost
> calculation in this case?

just to keep thing similar to what happen with kick_ilb and that's an
occasion to also update the cpu_load

>
>> +
>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>> + curr_cost += domain_cost;
>> +
>> }
>> - update_blocked_averages(this_cpu);
>> - has_blocked_load |= this_rq->has_blocked_load;
>> + /* Newly idle CPU doesn't need an update */
>> + if (idle != CPU_NEWLY_IDLE) {
>> + update_blocked_averages(this_cpu);
>> + has_blocked_load |= this_rq->has_blocked_load;
>> + }
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>
>
> [...]

2018-01-30 08:34:03

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 29 January 2018 at 20:31, Valentin Schneider
<[email protected]> wrote:
> Hi Vincent, Peter,
>
> I've been running some tests on your patches (Peter's base + the 2 from
> Vincent). The results themselves are hosted at [1].
> The base of those tests is the same: a task ("accumulator") is ran for 5
> seconds (arbitrary value) to accumulate some load, then goes to sleep for .5
> seconds.
>
> I've set up 3 test scenarios:
>
> Update by nohz_balance_kick()
> -----------------------------
> Right before the "accumulator" task goes to sleep, a CPU-hogging task (100%
> utilization) is spawned on another CPU. It won't go idle so the only way to
> update the blocked load generated by "accumulator" is to kick an ILB
> (NOHZ_STATS_KICK).
>
> The test shows that this is behaving nicely - we keep kicking an ILB every
> ~36ms (see next test for comments on that) until there is no more blocked
> load. I did however notice some interesting scenarios: after the load has
> been fully decayed, a tiny background task can spawn and end in less than a
> scheduling period. However, it still goes through nohz_balance_enter_idle(),
> and thus sets nohz.stats_state, which will later cause an ILB kick.
>
> This makes me wonder if it's worth kicking ILBs for such tiny load values -
> perhaps it could be worth having a margin to set rq->has_blocked_load ?

So it's difficult to know what will be the load/utilization on the
cfs_rq once the cpu wakes up. Even if it's for a really short time,
that's doesn't mean that the load/utilization is small because it can
be the migration of a big task that just have a very short wakes up
this time.
That's why I don't make any assumption on the utilization/load value
when a cpu goes to sleep

>
> Furthermore, this tiny task will cause the ILB to iterate over all of the
> idle CPUs, although only one has stale load. For load update via NEWLY_IDLE
> load_balance() we use:
>
> static bool update_nohz_stats(struct rq *rq)
> {
> if (!rq->has_blocked_load)
> return false;
> [...]
> }
>
> But for load update via _nohz_idle_balance(), we iterate through all of the
> nohz CPUS and unconditionally call update_blocked_averages(). This could be
> avoided by remembering which CPUs have stale load before going idle.
> Initially I thought that was what nohz.stats_state was for, but it isn't.
> With Vincent's patches it's only ever set to either 0 or 1, but we could use
> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
> _nohz_idle_balance() (when NOHZ_STATS_KICK).

I have studied a way to keep track of how many cpus still have blocked
load to try to minimize the number of useless ilb kick but this add
more atomic operations which can impact the system throughput with
heavy load and lot of very small wake up. that's why i have propose
this solution which is more simple. But it's probably just a matter of
where we want to "waste" time. Either we accept to spent a bit more
time to check the state of idle CPUs or we accept to kick ilb from
time to time for no good reason.

>
> Update by idle_balance()
> ------------------------
> Right before the "accumulator" task goes to sleep, a tiny periodic
> (period=32ms) task is spawned on another CPU. It's expected that it will
> update the blocked load in idle_balance(), either by running
> _nohz_idle_balance() locally or kicking an ILB (The overload flag shouldn't
> be set in this test case, so we shouldn't go through the NEWLY_IDLE
> load_balance()).
>
> This also seems to be working fine, but I'm noticing a delay between load
> updates that is closer to 64ms than 32ms. After digging into it I found out
> that the time checks done in idle_balance() and nohz_balancer_kick() are
> time_after(jiffies, next_stats), but IMHO they should be
> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
> explains the 36ms periodicity of the updates in the test above.

I have use the 32ms as a minimum value between update. We must use the
time_after() if we want to have at least 32ms between each update. We
will have a 36ms period if the previous update was triggered by the
tick (just after in fact) but there will be only 32ms if the last
update was done during an idle_balance that happens just before the
tick. With time_after_eq, the update period will between 28 and
32ms.

Then, I mention a possible optimization by using time_after_eq in the
idle_balance() so a newly_idle cpu will have more chance (between 0
and 4ms for hz250) to do the update before a ilb is kicked

Thanks,
Vincent

>
>
> No update (idle system)
> -----------------------
> Nothing special here, just making sure nothing happens when the system is
> fully idle. On a sidenote, that's relatively hard to achieve - I had to
> switch over to Juno because my HiKey960 gets interrupts every 16ms. The Juno
> still gets woken up every now and then but it's a bit quieter.
>
>
> [1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>
>
>
> On 01/24/2018 08:25 AM, Vincent Guittot wrote:
>>
>> Hi,
>>
>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>
>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>
>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]>
>>>>> wrote:
>>>>>>
>>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>>>
>>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible
>>>>>>> it
>>>>>>> is before we try and do funny things.
>>>>>>
>>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>>> completely idle.
>>>>>>
>>>>>> No idea how bad that is..
>>>>>
>>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>>> because i can still see blocked load in the use case i have run.
>>>>> I haven't found the reason yet
>>>>
>>>> Hi Peter,
>>>>
>>>> With the patch below on top of your branch, the blocked loads are
>>>> updated and
>>>> decayed regularly. The main differences are:
>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>> idle.
>>>> The main drawback of this solution is that the load is blocked when
>>>> the
>>>> system is fully idle with the advantage of not waking up a fully idle
>>>> system. We have to wait for the next tick or newly idle event for
>>>> updating
>>>> blocked load when the system leaves idle stat which can be up to a
>>>> tick long.
>>>> If this is too long, we can check for kicking ilb when task wakes up
>>>> so the
>>>> blocked load will be updated as soon as the system leaves idle state.
>>>> The main advantage is that we don't wake up a fully idle system every
>>>> 32ms to
>>>> update blocked load that will be not used.
>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>> newly
>>>> idle case when the system is not overloaded and
>>>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>> can try to
>>>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>> exceed
>>>> this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>> wake up
>>>> of an idle cpus.
>>>
>>> This sound like what I meant in my other reply :-)
>>>
>>> It seems pointless to have a timer to update PELT if the system is
>>> completely idle, and when it isn't we can piggy back other events to
>>> make the updates happen.
>>
>> The patch below implements what has been described above. It calls part of
>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>> much
>> time. This removes part of ilb that are kicked on an idle cpu for updating
>> the blocked load but the ratio really depends on when the tick happens
>> compared
>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>> that
>> enables to update the blocked loads when a cpu becomes idle 1 period
>> before
>> kicking an ilb and there is far less ilb because we give more chance to
>> the
>> newly idle case (time_after is replaced by time_after_eq in
>> idle_balance()).
>>
>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>> This
>> reduce significantly the number of update of blocked load. the *_avg will
>> be
>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>> have
>> a higher resolution and we can easily reach almost seconds. But only the
>> *_avg
>> are used to make decision so keeping some blocked *_sum is acceptable.
>>
>> ---
>> kernel/sched/fair.c | 121
>> +++++++++++++++++++++++++++++++++++++++-------------
>> 1 file changed, 92 insertions(+), 29 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>> *cfs_rq)
>> return true;
>> }
>> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> + if (cfs_rq->avg.load_avg)
>> + return true;
>> +
>> + if (cfs_rq->avg.util_avg)
>> + return true;
>> +
>> + return false;
>> +}
>> +
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>> static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> - else
>> +
>> + /* Don't need periodic decay once load/util_avg are null
>> */
>> + if (cfs_rq_has_blocked(cfs_rq))
>> done = false;
>> }
>> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>> cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> - if (cfs_rq_is_decayed(cfs_rq))
>> + if (cfs_rq_has_blocked(cfs_rq))
>> rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd,
>> unsigned long *next_balance)
>> *next_balance = next;
>> }
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> enum cpu_idle_type idle);
>> static void kick_ilb(unsigned int flags);
>> /*
>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct
>> rq_flags *rf)
>> update_next_balance(sd, &next_balance);
>> rcu_read_unlock();
>> - if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state))
>> + /*
>> + * Update blocked idle load if it has not been done for a
>> + * while. Try to do it locally before entering idle but
>> kick a
>> + * ilb if it takes too much time and might delay next
>> local
>> + * wake up
>> + */
>> + if (time_after(jiffies, next) &&
>> atomic_read(&nohz.stats_state) &&
>> + !_nohz_idle_balance(this_rq,
>> NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>> kick_ilb(NOHZ_STATS_KICK);
>> goto out;
>> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
>> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>> return;
>> + rq->has_blocked_load = 1;
>> if (rq->nohz_tick_stopped)
>> return;
>> @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
>> return;
>> rq->nohz_tick_stopped = 1;
>> - rq->has_blocked_load = 1;
>> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>> atomic_inc(&nohz.nr_cpus);
>> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
>> * enable the periodic update of the load of idle cpus
>> */
>> atomic_set(&nohz.stats_state, 1);
>> -
>> }
>> #else
>> static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum
>> cpu_idle_type idle)
>> #ifdef CONFIG_NO_HZ_COMMON
>> /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load
>> balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and
>> return
>> + * false to notify that the loop has not be completed and a ilb shoud be
>> kick.
>> */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>> enum cpu_idle_type idle)
>> {
>> /* Earliest time when we have to do rebalance again */
>> unsigned long now = jiffies;
>> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> bool has_blocked_load = false;
>> int update_next_balance = 0;
>> int this_cpu = this_rq->cpu;
>> - unsigned int flags;
>> int balance_cpu;
>> + int ret = false;
>> struct rq *rq;
>> -
>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> - return false;
>> -
>> - if (idle != CPU_IDLE) {
>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> - return false;
>> - }
>> -
>> - /*
>> - * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> - */
>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> - if (!(flags & NOHZ_KICK_MASK))
>> - return false;
>> + u64 curr_cost = 0;
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>> @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> atomic_set(&nohz.stats_state, 0);
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> + u64 t0, domain_cost;
>> +
>> + t0 = sched_clock_cpu(this_cpu);
>> +
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> */
>> if (need_resched()) {
>> has_blocked_load = true;
>> - break;
>> + goto abort;
>> + }
>> +
>> + /*
>> + * If the update is done while CPU becomes idle, we abort
>> + * the update when its cost is higher than the average
>> idle
>> + * time in orde to not delay a possible wake up.
>> + */
>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>> curr_cost) {
>> + has_blocked_load = true;
>> + goto abort;
>> }
>> rq = cpu_rq(balance_cpu);
>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> if (time_after_eq(jiffies, rq->next_balance)) {
>> struct rq_flags rf;
>> - rq_lock_irq(rq, &rf);
>> + rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> cpu_load_update_idle(rq);
>> - rq_unlock_irq(rq, &rf);
>> + rq_unlock_irqrestore(rq, &rf);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> next_balance = rq->next_balance;
>> update_next_balance = 1;
>> }
>> +
>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>> + curr_cost += domain_cost;
>> +
>> }
>> - update_blocked_averages(this_cpu);
>> - has_blocked_load |= this_rq->has_blocked_load;
>> + /* Newly idle CPU doesn't need an update */
>> + if (idle != CPU_NEWLY_IDLE) {
>> + update_blocked_averages(this_cpu);
>> + has_blocked_load |= this_rq->has_blocked_load;
>> + }
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> WRITE_ONCE(nohz.next_stats,
>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> + /* The full idle balance loop has been done */
>> + ret = true;
>> +
>> +abort:
>> /* There is still blocked load, enable periodic update */
>> if (has_blocked_load)
>> atomic_set(&nohz.stats_state, 1);
>> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>> if (likely(update_next_balance))
>> nohz.next_balance = next_balance;
>> + return ret;
>> +}
>> +
>> +/*
>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + */
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle)
>> +{
>> + int this_cpu = this_rq->cpu;
>> + unsigned int flags;
>> +
>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + if (idle != CPU_IDLE) {
>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + return false;
>> + }
>> +
>> + /*
>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> + */
>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + if (!(flags & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + _nohz_idle_balance(this_rq, flags, idle);
>> +
>> return true;
>> }
>> #else
>
>

2018-01-30 11:42:26

by Valentin Schneider

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

(Resending because I snuck in some HTML... Apologies)

On 01/30/2018 08:32 AM, Vincent Guittot wrote:
> On 29 January 2018 at 20:31, Valentin Schneider
> <[email protected]> wrote:
>> Hi Vincent, Peter,
>>
>> I've been running some tests on your patches (Peter's base + the 2 from
>> Vincent). The results themselves are hosted at [1].
>> The base of those tests is the same: a task ("accumulator") is ran for 5
>> seconds (arbitrary value) to accumulate some load, then goes to sleep for .5
>> seconds.
>>
>> I've set up 3 test scenarios:
>>
>> Update by nohz_balance_kick()
>> -----------------------------
>> Right before the "accumulator" task goes to sleep, a CPU-hogging task (100%
>> utilization) is spawned on another CPU. It won't go idle so the only way to
>> update the blocked load generated by "accumulator" is to kick an ILB
>> (NOHZ_STATS_KICK).
>>
>> The test shows that this is behaving nicely - we keep kicking an ILB every
>> ~36ms (see next test for comments on that) until there is no more blocked
>> load. I did however notice some interesting scenarios: after the load has
>> been fully decayed, a tiny background task can spawn and end in less than a
>> scheduling period. However, it still goes through nohz_balance_enter_idle(),
>> and thus sets nohz.stats_state, which will later cause an ILB kick.
>>
>> This makes me wonder if it's worth kicking ILBs for such tiny load values -
>> perhaps it could be worth having a margin to set rq->has_blocked_load ?
>
> So it's difficult to know what will be the load/utilization on the
> cfs_rq once the cpu wakes up. Even if it's for a really short time,
> that's doesn't mean that the load/utilization is small because it can
> be the migration of a big task that just have a very short wakes up
> this time.
> That's why I don't make any assumption on the utilization/load value
> when a cpu goes to sleep
>

Right, hadn't thought about those kind of migrations.

>>
>> Furthermore, this tiny task will cause the ILB to iterate over all of the
>> idle CPUs, although only one has stale load. For load update via NEWLY_IDLE
>> load_balance() we use:
>>
>> static bool update_nohz_stats(struct rq *rq)
>> {
>> if (!rq->has_blocked_load)
>> return false;
>> [...]
>> }
>>
>> But for load update via _nohz_idle_balance(), we iterate through all of the
>> nohz CPUS and unconditionally call update_blocked_averages(). This could be
>> avoided by remembering which CPUs have stale load before going idle.
>> Initially I thought that was what nohz.stats_state was for, but it isn't.
>> With Vincent's patches it's only ever set to either 0 or 1, but we could use
>> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
>> _nohz_idle_balance() (when NOHZ_STATS_KICK).
>
> I have studied a way to keep track of how many cpus still have blocked
> load to try to minimize the number of useless ilb kick but this add
> more atomic operations which can impact the system throughput with
> heavy load and lot of very small wake up. that's why i have propose
> this solution which is more simple. But it's probably just a matter of
> where we want to "waste" time. Either we accept to spent a bit more
> time to check the state of idle CPUs or we accept to kick ilb from
> time to time for no good reason.
>

Agreed. I have the feeling that spending more time doing atomic ops
could be worth it - I'll try to test this out and see if it's actually
relevant.

>>
>> Update by idle_balance()
>> ------------------------
>> Right before the "accumulator" task goes to sleep, a tiny periodic
>> (period=32ms) task is spawned on another CPU. It's expected that it will
>> update the blocked load in idle_balance(), either by running
>> _nohz_idle_balance() locally or kicking an ILB (The overload flag shouldn't
>> be set in this test case, so we shouldn't go through the NEWLY_IDLE
>> load_balance()).
>>
>> This also seems to be working fine, but I'm noticing a delay between load
>> updates that is closer to 64ms than 32ms. After digging into it I found out
>> that the time checks done in idle_balance() and nohz_balancer_kick() are
>> time_after(jiffies, next_stats), but IMHO they should be
>> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
>> explains the 36ms periodicity of the updates in the test above.
>
> I have use the 32ms as a minimum value between update. We must use the
> time_after() if we want to have at least 32ms between each update. We
> will have a 36ms period if the previous update was triggered by the
> tick (just after in fact) but there will be only 32ms if the last
> update was done during an idle_balance that happens just before the
> tick. With time_after_eq, the update period will between 28 and
> 32ms.
>
> Then, I mention a possible optimization by using time_after_eq in the
> idle_balance() so a newly_idle cpu will have more chance (between 0
> and 4ms for hz250) to do the update before a ilb is kicked
>

IIUC with time_after() the update period should be within ]32, 36] ms,
but it looks like I'm always on that upper bound in my tests.

When evaluating whether we need to kick_ilb() for load updates, we'll
always be right after the tick (excluding the case in idle_balance),
which explains why we wait for an extra tick in the "update by
nohz_balancer_kick()" test case.

The tricky part is that, as you say, the update by idle_balance() can
happen anywhere between [0-4[ ms after a tick (or before, depending on
how you see it), so using time_after_eq could make the update period <
32ms - and this also impacts a load update by nohz_balance_kick() if the
previous update was done by idle_balance()... This is what causes the
update period to be closer to 64ms in my test case, but it's somewhat
artificial because I only have a 32ms-periodic task running - if there
was any other task running the period could remain in that ]32, 36] ms
interval.

Did I get that right ?

> Thanks,
> Vincent
>
>>
>>
>> No update (idle system)
>> -----------------------
>> Nothing special here, just making sure nothing happens when the system is
>> fully idle. On a sidenote, that's relatively hard to achieve - I had to
>> switch over to Juno because my HiKey960 gets interrupts every 16ms. The Juno
>> still gets woken up every now and then but it's a bit quieter.
>>
>>
>> [1]: https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>>
>>
>>
>> On 01/24/2018 08:25 AM, Vincent Guittot wrote:
>>>
>>> Hi,
>>>
>>> Le Thursday 18 Jan 2018 à 10:38:07 (+0000), Morten Rasmussen a écrit :
>>>>
>>>> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>>>>>
>>>>> Le Wednesday 03 Jan 2018 à 10:16:00 (+0100), Vincent Guittot a écrit :
>>>>>>
>>>>>> Hi Peter,
>>>>>>
>>>>>> On 22 December 2017 at 21:42, Peter Zijlstra <[email protected]>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Fri, Dec 22, 2017 at 07:56:29PM +0100, Peter Zijlstra wrote:
>>>>>>>>
>>>>>>>> Right; but I figured we'd try and do it 'right' and see how horrible
>>>>>>>> it
>>>>>>>> is before we try and do funny things.
>>>>>>>
>>>>>>> So now it should have a 32ms tick for up to .5s when the system goes
>>>>>>> completely idle.
>>>>>>>
>>>>>>> No idea how bad that is..
>>>>>>
>>>>>> I have tested your branch but the timer doesn't seem to fire correctly
>>>>>> because i can still see blocked load in the use case i have run.
>>>>>> I haven't found the reason yet
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> With the patch below on top of your branch, the blocked loads are
>>>>> updated and
>>>>> decayed regularly. The main differences are:
>>>>> - It doesn't use a timer to trig ilb but the tick and when a cpu becomes
>>>>> idle.
>>>>> The main drawback of this solution is that the load is blocked when
>>>>> the
>>>>> system is fully idle with the advantage of not waking up a fully idle
>>>>> system. We have to wait for the next tick or newly idle event for
>>>>> updating
>>>>> blocked load when the system leaves idle stat which can be up to a
>>>>> tick long.
>>>>> If this is too long, we can check for kicking ilb when task wakes up
>>>>> so the
>>>>> blocked load will be updated as soon as the system leaves idle state.
>>>>> The main advantage is that we don't wake up a fully idle system every
>>>>> 32ms to
>>>>> update blocked load that will be not used.
>>>>> - I'm working on one more improvement to use nohz_idle_balance in the
>>>>> newly
>>>>> idle case when the system is not overloaded and
>>>>> (this_rq->avg_idle > sysctl_sched_migration_cost). In this case, we
>>>>> can try to
>>>>> use nohz_idle_balance with NOHZ_STATS_KICK and abort as soon as it
>>>>> exceed
>>>>> this_rq->avg_idle. This will remove some calls to kick_ilb and some
>>>>> wake up
>>>>> of an idle cpus.
>>>>
>>>> This sound like what I meant in my other reply :-)
>>>>
>>>> It seems pointless to have a timer to update PELT if the system is
>>>> completely idle, and when it isn't we can piggy back other events to
>>>> make the updates happen.
>>>
>>> The patch below implements what has been described above. It calls part of
>>> nohz_idle_balance when a cpu becomes idle and kick a ilb if it takes too
>>> much
>>> time. This removes part of ilb that are kicked on an idle cpu for updating
>>> the blocked load but the ratio really depends on when the tick happens
>>> compared
>>> to a cpu becoming idle and the 32ms boundary. I have an additionnal patch
>>> that
>>> enables to update the blocked loads when a cpu becomes idle 1 period
>>> before
>>> kicking an ilb and there is far less ilb because we give more chance to
>>> the
>>> newly idle case (time_after is replaced by time_after_eq in
>>> idle_balance()).
>>>
>>> The patch also uses a function cfs_rq_has_blocked, which only checks the
>>> util/load_avg, instead of the cfs_rq_is_decayed which check *_sum too.
>>> This
>>> reduce significantly the number of update of blocked load. the *_avg will
>>> be
>>> fully decayed in around 300~400ms but it's far longer for the *_sum which
>>> have
>>> a higher resolution and we can easily reach almost seconds. But only the
>>> *_avg
>>> are used to make decision so keeping some blocked *_sum is acceptable.
>>>
>>> ---
>>> kernel/sched/fair.c | 121
>>> +++++++++++++++++++++++++++++++++++++++-------------
>>> 1 file changed, 92 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 898785d..ed90303 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq
>>> *cfs_rq)
>>> return true;
>>> }
>>> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>>> +{
>>> + if (cfs_rq->avg.load_avg)
>>> + return true;
>>> +
>>> + if (cfs_rq->avg.util_avg)
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> #ifdef CONFIG_FAIR_GROUP_SCHED
>>> static void update_blocked_averages(int cpu)
>>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>>> */
>>> if (cfs_rq_is_decayed(cfs_rq))
>>> list_del_leaf_cfs_rq(cfs_rq);
>>> - else
>>> +
>>> + /* Don't need periodic decay once load/util_avg are null
>>> */
>>> + if (cfs_rq_has_blocked(cfs_rq))
>>> done = false;
>>> }
>>> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int
>>> cpu)
>>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>>> #ifdef CONFIG_NO_HZ_COMMON
>>> rq->last_blocked_load_update_tick = jiffies;
>>> - if (cfs_rq_is_decayed(cfs_rq))
>>> + if (cfs_rq_has_blocked(cfs_rq))
>>> rq->has_blocked_load = 0;
>>> #endif
>>> rq_unlock_irqrestore(rq, &rf);
>>> @@ -8818,6 +8831,7 @@ update_next_balance(struct sched_domain *sd,
>>> unsigned long *next_balance)
>>> *next_balance = next;
>>> }
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> enum cpu_idle_type idle);
>>> static void kick_ilb(unsigned int flags);
>>> /*
>>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct
>>> rq_flags *rf)
>>> update_next_balance(sd, &next_balance);
>>> rcu_read_unlock();
>>> - if (time_after(jiffies, next) &&
>>> atomic_read(&nohz.stats_state))
>>> + /*
>>> + * Update blocked idle load if it has not been done for a
>>> + * while. Try to do it locally before entering idle but
>>> kick a
>>> + * ilb if it takes too much time and might delay next
>>> local
>>> + * wake up
>>> + */
>>> + if (time_after(jiffies, next) &&
>>> atomic_read(&nohz.stats_state) &&
>>> + !_nohz_idle_balance(this_rq,
>>> NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>>> kick_ilb(NOHZ_STATS_KICK);
>>> goto out;
>>> @@ -9237,6 +9258,7 @@ void nohz_balance_enter_idle(int cpu)
>>> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>>> return;
>>> + rq->has_blocked_load = 1;
>>> if (rq->nohz_tick_stopped)
>>> return;
>>> @@ -9247,7 +9269,6 @@ void nohz_balance_enter_idle(int cpu)
>>> return;
>>> rq->nohz_tick_stopped = 1;
>>> - rq->has_blocked_load = 1;
>>> cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
>>> atomic_inc(&nohz.nr_cpus);
>>> @@ -9259,7 +9280,6 @@ void nohz_balance_enter_idle(int cpu)
>>> * enable the periodic update of the load of idle cpus
>>> */
>>> atomic_set(&nohz.stats_state, 1);
>>> -
>>> }
>>> #else
>>> static inline void nohz_balancer_kick(struct rq *rq) { }
>>> @@ -9385,10 +9405,13 @@ static void rebalance_domains(struct rq *rq, enum
>>> cpu_idle_type idle)
>>> #ifdef CONFIG_NO_HZ_COMMON
>>> /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load
>>> balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and
>>> return
>>> + * false to notify that the loop has not be completed and a ilb shoud be
>>> kick.
>>> */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>>> idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
>>> enum cpu_idle_type idle)
>>> {
>>> /* Earliest time when we have to do rebalance again */
>>> unsigned long now = jiffies;
>>> @@ -9396,24 +9419,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> bool has_blocked_load = false;
>>> int update_next_balance = 0;
>>> int this_cpu = this_rq->cpu;
>>> - unsigned int flags;
>>> int balance_cpu;
>>> + int ret = false;
>>> struct rq *rq;
>>> -
>>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> - return false;
>>> -
>>> - if (idle != CPU_IDLE) {
>>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> - return false;
>>> - }
>>> -
>>> - /*
>>> - * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> - */
>>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> - if (!(flags & NOHZ_KICK_MASK))
>>> - return false;
>>> + u64 curr_cost = 0;
>>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>> @@ -9428,6 +9437,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> atomic_set(&nohz.stats_state, 0);
>>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>> + u64 t0, domain_cost;
>>> +
>>> + t0 = sched_clock_cpu(this_cpu);
>>> +
>>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>> continue;
>>> @@ -9438,7 +9451,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> */
>>> if (need_resched()) {
>>> has_blocked_load = true;
>>> - break;
>>> + goto abort;
>>> + }
>>> +
>>> + /*
>>> + * If the update is done while CPU becomes idle, we abort
>>> + * the update when its cost is higher than the average
>>> idle
>>> + * time in orde to not delay a possible wake up.
>>> + */
>>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle <
>>> curr_cost) {
>>> + has_blocked_load = true;
>>> + goto abort;
>>> }
>>> rq = cpu_rq(balance_cpu);
>>> @@ -9453,10 +9476,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> if (time_after_eq(jiffies, rq->next_balance)) {
>>> struct rq_flags rf;
>>> - rq_lock_irq(rq, &rf);
>>> + rq_lock_irqsave(rq, &rf);
>>> update_rq_clock(rq);
>>> cpu_load_update_idle(rq);
>>> - rq_unlock_irq(rq, &rf);
>>> + rq_unlock_irqrestore(rq, &rf);
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(rq, CPU_IDLE);
>>> @@ -9466,10 +9489,17 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> next_balance = rq->next_balance;
>>> update_next_balance = 1;
>>> }
>>> +
>>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>>> + curr_cost += domain_cost;
>>> +
>>> }
>>> - update_blocked_averages(this_cpu);
>>> - has_blocked_load |= this_rq->has_blocked_load;
>>> + /* Newly idle CPU doesn't need an update */
>>> + if (idle != CPU_NEWLY_IDLE) {
>>> + update_blocked_averages(this_cpu);
>>> + has_blocked_load |= this_rq->has_blocked_load;
>>> + }
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(this_rq, CPU_IDLE);
>>> @@ -9477,6 +9507,10 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> WRITE_ONCE(nohz.next_stats,
>>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>> + /* The full idle balance loop has been done */
>>> + ret = true;
>>> +
>>> +abort:
>>> /* There is still blocked load, enable periodic update */
>>> if (has_blocked_load)
>>> atomic_set(&nohz.stats_state, 1);
>>> @@ -9489,6 +9523,35 @@ static bool nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>> if (likely(update_next_balance))
>>> nohz.next_balance = next_balance;
>>> + return ret;
>>> +}
>>> +
>>> +/*
>>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + */
>>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>>> idle)
>>> +{
>>> + int this_cpu = this_rq->cpu;
>>> + unsigned int flags;
>>> +
>>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + if (idle != CPU_IDLE) {
>>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + return false;
>>> + }
>>> +
>>> + /*
>>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> + */
>>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + if (!(flags & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + _nohz_idle_balance(this_rq, flags, idle);
>>> +
>>> return true;
>>> }
>>> #else
>>
>>

2018-01-30 13:07:06

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 30 January 2018 at 12:41, Valentin Schneider
<[email protected]> wrote:
> (Resending because I snuck in some HTML... Apologies)
>
> On 01/30/2018 08:32 AM, Vincent Guittot wrote:
>>
>> On 29 January 2018 at 20:31, Valentin Schneider
>> <[email protected]> wrote:
>>>
>>> Hi Vincent, Peter,
>>>
>>> I've been running some tests on your patches (Peter's base + the 2 from
>>> Vincent). The results themselves are hosted at [1].
>>> The base of those tests is the same: a task ("accumulator") is ran for 5
>>> seconds (arbitrary value) to accumulate some load, then goes to sleep for
>>> .5
>>> seconds.
>>>
>>> I've set up 3 test scenarios:
>>>
>>> Update by nohz_balance_kick()
>>> -----------------------------
>>> Right before the "accumulator" task goes to sleep, a CPU-hogging task
>>> (100%
>>> utilization) is spawned on another CPU. It won't go idle so the only way
>>> to
>>> update the blocked load generated by "accumulator" is to kick an ILB
>>> (NOHZ_STATS_KICK).
>>>
>>> The test shows that this is behaving nicely - we keep kicking an ILB
>>> every
>>> ~36ms (see next test for comments on that) until there is no more blocked
>>> load. I did however notice some interesting scenarios: after the load has
>>> been fully decayed, a tiny background task can spawn and end in less than
>>> a
>>> scheduling period. However, it still goes through
>>> nohz_balance_enter_idle(),
>>> and thus sets nohz.stats_state, which will later cause an ILB kick.
>>>
>>> This makes me wonder if it's worth kicking ILBs for such tiny load values
>>> -
>>> perhaps it could be worth having a margin to set rq->has_blocked_load ?
>>
>>
>> So it's difficult to know what will be the load/utilization on the
>> cfs_rq once the cpu wakes up. Even if it's for a really short time,
>> that's doesn't mean that the load/utilization is small because it can
>> be the migration of a big task that just have a very short wakes up
>> this time.
>> That's why I don't make any assumption on the utilization/load value
>> when a cpu goes to sleep
>>
>
> Right, hadn't thought about those kind of migrations.
>
>>>
>>> Furthermore, this tiny task will cause the ILB to iterate over all of the
>>> idle CPUs, although only one has stale load. For load update via
>>> NEWLY_IDLE
>>> load_balance() we use:
>>>
>>> static bool update_nohz_stats(struct rq *rq)
>>> {
>>> if (!rq->has_blocked_load)
>>> return false;
>>> [...]
>>> }
>>>
>>> But for load update via _nohz_idle_balance(), we iterate through all of
>>> the
>>> nohz CPUS and unconditionally call update_blocked_averages(). This could
>>> be
>>> avoided by remembering which CPUs have stale load before going idle.
>>> Initially I thought that was what nohz.stats_state was for, but it isn't.
>>> With Vincent's patches it's only ever set to either 0 or 1, but we could
>>> use
>>> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load
>>> in
>>> _nohz_idle_balance() (when NOHZ_STATS_KICK).
>>
>>
>> I have studied a way to keep track of how many cpus still have blocked
>> load to try to minimize the number of useless ilb kick but this add
>> more atomic operations which can impact the system throughput with
>> heavy load and lot of very small wake up. that's why i have propose
>> this solution which is more simple. But it's probably just a matter of
>> where we want to "waste" time. Either we accept to spent a bit more
>> time to check the state of idle CPUs or we accept to kick ilb from
>> time to time for no good reason.
>>
>
> Agreed. I have the feeling that spending more time doing atomic ops could be
> worth it - I'll try to test this out and see if it's actually relevant.
>
>>>
>>> Update by idle_balance()
>>> ------------------------
>>> Right before the "accumulator" task goes to sleep, a tiny periodic
>>> (period=32ms) task is spawned on another CPU. It's expected that it will
>>> update the blocked load in idle_balance(), either by running
>>> _nohz_idle_balance() locally or kicking an ILB (The overload flag
>>> shouldn't
>>> be set in this test case, so we shouldn't go through the NEWLY_IDLE
>>> load_balance()).
>>>
>>> This also seems to be working fine, but I'm noticing a delay between load
>>> updates that is closer to 64ms than 32ms. After digging into it I found
>>> out
>>> that the time checks done in idle_balance() and nohz_balancer_kick() are
>>> time_after(jiffies, next_stats), but IMHO they should be
>>> time_after_eq(jiffies, next_stats) to have 32ms-based updates. This also
>>> explains the 36ms periodicity of the updates in the test above.
>>
>>
>> I have use the 32ms as a minimum value between update. We must use the
>> time_after() if we want to have at least 32ms between each update. We
>> will have a 36ms period if the previous update was triggered by the
>> tick (just after in fact) but there will be only 32ms if the last
>> update was done during an idle_balance that happens just before the
>> tick. With time_after_eq, the update period will between 28 and
>> 32ms.
>>
>> Then, I mention a possible optimization by using time_after_eq in the
>> idle_balance() so a newly_idle cpu will have more chance (between 0
>> and 4ms for hz250) to do the update before a ilb is kicked
>>
>
> IIUC with time_after() the update period should be within ]32, 36] ms, but
> it looks like I'm always on that upper bound in my tests.
>
> When evaluating whether we need to kick_ilb() for load updates, we'll always
> be right after the tick (excluding the case in idle_balance), which explains
> why we wait for an extra tick in the "update by nohz_balancer_kick()" test
> case.
>
> The tricky part is that, as you say, the update by idle_balance() can happen
> anywhere between [0-4[ ms after a tick (or before, depending on how you see
> it), so using time_after_eq could make the update period < 32ms - and this
> also impacts a load update by nohz_balance_kick() if the previous update was
> done by idle_balance()... This is what causes the update period to be closer
> to 64ms in my test case, but it's somewhat artificial because I only have a
> 32ms-periodic task running - if there was any other task running the period
> could remain in that ]32, 36] ms interval.
>
> Did I get that right ?

yes

>
>> Thanks,
>> Vincent
>>
>>>
>>>
>>> No update (idle system)
>>> -----------------------
>>> Nothing special here, just making sure nothing happens when the system is
>>> fully idle. On a sidenote, that's relatively hard to achieve - I had to
>>> switch over to Juno because my HiKey960 gets interrupts every 16ms. The
>>> Juno
>>> still gets woken up every now and then but it's a bit quieter.
>>>
>>>
>>> [1]:
>>> https://gist.github.com/valschneider/a8da7bb8e11fb1ec63a419710f56c0a0
>>>
>>>
>>>


[snip]

2018-02-01 16:54:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:

Would've probably been easier to read if you'd not included the revert
of that timer patch...

> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
> set_cpu_sd_state_idle(cpu);
>
> /*
> - * implies a barrier such that if the stats_state update is observed
> - * the above updates are also visible. Pairs with stuff in
> - * update_sd_lb_stats() and nohz_idle_balance().
> + * Each time a cpu enter idle, we assume that it has blocked load and
> + * enable the periodic update of the load of idle cpus
> */
> - val = atomic_read(&nohz.stats_state);
> - do {
> - new = val + 2;
> - new |= 1;
> - } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
> + atomic_set(&nohz.stats_state, 1);
>

> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> return false;
> }
>
> - stats_seq = atomic_read(&nohz.stats_state);
> /*
> * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> */
> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the stats state. If a cpu enters idle in the mean time, it will
> + * set the stats state and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the stats state, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + atomic_set(&nohz.stats_state, 0);
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> + if (need_resched()) {
> + has_blocked_load = true;
> break;
> + }
>
> rq = cpu_rq(balance_cpu);
>
> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - if (has_blocked_load ||
> - !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
> - WRITE_ONCE(nohz.next_stats,
> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> - mod_timer(&nohz.timer, nohz.next_stats);
> - }
> + WRITE_ONCE(nohz.next_stats,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + atomic_set(&nohz.stats_state, 1);
>
> /*
> * next_balance will be updated only when there is a need.

After this there is no point for stats_state to be atomic anymore. Also
a better name.

Maybe if I drop the last two patches (and you re-introduce the bits
from: Subject: sched: Optimize nohz stats, that you do need) this all
becomes more readable?

2018-02-01 16:57:21

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Thu, Jan 18, 2018 at 10:38:07AM +0000, Morten Rasmussen wrote:
> It seems pointless to have a timer to update PELT if the system is
> completely idle, and when it isn't we can piggy back other events to
> make the updates happen.

Only if we do that update before making decisions based on the values.
The thing I was bothered by in the earlier patches was that wakeup would
use whatever current value and async kick something to go update.

That just seems wrong.

2018-02-01 16:58:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 898785d..ed90303 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
> +{
> + if (cfs_rq->avg.load_avg)
> + return true;
> +
> + if (cfs_rq->avg.util_avg)
> + return true;
> +
> + return false;
> +}
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
>
> static void update_blocked_averages(int cpu)
> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> - else
> +
> + /* Don't need periodic decay once load/util_avg are null */
> + if (cfs_rq_has_blocked(cfs_rq))
> done = false;
> }
>
> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> - if (cfs_rq_is_decayed(cfs_rq))
> + if (cfs_rq_has_blocked(cfs_rq))
> rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);

OK makes sense; would've been even better as a separate patch :-)

2018-02-01 17:28:36

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 1 February 2018 at 17:57, Peter Zijlstra <[email protected]> wrote:
> On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 898785d..ed90303 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7356,6 +7356,17 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> return true;
>> }
>>
>> +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>> +{
>> + if (cfs_rq->avg.load_avg)
>> + return true;
>> +
>> + if (cfs_rq->avg.util_avg)
>> + return true;
>> +
>> + return false;
>> +}
>> +
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>>
>> static void update_blocked_averages(int cpu)
>> @@ -7393,7 +7404,9 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> - else
>> +
>> + /* Don't need periodic decay once load/util_avg are null */
>> + if (cfs_rq_has_blocked(cfs_rq))
>> done = false;
>> }
>>
>> @@ -7463,7 +7476,7 @@ static inline void update_blocked_averages(int cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> - if (cfs_rq_is_decayed(cfs_rq))
>> + if (cfs_rq_has_blocked(cfs_rq))
>> rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>
> OK makes sense; would've been even better as a separate patch :-)

Yes i will make a separate patch for that

2018-02-01 17:28:40

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 1 February 2018 at 17:52, Peter Zijlstra <[email protected]> wrote:
> On Mon, Jan 15, 2018 at 09:26:09AM +0100, Vincent Guittot wrote:
>
> Would've probably been easier to read if you'd not included the revert
> of that timer patch...
>
>> @@ -9258,21 +9255,11 @@ void nohz_balance_enter_idle(int cpu)
>> set_cpu_sd_state_idle(cpu);
>>
>> /*
>> - * implies a barrier such that if the stats_state update is observed
>> - * the above updates are also visible. Pairs with stuff in
>> - * update_sd_lb_stats() and nohz_idle_balance().
>> + * Each time a cpu enter idle, we assume that it has blocked load and
>> + * enable the periodic update of the load of idle cpus
>> */
>> - val = atomic_read(&nohz.stats_state);
>> - do {
>> - new = val + 2;
>> - new |= 1;
>> - } while (!atomic_try_cmpxchg(&nohz.stats_state, &val, new));
>> + atomic_set(&nohz.stats_state, 1);
>>
>
>> @@ -9422,7 +9408,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> return false;
>> }
>>
>> - stats_seq = atomic_read(&nohz.stats_state);
>> /*
>> * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> */
>> @@ -9432,6 +9417,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> + /*
>> + * We assume there will be no idle load after this update and clear
>> + * the stats state. If a cpu enters idle in the mean time, it will
>> + * set the stats state and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> + * setting the stats state, we are sure to not clear the state and not
>> + * check the load of an idle cpu.
>> + */
>> + atomic_set(&nohz.stats_state, 0);
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9441,8 +9436,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> break;
>> + }
>>
>> rq = cpu_rq(balance_cpu);
>>
>> @@ -9477,12 +9474,12 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> - if (has_blocked_load ||
>> - !atomic_try_cmpxchg(&nohz.stats_state, &stats_seq, 0)) {
>> - WRITE_ONCE(nohz.next_stats,
>> - now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> - mod_timer(&nohz.timer, nohz.next_stats);
>> - }
>> + WRITE_ONCE(nohz.next_stats,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + atomic_set(&nohz.stats_state, 1);
>>
>> /*
>> * next_balance will be updated only when there is a need.
>
> After this there is no point for stats_state to be atomic anymore. Also
> a better name.

Ok

>
> Maybe if I drop the last two patches (and you re-introduce the bits
> from: Subject: sched: Optimize nohz stats, that you do need) this all
> becomes more readable?

Yes. we can do like that

2018-02-01 18:11:54

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
>
> - if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
> + /*
> + * Update blocked idle load if it has not been done for a
> + * while. Try to do it locally before entering idle but kick a
> + * ilb if it takes too much time and might delay next local
> + * wake up
> + */
> + if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
> kick_ilb(NOHZ_STATS_KICK);
>
> goto out;

This I really dislike. We're here because avg_idle is _really_ low, we
really should not then call _nohz_idle_balance().

2018-02-01 18:18:47

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On Mon, Jan 29, 2018 at 07:31:07PM +0000, Valentin Schneider wrote:

> But for load update via _nohz_idle_balance(), we iterate through all of the
> nohz CPUS and unconditionally call update_blocked_averages(). This could be
> avoided by remembering which CPUs have stale load before going idle.
> Initially I thought that was what nohz.stats_state was for, but it isn't.
> With Vincent's patches it's only ever set to either 0 or 1, but we could use
> it as a CPU mask, and use it to skip nohz CPUs that don't have stale load in
> _nohz_idle_balance() (when NOHZ_STATS_KICK).

Yes, you'd need to allocate a second cpumask, worse you need atomic
bitops to set and clear bits there.

That all _might_ be worth it... dunno.


2018-02-01 19:31:24

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 1 February 2018 at 19:10, Peter Zijlstra <[email protected]> wrote:
> On Wed, Jan 24, 2018 at 09:25:36AM +0100, Vincent Guittot wrote:
>> @@ -8861,7 +8875,14 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>> update_next_balance(sd, &next_balance);
>> rcu_read_unlock();
>>
>> - if (time_after(jiffies, next) && atomic_read(&nohz.stats_state))
>> + /*
>> + * Update blocked idle load if it has not been done for a
>> + * while. Try to do it locally before entering idle but kick a
>> + * ilb if it takes too much time and might delay next local
>> + * wake up
>> + */
>> + if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
>> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
>> kick_ilb(NOHZ_STATS_KICK);
>>
>> goto out;
>
> This I really dislike. We're here because avg_idle is _really_ low, we
> really should not then call _nohz_idle_balance().

Yes. In fact I was targeting the case were (this_rq->avg_idle >=
sysctl_sched_migration_cost) and the system is not overloaded

2018-02-05 22:19:41

by Valentin Schneider

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 01/30/2018 11:41 AM, Valentin Schneider wrote:
> [...]
>> I have studied a way to keep track of how many cpus still have blocked
>> load to try to minimize the number of useless ilb kick but this add
>> more atomic operations which can impact the system throughput with
>> heavy load and lot of very small wake up. that's why i have propose
>> this solution which is more simple. But it's probably just a matter of
>> where we want to "waste" time. Either we accept to spent a bit more
>> time to check the state of idle CPUs or we accept to kick ilb from
>> time to time for no good reason.
>>
>
> Agreed. I have the feeling that spending more time doing atomic ops could be worth it - I'll try to test this out and see if it's actually relevant.
>

I gave this a spin, still using Vincent's patches with the included patch on top. Nothing too clever, just seeing how replacing nohz.stats_state with a cpumask would go.

I've replaced nohz.stats_state by nohz.stale_cpus_mask. I kept changes minimal - there are some places where I think nohz.idle_cpus_mask could be substituted by nohz.stale_cpus_mask. Speaking about that, I was about to write a comment regarding the fact that nohz.stale_cpus_mask should be a subset of nohz.idle_cpus_mask, but I realized it's actually not true:

In the current implementation (cpumask or stats_state flag), an idle CPU is defined as having blocked load as soon as it goes through nohz_balance_enter_idle(), and that flag is cleared when we go through _nohz_idle_balance() (and newly idle load balance in my cpumask implementation).
However we can imagine a scenario where a CPU goes idle, is flagged as having blocked load, then it wakes up and goes through its periodic balance code and updates that load. Yet, the flag (or cpumask) won't have been updated.
So I think there could be a discussion on whether the flag should be cleared on nohz_balance_exit_idle() if we assume periodic balance now takes care of this. It could cause issues if we have a weird scenario where a CPU keeps going online/idle but never stays online long enough for a tick though.
Alternatively we could clear that flag when going through the first periodic balance after idling, but then that's a bit weird because we're using a nohz structure in a non-nohz context.


Anyway, I tried to get some profiling done with the cpumask but there's something wrong with my setup, I would only get nonsense numbers (for both baseline & my patch), so I added start/end trace_printks to _nohz_idle_balance(). It's ugly, inaccurate and unorthodox but it still gives a rough idea of how the cpumask impacts stuff.
I ran 20 iterations of my "nohz update" test case (a task accumulates load, goes to sleep, and another always-running task keeps kicking an ILB to decay that blocked load) and the time saved by skipping CPUs is in the ballpark of 20%. Notebook is at [1].

I'll try to get a proper function profiling working for when Vincent posts his "v2".

[1]: https://gist.github.com/valschneider/6f203143bee1e149f24c44e9582a9eff

---
kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++-------------------------
1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ada92b..8bcf465 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5404,8 +5404,8 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)

static struct {
cpumask_var_t idle_cpus_mask;
+ cpumask_var_t stale_cpus_mask;
atomic_t nr_cpus;
- atomic_t stats_state;
unsigned long next_balance; /* in jiffy units */
unsigned long next_stats;
} nohz ____cacheline_aligned;
@@ -6968,7 +6968,6 @@ enum fbq_type { regular, remote, all };
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20

struct lb_env {
struct sched_domain *sd;
@@ -7829,25 +7828,25 @@ group_type group_classify(struct sched_group *group,
return group_other;
}

-static bool update_nohz_stats(struct rq *rq)
+static void update_nohz_stats(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;

- if (!rq->has_blocked_load)
- return false;
-
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return false;
+ if (!cpumask_test_cpu(cpu, nohz.stale_cpus_mask))
+ return;

if (!time_after(jiffies, rq->last_blocked_load_update_tick))
- return true;
+ return;

update_blocked_averages(cpu);

- return rq->has_blocked_load;
+ if (rq->has_blocked_load)
+ return;
+
+ cpumask_clear_cpu(cpu, nohz.stale_cpus_mask);
#else
- return false;
+ return;
#endif
}

@@ -7873,8 +7872,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);

- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
- env->flags |= LBF_NOHZ_AGAIN;
+ if (env->flags & LBF_NOHZ_STATS)
+ update_nohz_stats(rq);

/* Bias balancing toward cpus of our domain */
if (local_group)
@@ -8032,7 +8031,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
prefer_sibling = 1;

#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && atomic_read(&nohz.stats_state)) {
+ if (env->idle == CPU_NEWLY_IDLE &&
+ cpumask_intersects(sched_domain_span(env->sd), nohz.stale_cpus_mask)) {
env->flags |= LBF_NOHZ_STATS;
}
#endif
@@ -8091,8 +8091,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
} while (sg != env->sd->groups);

#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+ /*
+ * All nohz CPUs with blocked load were visited but some haven't fully
+ * decayed. Visit them again later.
+ */
+ if ((env->flags & LBF_NOHZ_STATS) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)) &&
+ !cpumask_empty(nohz.stale_cpus_mask)) {

WRITE_ONCE(nohz.next_stats,
jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
@@ -8897,7 +8902,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
* ilb if it takes too much time and might delay next local
* wake up
*/
- if (time_after(jiffies, next) && atomic_read(&nohz.stats_state) &&
+ if (time_after(jiffies, next) && !cpumask_empty(nohz.stale_cpus_mask) &&
!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);

@@ -9153,7 +9158,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;

- if (time_after(now, nohz.next_stats) && atomic_read(&nohz.stats_state))
+ if (time_after(now, nohz.next_stats) && !cpumask_empty(nohz.stale_cpus_mask))
flags = NOHZ_STATS_KICK;

if (time_before(now, nohz.next_balance))
@@ -9292,10 +9297,10 @@ void nohz_balance_enter_idle(int cpu)
set_cpu_sd_state_idle(cpu);

/*
- * Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * Each time a cpu enters idle, we assume that it has blocked load and
+ * thus enable the periodic update of its load
*/
- atomic_set(&nohz.stats_state, 1);
+ cpumask_set_cpu(cpu, nohz.stale_cpus_mask);
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9432,7 +9437,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
- bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
@@ -9450,7 +9454,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
* setting the stats state, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- atomic_set(&nohz.stats_state, 0);

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
u64 t0, domain_cost;
@@ -9460,30 +9463,31 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;

+ if ((flags & NOHZ_KICK_MASK) == NOHZ_STATS_KICK &&
+ !cpumask_test_cpu(balance_cpu, nohz.stale_cpus_mask))
+ continue;
+
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
- has_blocked_load = true;
+ if (need_resched())
goto abort;
- }

/*
* If the update is done while CPU becomes idle, we abort
* the update when its cost is higher than the average idle
* time in orde to not delay a possible wake up.
*/
- if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
- has_blocked_load = true;
+ if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost)
goto abort;
- }

rq = cpu_rq(balance_cpu);

update_blocked_averages(rq->cpu);
- has_blocked_load |= rq->has_blocked_load;
+ if (!rq->has_blocked_load)
+ cpumask_clear_cpu(balance_cpu, nohz.stale_cpus_mask);

/*
* If time for next balance is due,
@@ -9514,7 +9518,9 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
+ if (!this_rq->has_blocked_load)
+ cpumask_clear_cpu(this_cpu, nohz.stale_cpus_mask);
+
}

if (flags & NOHZ_BALANCE_KICK)
@@ -9527,9 +9533,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_
ret = true;

abort:
- /* There is still blocked load, enable periodic update */
- if (has_blocked_load)
- atomic_set(&nohz.stats_state, 1);

/*
* next_balance will be updated only when there is a need.
@@ -10190,6 +10193,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_stats = jiffies;
+ zalloc_cpumask_var(&nohz.stale_cpus_mask, GFP_NOWAIT);
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
--
2.7.4

2018-02-06 08:33:29

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH 1/3] sched: Stop nohz stats when decayed

Stopped the periodic update of blocked load when all idle CPUs have fully
decayed. We introduce a new nohz.has_blocked that reflect if some idle
CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
is set everytime that a Idle CPU can have blocked load and it is then clear
when no more blocked load has been detected during an update. We don't need
atomic operation but only to make cure of the right ordering when updating
nohz.idle_cpus_mask and nohz.has_blocked.

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++-----------
kernel/sched/sched.h | 1 +
2 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7af1fa9..279f4b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
unsigned long next_balance; /* in jiffy units */
- unsigned long next_stats;
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;

#endif /* CONFIG_NO_HZ_COMMON */
@@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_NOHZ_STATS 0x10
+#define LBF_NOHZ_AGAIN 0x20

struct lb_env {
struct sched_domain *sd;
@@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load.weight)
@@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}

+#ifdef CONFIG_FAIR_GROUP_SCHED
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
+ bool done = true;

rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
+ else
+ done = false;
}

#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
+ if (done)
+ rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
+ if (cfs_rq_is_decayed(cfs_rq))
+ rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
return group_other;
}

-static void update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;

+ if (!rq->has_blocked_load)
+ return false;
+
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return;
+ return false;

if (!time_after(jiffies, rq->last_blocked_load_update_tick))
- return;
+ return true;

update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+#else
+ return false;
#endif
}

@@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);

- if (env->flags & LBF_NOHZ_STATS)
- update_nohz_stats(rq);
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+ env->flags |= LBF_NOHZ_AGAIN;

/* Bias balancing toward cpus of our domain */
if (local_group)
@@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+ int has_blocked = READ_ONCE(nohz.has_blocked);
bool overload = false;

if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;

#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->idle == CPU_NEWLY_IDLE && has_blocked)
env->flags |= LBF_NOHZ_STATS;
-
- if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
- nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
- }
#endif

load_idx = get_sd_load_idx(env->sd, env->idle);
@@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sg = sg->next;
} while (sg != env->sd->groups);

+#ifdef CONFIG_NO_HZ_COMMON
+ if ((env->flags & LBF_NOHZ_AGAIN) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+ WRITE_ONCE(nohz.next_blocked,
+ jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ }
+#endif
+
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);

@@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
unsigned int flags = 0;
+ unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+ unsigned long next = READ_ONCE(nohz.next_blocked);

if (unlikely(rq->idle_balance))
return;
@@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;

- if (time_after(now, nohz.next_stats))
+ if (time_after(now, next) && has_blocked)
flags = NOHZ_STATS_KICK;

if (time_before(now, nohz.next_balance))
@@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;

+ rq->has_blocked_load = 1;
+
if (rq->nohz_tick_stopped)
- return;
+ goto out;

/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(rq))
return;

rq->nohz_tick_stopped = 1;
@@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(&nohz.nr_cpus);

set_cpu_sd_state_idle(cpu);
+
+out:
+ /*
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
+ */
+ WRITE_ONCE(nohz.has_blocked, 1);
}
#else
static inline void nohz_balancer_kick(struct rq *rq) { }
@@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

+ /*
+ * We assume there will be no idle load after this update and clear
+ * the stats state. If a cpu enters idle in the mean time, it will
+ * set the stats state and trig another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the stats state, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ */
+ WRITE_ONCE(nohz.has_blocked, 0);
+
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
@@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched())
- break;
+ if (need_resched()) {
+ has_blocked_load = true;
+ goto abort;
+ }

rq = cpu_rq(balance_cpu);

+ update_blocked_averages(rq->cpu);
+ has_blocked_load |= rq->has_blocked_load;
+
/*
* If time for next balance is due,
* do the balance.
@@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
cpu_load_update_idle(rq);
rq_unlock_irq(rq, &rf);

- update_blocked_averages(rq->cpu);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
}
@@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);

- nohz.next_stats = next_stats;
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);

/*
* next_balance will be updated only when there is a need.
@@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)

#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e200045..ad9b929 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -723,6 +723,7 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
+ unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
--
2.7.4


2018-02-06 08:33:52

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH 2/3] sched: reduce the periodic update duration

Instead of using the cfs_rq_is_decayed() which monitors all *_avg
and *_sum, we create a cfs_rq_has_blocked() which only takes care of
util_avg and load_avg. We are only interested by these 2 values which are
decaying faster than the *_sum so we can stop the periodic update earlier.

Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 279f4b2..6998528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7337,6 +7337,19 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}

+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load.weight)
@@ -7354,8 +7367,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -7391,7 +7402,9 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
- else
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
done = false;
}

@@ -7461,7 +7474,7 @@ static inline void update_blocked_averages(int cpu)
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
- if (cfs_rq_is_decayed(cfs_rq))
+ if (!cfs_rq_has_blocked(cfs_rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
--
2.7.4


2018-02-06 08:35:08

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH 3/3] sched: update blocked load when newly idle

When NEWLY_IDLE load balance is not triggered, we might need to update the
blocked load anyway. We can kick an ilb so an idle CPU will take care of
updating blocked load or we can try to update them locally before entering
idle. In the latter case, we reuse part of the nohz_idle_balance.

Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 84 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6998528..256defe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
*next_balance = next;
}

+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
+static void kick_ilb(unsigned int flags);
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)

if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
+ unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
+ unsigned long next = READ_ONCE(nohz.next_blocked);
+
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();

+ /*
+ * Update blocked idle load if it has not been done for a
+ * while. Try to do it locally before entering idle but kick a
+ * ilb if it takes too much time and/or might delay next local
+ * wake up
+ */
+ if (has_blocked && time_after_eq(jiffies, next) &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
+ kick_ilb(NOHZ_STATS_KICK);
+
goto out;
}

@@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * For newly idle mode, we abort the loop if it takes too much time and return
+ * false to notify that the loop has not be completed and a ilb should be kick.
*/
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
- unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
+ bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
- unsigned int flags;
int balance_cpu;
+ int ret = false;
struct rq *rq;
-
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ u64 curr_cost = 0;

SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

@@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
WRITE_ONCE(nohz.has_blocked, 0);

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ u64 t0, domain_cost;
+
+ t0 = sched_clock_cpu(this_cpu);
+
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;

@@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
goto abort;
}

+ /*
+ * If the update is done while CPU becomes idle, we abort
+ * the update when its cost is higher than the average idle
+ * time in orde to not delay a possible wake up.
+ */
+ if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
+ has_blocked_load = true;
+ goto abort;
+ }
+
rq = cpu_rq(balance_cpu);

update_blocked_averages(rq->cpu);
@@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;

- rq_lock_irq(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);

if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
@@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
next_balance = rq->next_balance;
update_next_balance = 1;
}
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ curr_cost += domain_cost;
+
+ }
+
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
}

- update_blocked_averages(this_cpu);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(this_rq, CPU_IDLE);

WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));

+ /* The full idle balance loop has been done */
+ ret = true;
+
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
@@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (likely(update_next_balance))
nohz.next_balance = next_balance;

+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
return true;
}
#else
--
2.7.4


2018-02-06 08:58:40

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/3] sched: Stop nohz stats when decayed

Hi Peter,

On 6 February 2018 at 09:32, Vincent Guittot <[email protected]> wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---

This patchset applies on your testing branch after removing the 2 last commits:
56eb4679 ("sched: Clean up nohz enter/exit")

> kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++-----------
> kernel/sched/sched.h | 1 +
> 2 files changed, 75 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..279f4b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
> static struct {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> + int has_blocked; /* Idle CPUS has blocked load */
> unsigned long next_balance; /* in jiffy units */
> - unsigned long next_stats;
> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
> } nohz ____cacheline_aligned;
>
> #endif /* CONFIG_NO_HZ_COMMON */
> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
> #define LBF_DST_PINNED 0x04
> #define LBF_SOME_PINNED 0x08
> #define LBF_NOHZ_STATS 0x10
> +#define LBF_NOHZ_AGAIN 0x20
>
> struct lb_env {
> struct sched_domain *sd;
> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
> rq_unlock(env->dst_rq, &rf);
> }
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -
> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> {
> if (cfs_rq->load.weight)
> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +
> static void update_blocked_averages(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> struct cfs_rq *cfs_rq, *pos;
> struct rq_flags rf;
> + bool done = true;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> + else
> + done = false;
> }
>
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (done)
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (cfs_rq_is_decayed(cfs_rq))
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
> return group_other;
> }
>
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
> {
> #ifdef CONFIG_NO_HZ_COMMON
> unsigned int cpu = rq->cpu;
>
> + if (!rq->has_blocked_load)
> + return false;
> +
> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> - return;
> + return false;
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
> - return;
> + return true;
>
> update_blocked_averages(cpu);
> +
> + return rq->has_blocked_load;
> +#else
> + return false;
> #endif
> }
>
> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
>
> - if (env->flags & LBF_NOHZ_STATS)
> - update_nohz_stats(rq);
> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> + env->flags |= LBF_NOHZ_AGAIN;
>
> /* Bias balancing toward cpus of our domain */
> if (local_group)
> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> struct sg_lb_stats *local = &sds->local_stat;
> struct sg_lb_stats tmp_sgs;
> int load_idx, prefer_sibling = 0;
> + int has_blocked = READ_ONCE(nohz.has_blocked);
> bool overload = false;
>
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
>
> #ifdef CONFIG_NO_HZ_COMMON
> - if (env->idle == CPU_NEWLY_IDLE) {
> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
> env->flags |= LBF_NOHZ_STATS;
> -
> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> - }
> #endif
>
> load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> sg = sg->next;
> } while (sg != env->sd->groups);
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if ((env->flags & LBF_NOHZ_AGAIN) &&
> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
> +
> + WRITE_ONCE(nohz.next_blocked,
> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> + }
> +#endif
> +
> if (env->sd->flags & SD_NUMA)
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
> struct sched_domain *sd;
> int nr_busy, i, cpu = rq->cpu;
> unsigned int flags = 0;
> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> + unsigned long next = READ_ONCE(nohz.next_blocked);
>
> if (unlikely(rq->idle_balance))
> return;
> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return;
>
> - if (time_after(now, nohz.next_stats))
> + if (time_after(now, next) && has_blocked)
> flags = NOHZ_STATS_KICK;
>
> if (time_before(now, nohz.next_balance))
> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
> return;
>
> + rq->has_blocked_load = 1;
> +
> if (rq->nohz_tick_stopped)
> - return;
> + goto out;
>
> /*
> * If we're a completely isolated CPU, we don't play.
> */
> - if (on_null_domain(cpu_rq(cpu)))
> + if (on_null_domain(rq))
> return;
>
> rq->nohz_tick_stopped = 1;
> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
> atomic_inc(&nohz.nr_cpus);
>
> set_cpu_sd_state_idle(cpu);
> +
> +out:
> + /*
> + * Each time a cpu enter idle, we assume that it has blocked load and
> + * enable the periodic update of the load of idle cpus
> + */
> + WRITE_ONCE(nohz.has_blocked, 1);
> }
> #else
> static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the stats state. If a cpu enters idle in the mean time, it will
> + * set the stats state and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the stats state, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + WRITE_ONCE(nohz.has_blocked, 0);
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> - break;
> + if (need_resched()) {
> + has_blocked_load = true;
> + goto abort;
> + }
>
> rq = cpu_rq(balance_cpu);
>
> + update_blocked_averages(rq->cpu);
> + has_blocked_load |= rq->has_blocked_load;
> +
> /*
> * If time for next balance is due,
> * do the balance.
> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - update_blocked_averages(rq->cpu);
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> }
> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - nohz.next_stats = next_stats;
> + WRITE_ONCE(nohz.next_blocked,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +abort:
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + WRITE_ONCE(nohz.has_blocked, 1);
>
> /*
> * next_balance will be updated only when there is a need.
> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>
> #ifdef CONFIG_NO_HZ_COMMON
> nohz.next_balance = jiffies;
> + nohz.next_blocked = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> #endif
> #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e200045..ad9b929 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -723,6 +723,7 @@ struct rq {
> #ifdef CONFIG_SMP
> unsigned long last_load_update_tick;
> unsigned long last_blocked_load_update_tick;
> + unsigned int has_blocked_load;
> #endif /* CONFIG_SMP */
> unsigned int nohz_tick_stopped;
> atomic_t nohz_flags;
> --
> 2.7.4
>

2018-02-06 09:24:15

by Vincent Guittot

[permalink] [raw]
Subject: Re: [RFC PATCH 2/5] sched: Add NOHZ_STATS_KICK

On 5 February 2018 at 23:18, Valentin Schneider
<[email protected]> wrote:
> On 01/30/2018 11:41 AM, Valentin Schneider wrote:
>> [...]
>>> I have studied a way to keep track of how many cpus still have blocked
>>> load to try to minimize the number of useless ilb kick but this add
>>> more atomic operations which can impact the system throughput with
>>> heavy load and lot of very small wake up. that's why i have propose
>>> this solution which is more simple. But it's probably just a matter of
>>> where we want to "waste" time. Either we accept to spent a bit more
>>> time to check the state of idle CPUs or we accept to kick ilb from
>>> time to time for no good reason.
>>>
>>
>> Agreed. I have the feeling that spending more time doing atomic ops could be worth it - I'll try to test this out and see if it's actually relevant.
>>
>
> I gave this a spin, still using Vincent's patches with the included patch on top. Nothing too clever, just seeing how replacing nohz.stats_state with a cpumask would go.
>
> I've replaced nohz.stats_state by nohz.stale_cpus_mask. I kept changes minimal - there are some places where I think nohz.idle_cpus_mask could be substituted by nohz.stale_cpus_mask. Speaking about that, I was about to write a comment regarding the fact that nohz.stale_cpus_mask should be a subset of nohz.idle_cpus_mask, but I realized it's actually not true:
>
> In the current implementation (cpumask or stats_state flag), an idle CPU is defined as having blocked load as soon as it goes through nohz_balance_enter_idle(), and that flag is cleared when we go through _nohz_idle_balance() (and newly idle load balance in my cpumask implementation).
> However we can imagine a scenario where a CPU goes idle, is flagged as having blocked load, then it wakes up and goes through its periodic balance code and updates that load. Yet, the flag (or cpumask) won't have been updated.
> So I think there could be a discussion on whether the flag should be cleared on nohz_balance_exit_idle() if we assume periodic balance now takes care of this. It could cause issues if we have a weird scenario where a CPU keeps going online/idle but never stays online long enough for a tick though.
> Alternatively we could clear that flag when going through the first periodic balance after idling, but then that's a bit weird because we're using a nohz structure in a non-nohz context.
>
>
> Anyway, I tried to get some profiling done with the cpumask but there's something wrong with my setup, I would only get nonsense numbers (for both baseline & my patch), so I added start/end trace_printks to _nohz_idle_balance(). It's ugly, inaccurate and unorthodox but it still gives a rough idea of how the cpumask impacts stuff.
> I ran 20 iterations of my "nohz update" test case (a task accumulates load, goes to sleep, and another always-running task keeps kicking an ILB to decay that blocked load) and the time saved by skipping CPUs is in the ballpark of 20%. Notebook is at [1].

Even if saving time in the ILB is interesting, we have to check the
impact of bitmask operation on the wake up latency of the CPUs when
the system is heavily used and generate a lot of sleep/wakeup on CPUs


>
> I'll try to get a proper function profiling working for when Vincent posts his "v2".
>
> [1]: https://gist.github.com/valschneider/6f203143bee1e149f24c44e9582a9eff
>
> ---

2018-02-06 14:17:23

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 1/3] sched: Stop nohz stats when decayed

Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> Stopped the periodic update of blocked load when all idle CPUs have fully
> decayed. We introduce a new nohz.has_blocked that reflect if some idle
> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
> is set everytime that a Idle CPU can have blocked load and it is then clear
> when no more blocked load has been detected during an update. We don't need
> atomic operation but only to make cure of the right ordering when updating
> nohz.idle_cpus_mask and nohz.has_blocked.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++-----------
> kernel/sched/sched.h | 1 +
> 2 files changed, 75 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7af1fa9..279f4b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
> static struct {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> + int has_blocked; /* Idle CPUS has blocked load */
> unsigned long next_balance; /* in jiffy units */
> - unsigned long next_stats;
> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
> } nohz ____cacheline_aligned;
>
> #endif /* CONFIG_NO_HZ_COMMON */
> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
> #define LBF_DST_PINNED 0x04
> #define LBF_SOME_PINNED 0x08
> #define LBF_NOHZ_STATS 0x10
> +#define LBF_NOHZ_AGAIN 0x20
>
> struct lb_env {
> struct sched_domain *sd;
> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
> rq_unlock(env->dst_rq, &rf);
> }
>
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> -
> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> {
> if (cfs_rq->load.weight)
> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> return true;
> }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +
> static void update_blocked_averages(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> struct cfs_rq *cfs_rq, *pos;
> struct rq_flags rf;
> + bool done = true;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
> */
> if (cfs_rq_is_decayed(cfs_rq))
> list_del_leaf_cfs_rq(cfs_rq);
> + else
> + done = false;
> }
>
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (done)
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> #ifdef CONFIG_NO_HZ_COMMON
> rq->last_blocked_load_update_tick = jiffies;
> + if (cfs_rq_is_decayed(cfs_rq))
> + rq->has_blocked_load = 0;
> #endif
> rq_unlock_irqrestore(rq, &rf);
> }
> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
> return group_other;
> }
>
> -static void update_nohz_stats(struct rq *rq)
> +static bool update_nohz_stats(struct rq *rq)
> {
> #ifdef CONFIG_NO_HZ_COMMON
> unsigned int cpu = rq->cpu;
>
> + if (!rq->has_blocked_load)
> + return false;
> +
> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> - return;
> + return false;
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick))

I forgot to ask this on the initial thread - what's the point of this condition ? At first glance I would have said that this would make more sense:

if (!time_after(jiffies, rq->last_blocked_load_update_tick + msecs_to_jiffies(LOAD_AVG_PERIOD))
return false;

But maybe it's simply there to skip an update if it has already been done in the same jiffy interval ?

> - return;
> + return true;
>
> update_blocked_averages(cpu);
> +
> + return rq->has_blocked_load;
> +#else
> + return false;
> #endif
> }
>
> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
>
> - if (env->flags & LBF_NOHZ_STATS)
> - update_nohz_stats(rq);
> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
> + env->flags |= LBF_NOHZ_AGAIN;
>
> /* Bias balancing toward cpus of our domain */
> if (local_group)
> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> struct sg_lb_stats *local = &sds->local_stat;
> struct sg_lb_stats tmp_sgs;
> int load_idx, prefer_sibling = 0;
> + int has_blocked = READ_ONCE(nohz.has_blocked);
> bool overload = false;
>
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
>
> #ifdef CONFIG_NO_HZ_COMMON
> - if (env->idle == CPU_NEWLY_IDLE) {
> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
> env->flags |= LBF_NOHZ_STATS;
> -
> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> - }
> #endif
>
> load_idx = get_sd_load_idx(env->sd, env->idle);
> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> sg = sg->next;
> } while (sg != env->sd->groups);
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if ((env->flags & LBF_NOHZ_AGAIN) &&
> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
> +
> + WRITE_ONCE(nohz.next_blocked,
> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
> + }
> +#endif
> +
> if (env->sd->flags & SD_NUMA)
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>
> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
> struct sched_domain *sd;
> int nr_busy, i, cpu = rq->cpu;
> unsigned int flags = 0;
> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> + unsigned long next = READ_ONCE(nohz.next_blocked);

What about something slightly more explicit, e.g. next_stats/next_blocked ? There's also nohz.next_balance referenced here so IMO it's best to keep things clear.

>
> if (unlikely(rq->idle_balance))
> return;
> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return;
>
> - if (time_after(now, nohz.next_stats))
> + if (time_after(now, next) && has_blocked)
> flags = NOHZ_STATS_KICK;
>
> if (time_before(now, nohz.next_balance))
> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
> return;
>
> + rq->has_blocked_load = 1;
> +
> if (rq->nohz_tick_stopped)
> - return;
> + goto out;
>
> /*
> * If we're a completely isolated CPU, we don't play.
> */
> - if (on_null_domain(cpu_rq(cpu)))
> + if (on_null_domain(rq))
> return;
>
> rq->nohz_tick_stopped = 1;
> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
> atomic_inc(&nohz.nr_cpus);
>
> set_cpu_sd_state_idle(cpu);
> +
> +out:
> + /*
> + * Each time a cpu enter idle, we assume that it has blocked load and
> + * enable the periodic update of the load of idle cpus
> + */
> + WRITE_ONCE(nohz.has_blocked, 1);
> }
> #else
> static inline void nohz_balancer_kick(struct rq *rq) { }
> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> + /*
> + * We assume there will be no idle load after this update and clear
> + * the stats state. If a cpu enters idle in the mean time, it will

s/stats state/has_blocked flag/ (repeated a few times in the comment block)

> + * set the stats state and trig another update of idle load.
> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
> + * setting the stats state, we are sure to not clear the state and not
> + * check the load of an idle cpu.
> + */
> + WRITE_ONCE(nohz.has_blocked, 0);
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * work being done for other cpus. Next load
> * balancing owner will pick it up.
> */
> - if (need_resched())
> - break;
> + if (need_resched()) {
> + has_blocked_load = true;
> + goto abort;
> + }
>
> rq = cpu_rq(balance_cpu);
>
> + update_blocked_averages(rq->cpu);
> + has_blocked_load |= rq->has_blocked_load;
> +
> /*
> * If time for next balance is due,
> * do the balance.
> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - update_blocked_averages(rq->cpu);
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> }
> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> - nohz.next_stats = next_stats;
> + WRITE_ONCE(nohz.next_blocked,
> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> +
> +abort:
> + /* There is still blocked load, enable periodic update */
> + if (has_blocked_load)
> + WRITE_ONCE(nohz.has_blocked, 1);
>
> /*
> * next_balance will be updated only when there is a need.
> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>
> #ifdef CONFIG_NO_HZ_COMMON
> nohz.next_balance = jiffies;
> + nohz.next_blocked = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> #endif
> #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e200045..ad9b929 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -723,6 +723,7 @@ struct rq {
> #ifdef CONFIG_SMP
> unsigned long last_load_update_tick;
> unsigned long last_blocked_load_update_tick;
> + unsigned int has_blocked_load;
> #endif /* CONFIG_SMP */
> unsigned int nohz_tick_stopped;
> atomic_t nohz_flags;
>

2018-02-06 14:32:38

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/3] sched: Stop nohz stats when decayed

On 6 February 2018 at 15:16, Valentin Schneider
<[email protected]> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> Stopped the periodic update of blocked load when all idle CPUs have fully
>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> is set everytime that a Idle CPU can have blocked load and it is then clear
>> when no more blocked load has been detected during an update. We don't need
>> atomic operation but only to make cure of the right ordering when updating
>> nohz.idle_cpus_mask and nohz.has_blocked.
>>
>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Vincent Guittot <[email protected]>
>> ---
>> kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++-----------
>> kernel/sched/sched.h | 1 +
>> 2 files changed, 75 insertions(+), 20 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7af1fa9..279f4b2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>> static struct {
>> cpumask_var_t idle_cpus_mask;
>> atomic_t nr_cpus;
>> + int has_blocked; /* Idle CPUS has blocked load */
>> unsigned long next_balance; /* in jiffy units */
>> - unsigned long next_stats;
>> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
>> } nohz ____cacheline_aligned;
>>
>> #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>> #define LBF_DST_PINNED 0x04
>> #define LBF_SOME_PINNED 0x08
>> #define LBF_NOHZ_STATS 0x10
>> +#define LBF_NOHZ_AGAIN 0x20
>>
>> struct lb_env {
>> struct sched_domain *sd;
>> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>> rq_unlock(env->dst_rq, &rf);
>> }
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> -
>> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> {
>> if (cfs_rq->load.weight)
>> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> return true;
>> }
>>
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +
>> static void update_blocked_averages(int cpu)
>> {
>> struct rq *rq = cpu_rq(cpu);
>> struct cfs_rq *cfs_rq, *pos;
>> struct rq_flags rf;
>> + bool done = true;
>>
>> rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> + else
>> + done = false;
>> }
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (done)
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (cfs_rq_is_decayed(cfs_rq))
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>> return group_other;
>> }
>>
>> -static void update_nohz_stats(struct rq *rq)
>> +static bool update_nohz_stats(struct rq *rq)
>> {
>> #ifdef CONFIG_NO_HZ_COMMON
>> unsigned int cpu = rq->cpu;
>>
>> + if (!rq->has_blocked_load)
>> + return false;
>> +
>> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> - return;
>> + return false;
>>
>> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>
> I forgot to ask this on the initial thread - what's the point of this condition ? At first glance I would have said that this would make more sense:
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick + msecs_to_jiffies(LOAD_AVG_PERIOD))
> return false;
>
> But maybe it's simply there to skip an update if it has already been done in the same jiffy interval ?

That's exactly the purpose.

>
>> - return;
>> + return true;
>>
>> update_blocked_averages(cpu);
>> +
>> + return rq->has_blocked_load;
>> +#else
>> + return false;
>> #endif
>> }
>>
>> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>> struct rq *rq = cpu_rq(i);
>>
>> - if (env->flags & LBF_NOHZ_STATS)
>> - update_nohz_stats(rq);
>> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
>> + env->flags |= LBF_NOHZ_AGAIN;
>>
>> /* Bias balancing toward cpus of our domain */
>> if (local_group)
>> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> struct sg_lb_stats *local = &sds->local_stat;
>> struct sg_lb_stats tmp_sgs;
>> int load_idx, prefer_sibling = 0;
>> + int has_blocked = READ_ONCE(nohz.has_blocked);
>> bool overload = false;
>>
>> if (child && child->flags & SD_PREFER_SIBLING)
>> prefer_sibling = 1;
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> - if (env->idle == CPU_NEWLY_IDLE) {
>> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>> env->flags |= LBF_NOHZ_STATS;
>> -
>> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
>> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> - }
>> #endif
>>
>> load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> sg = sg->next;
>> } while (sg != env->sd->groups);
>>
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + if ((env->flags & LBF_NOHZ_AGAIN) &&
>> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
>> +
>> + WRITE_ONCE(nohz.next_blocked,
>> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> + }
>> +#endif
>> +
>> if (env->sd->flags & SD_NUMA)
>> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>>
>> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
>> struct sched_domain *sd;
>> int nr_busy, i, cpu = rq->cpu;
>> unsigned int flags = 0;
>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>
> What about something slightly more explicit, e.g. next_stats/next_blocked ? There's also nohz.next_balance referenced here so IMO it's best to keep things clear.

ok

>
>>
>> if (unlikely(rq->idle_balance))
>> return;
>> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
>> if (likely(!atomic_read(&nohz.nr_cpus)))
>> return;
>>
>> - if (time_after(now, nohz.next_stats))
>> + if (time_after(now, next) && has_blocked)
>> flags = NOHZ_STATS_KICK;
>>
>> if (time_before(now, nohz.next_balance))
>> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
>> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>> return;
>>
>> + rq->has_blocked_load = 1;
>> +
>> if (rq->nohz_tick_stopped)
>> - return;
>> + goto out;
>>
>> /*
>> * If we're a completely isolated CPU, we don't play.
>> */
>> - if (on_null_domain(cpu_rq(cpu)))
>> + if (on_null_domain(rq))
>> return;
>>
>> rq->nohz_tick_stopped = 1;
>> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
>> atomic_inc(&nohz.nr_cpus);
>>
>> set_cpu_sd_state_idle(cpu);
>> +
>> +out:
>> + /*
>> + * Each time a cpu enter idle, we assume that it has blocked load and
>> + * enable the periodic update of the load of idle cpus
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 1);
>> }
>> #else
>> static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> + /*
>> + * We assume there will be no idle load after this update and clear
>> + * the stats state. If a cpu enters idle in the mean time, it will
>
> s/stats state/has_blocked flag/ (repeated a few times in the comment block)

yes. I will update others as well

>
>> + * set the stats state and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> + * setting the stats state, we are sure to not clear the state and not
>> + * check the load of an idle cpu.
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 0);
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> - break;
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>>
>> rq = cpu_rq(balance_cpu);
>>
>> + update_blocked_averages(rq->cpu);
>> + has_blocked_load |= rq->has_blocked_load;
>> +
>> /*
>> * If time for next balance is due,
>> * do the balance.
>> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> cpu_load_update_idle(rq);
>> rq_unlock_irq(rq, &rf);
>>
>> - update_blocked_averages(rq->cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> }
>> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> - nohz.next_stats = next_stats;
>> + WRITE_ONCE(nohz.next_blocked,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +abort:
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + WRITE_ONCE(nohz.has_blocked, 1);
>>
>> /*
>> * next_balance will be updated only when there is a need.
>> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> nohz.next_balance = jiffies;
>> + nohz.next_blocked = jiffies;
>> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>> #endif
>> #endif /* SMP */
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index e200045..ad9b929 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -723,6 +723,7 @@ struct rq {
>> #ifdef CONFIG_SMP
>> unsigned long last_load_update_tick;
>> unsigned long last_blocked_load_update_tick;
>> + unsigned int has_blocked_load;
>> #endif /* CONFIG_SMP */
>> unsigned int nohz_tick_stopped;
>> atomic_t nohz_flags;
>>

2018-02-06 14:34:40

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: update blocked load when newly idle

Hi Vincent,

On 02/06/2018 08:32 AM, Vincent Guittot wrote:
> When NEWLY_IDLE load balance is not triggered, we might need to update the
> blocked load anyway. We can kick an ilb so an idle CPU will take care of
> updating blocked load or we can try to update them locally before entering
> idle. In the latter case, we reuse part of the nohz_idle_balance.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 84 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6998528..256defe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
> *next_balance = next;
> }
>
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
> +static void kick_ilb(unsigned int flags);
> +
> /*
> * idle_balance is called by schedule() if this_cpu is about to become
> * idle. Attempts to pull tasks from other CPUs.
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> !this_rq->rd->overload) {
> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> + unsigned long next = READ_ONCE(nohz.next_blocked);

Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.

> +
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> if (sd)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
>
> + /*
> + * Update blocked idle load if it has not been done for a
> + * while. Try to do it locally before entering idle but kick a
> + * ilb if it takes too much time and/or might delay next local
> + * wake up
> + */
> + if (has_blocked && time_after_eq(jiffies, next) &&
> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))

"this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?

> + kick_ilb(NOHZ_STATS_KICK);
> +
> goto out;
> }
>
> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>
> #ifdef CONFIG_NO_HZ_COMMON
> /*
> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + * Internal function that runs load balance for all idle cpus. The load balance
> + * can be a simple update of blocked load or a complete load balance with
> + * tasks movement depending of flags.
> + * For newly idle mode, we abort the loop if it takes too much time and return
> + * false to notify that the loop has not be completed and a ilb should be kick.
> */
> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
> {
> /* Earliest time when we have to do rebalance again */
> unsigned long now = jiffies;
> unsigned long next_balance = now + 60*HZ;
> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
> + bool has_blocked_load = false;
> int update_next_balance = 0;
> int this_cpu = this_rq->cpu;
> - unsigned int flags;
> int balance_cpu;
> + int ret = false;
> struct rq *rq;
> -
> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> - return false;
> -
> - if (idle != CPU_IDLE) {
> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> - return false;
> - }
> -
> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + u64 curr_cost = 0;
>
> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>
> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> WRITE_ONCE(nohz.has_blocked, 0);
>
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> + u64 t0, domain_cost;
> +
> + t0 = sched_clock_cpu(this_cpu);
> +
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
>
> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> goto abort;
> }
>
> + /*
> + * If the update is done while CPU becomes idle, we abort
> + * the update when its cost is higher than the average idle
> + * time in orde to not delay a possible wake up.
> + */
> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
> + has_blocked_load = true;
> + goto abort;
> + }
> +
> rq = cpu_rq(balance_cpu);
>
> update_blocked_averages(rq->cpu);
> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (time_after_eq(jiffies, rq->next_balance)) {
> struct rq_flags rf;
>
> - rq_lock_irq(rq, &rf);
> + rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> cpu_load_update_idle(rq);
> - rq_unlock_irq(rq, &rf);
> + rq_unlock_irqrestore(rq, &rf);
>
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(rq, CPU_IDLE);
> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> next_balance = rq->next_balance;
> update_next_balance = 1;
> }
> +
> + domain_cost = sched_clock_cpu(this_cpu) - t0;
> + curr_cost += domain_cost;
> +
> + }
> +
> + /* Newly idle CPU doesn't need an update */
> + if (idle != CPU_NEWLY_IDLE) {
> + update_blocked_averages(this_cpu);
> + has_blocked_load |= this_rq->has_blocked_load;
> }
>
> - update_blocked_averages(this_cpu);
> if (flags & NOHZ_BALANCE_KICK)
> rebalance_domains(this_rq, CPU_IDLE);
>
> WRITE_ONCE(nohz.next_blocked,
> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
> + /* The full idle balance loop has been done */
> + ret = true;
> +
> abort:
> /* There is still blocked load, enable periodic update */
> if (has_blocked_load)
> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> if (likely(update_next_balance))
> nohz.next_balance = next_balance;
>
> + return ret;
> +}
> +
> +/*
> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
> + */
> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> +{
> + int this_cpu = this_rq->cpu;
> + unsigned int flags;
> +
> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
> + return false;
> +
> + if (idle != CPU_IDLE) {
> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + return false;
> + }
> +
> + /*
> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
> + */
> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
> + if (!(flags & NOHZ_KICK_MASK))
> + return false;
> +
> + _nohz_idle_balance(this_rq, flags, idle);
> +
> return true;
> }
> #else
>

2018-02-06 16:20:32

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: update blocked load when newly idle

On 6 February 2018 at 15:32, Valentin Schneider
<[email protected]> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>> updating blocked load or we can try to update them locally before entering
>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>
>> Signed-off-by: Vincent Guittot <[email protected]>
>> ---
>> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>> 1 file changed, 84 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6998528..256defe 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>> *next_balance = next;
>> }
>>
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>> +static void kick_ilb(unsigned int flags);
>> +
>> /*
>> * idle_balance is called by schedule() if this_cpu is about to become
>> * idle. Attempts to pull tasks from other CPUs.
>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>
>> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> !this_rq->rd->overload) {
>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>
> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>
>> +
>> rcu_read_lock();
>> sd = rcu_dereference_check_sched_domain(this_rq->sd);
>> if (sd)
>> update_next_balance(sd, &next_balance);
>> rcu_read_unlock();
>>
>> + /*
>> + * Update blocked idle load if it has not been done for a
>> + * while. Try to do it locally before entering idle but kick a
>> + * ilb if it takes too much time and/or might delay next local
>> + * wake up
>> + */
>> + if (has_blocked && time_after_eq(jiffies, next) &&
>> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
>> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>
> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?

In fact it's already the 3rd time
Why do you want it to be stored in an "idle_too_short" variable ?

>
>> + kick_ilb(NOHZ_STATS_KICK);
>> +
>> goto out;
>> }
>>
>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> /*
>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + * Internal function that runs load balance for all idle cpus. The load balance
>> + * can be a simple update of blocked load or a complete load balance with
>> + * tasks movement depending of flags.
>> + * For newly idle mode, we abort the loop if it takes too much time and return
>> + * false to notify that the loop has not be completed and a ilb should be kick.
>> */
>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>> {
>> /* Earliest time when we have to do rebalance again */
>> unsigned long now = jiffies;
>> unsigned long next_balance = now + 60*HZ;
>> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> + bool has_blocked_load = false;
>> int update_next_balance = 0;
>> int this_cpu = this_rq->cpu;
>> - unsigned int flags;
>> int balance_cpu;
>> + int ret = false;
>> struct rq *rq;
>> -
>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> - return false;
>> -
>> - if (idle != CPU_IDLE) {
>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> - return false;
>> - }
>> -
>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + u64 curr_cost = 0;
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> WRITE_ONCE(nohz.has_blocked, 0);
>>
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> + u64 t0, domain_cost;
>> +
>> + t0 = sched_clock_cpu(this_cpu);
>> +
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>>
>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> goto abort;
>> }
>>
>> + /*
>> + * If the update is done while CPU becomes idle, we abort
>> + * the update when its cost is higher than the average idle
>> + * time in orde to not delay a possible wake up.
>> + */
>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>> +
>> rq = cpu_rq(balance_cpu);
>>
>> update_blocked_averages(rq->cpu);
>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (time_after_eq(jiffies, rq->next_balance)) {
>> struct rq_flags rf;
>>
>> - rq_lock_irq(rq, &rf);
>> + rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> cpu_load_update_idle(rq);
>> - rq_unlock_irq(rq, &rf);
>> + rq_unlock_irqrestore(rq, &rf);
>>
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> next_balance = rq->next_balance;
>> update_next_balance = 1;
>> }
>> +
>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>> + curr_cost += domain_cost;
>> +
>> + }
>> +
>> + /* Newly idle CPU doesn't need an update */
>> + if (idle != CPU_NEWLY_IDLE) {
>> + update_blocked_averages(this_cpu);
>> + has_blocked_load |= this_rq->has_blocked_load;
>> }
>>
>> - update_blocked_averages(this_cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> WRITE_ONCE(nohz.next_blocked,
>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>
>> + /* The full idle balance loop has been done */
>> + ret = true;
>> +
>> abort:
>> /* There is still blocked load, enable periodic update */
>> if (has_blocked_load)
>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (likely(update_next_balance))
>> nohz.next_balance = next_balance;
>>
>> + return ret;
>> +}
>> +
>> +/*
>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>> + */
>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> +{
>> + int this_cpu = this_rq->cpu;
>> + unsigned int flags;
>> +
>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + if (idle != CPU_IDLE) {
>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + return false;
>> + }
>> +
>> + /*
>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>> + */
>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>> + if (!(flags & NOHZ_KICK_MASK))
>> + return false;
>> +
>> + _nohz_idle_balance(this_rq, flags, idle);
>> +
>> return true;
>> }
>> #else
>>

2018-02-06 16:34:54

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: update blocked load when newly idle

On 02/06/2018 04:17 PM, Vincent Guittot wrote:
> On 6 February 2018 at 15:32, Valentin Schneider
> <[email protected]> wrote:
>> Hi Vincent,
>>
>> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>>> When NEWLY_IDLE load balance is not triggered, we might need to update the
>>> blocked load anyway. We can kick an ilb so an idle CPU will take care of
>>> updating blocked load or we can try to update them locally before entering
>>> idle. In the latter case, we reuse part of the nohz_idle_balance.
>>>
>>> Signed-off-by: Vincent Guittot <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 102 ++++++++++++++++++++++++++++++++++++++++++----------
>>> 1 file changed, 84 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 6998528..256defe 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -8829,6 +8829,9 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
>>> *next_balance = next;
>>> }
>>>
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle);
>>> +static void kick_ilb(unsigned int flags);
>>> +
>>> /*
>>> * idle_balance is called by schedule() if this_cpu is about to become
>>> * idle. Attempts to pull tasks from other CPUs.
>>> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>>>
>>> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> !this_rq->rd->overload) {
>>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>>
>> Ditto on 'next' - there's next_balance referenced in here so it'd be nice to make clear which is which.
>>
>>> +
>>> rcu_read_lock();
>>> sd = rcu_dereference_check_sched_domain(this_rq->sd);
>>> if (sd)
>>> update_next_balance(sd, &next_balance);
>>> rcu_read_unlock();
>>>
>>> + /*
>>> + * Update blocked idle load if it has not been done for a
>>> + * while. Try to do it locally before entering idle but kick a
>>> + * ilb if it takes too much time and/or might delay next local
>>> + * wake up
>>> + */
>>> + if (has_blocked && time_after_eq(jiffies, next) &&
>>> + (this_rq->avg_idle < sysctl_sched_migration_cost ||
>>> + !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
>>
>> "this_rq->avg_idle < sysctl_sched_migration_cost" is used twice now, how about storing it in an "idle_too_short" variable ?
>
> In fact it's already the 3rd time
> Why do you want it to be stored in an "idle_too_short" variable ?

I meant that locally in idle_balance() to not write the same thing twice. TBH that's me being nitpicky (and liking explicit variables).

>
>>
>>> + kick_ilb(NOHZ_STATS_KICK);
>>> +
>>> goto out;
>>> }
>>>
>>> @@ -9393,30 +9410,24 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>>>
>>> #ifdef CONFIG_NO_HZ_COMMON
>>> /*
>>> - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> - * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + * Internal function that runs load balance for all idle cpus. The load balance
>>> + * can be a simple update of blocked load or a complete load balance with
>>> + * tasks movement depending of flags.
>>> + * For newly idle mode, we abort the loop if it takes too much time and return
>>> + * false to notify that the loop has not be completed and a ilb should be kick.
>>> */
>>> -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle)
>>> {
>>> /* Earliest time when we have to do rebalance again */
>>> unsigned long now = jiffies;
>>> unsigned long next_balance = now + 60*HZ;
>>> - unsigned long next_stats = now + msecs_to_jiffies(LOAD_AVG_PERIOD);
>>> + bool has_blocked_load = false;
>>> int update_next_balance = 0;
>>> int this_cpu = this_rq->cpu;
>>> - unsigned int flags;
>>> int balance_cpu;
>>> + int ret = false;
>>> struct rq *rq;
>>> -
>>> - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> - return false;
>>> -
>>> - if (idle != CPU_IDLE) {
>>> - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> - return false;
>>> - }
>>> -
>>> - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + u64 curr_cost = 0;
>>>
>>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>>
>>> @@ -9431,6 +9442,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> WRITE_ONCE(nohz.has_blocked, 0);
>>>
>>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>>> + u64 t0, domain_cost;
>>> +
>>> + t0 = sched_clock_cpu(this_cpu);
>>> +
>>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>>> continue;
>>>
>>> @@ -9444,6 +9459,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> goto abort;
>>> }
>>>
>>> + /*
>>> + * If the update is done while CPU becomes idle, we abort
>>> + * the update when its cost is higher than the average idle
>>> + * time in orde to not delay a possible wake up.
>>> + */
>>> + if (idle == CPU_NEWLY_IDLE && this_rq->avg_idle < curr_cost) {
>>> + has_blocked_load = true;
>>> + goto abort;
>>> + }
>>> +
>>> rq = cpu_rq(balance_cpu);
>>>
>>> update_blocked_averages(rq->cpu);
>>> @@ -9456,10 +9481,10 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> if (time_after_eq(jiffies, rq->next_balance)) {
>>> struct rq_flags rf;
>>>
>>> - rq_lock_irq(rq, &rf);
>>> + rq_lock_irqsave(rq, &rf);
>>> update_rq_clock(rq);
>>> cpu_load_update_idle(rq);
>>> - rq_unlock_irq(rq, &rf);
>>> + rq_unlock_irqrestore(rq, &rf);
>>>
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(rq, CPU_IDLE);
>>> @@ -9469,15 +9494,27 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> next_balance = rq->next_balance;
>>> update_next_balance = 1;
>>> }
>>> +
>>> + domain_cost = sched_clock_cpu(this_cpu) - t0;
>>> + curr_cost += domain_cost;
>>> +
>>> + }
>>> +
>>> + /* Newly idle CPU doesn't need an update */
>>> + if (idle != CPU_NEWLY_IDLE) {
>>> + update_blocked_averages(this_cpu);
>>> + has_blocked_load |= this_rq->has_blocked_load;
>>> }
>>>
>>> - update_blocked_averages(this_cpu);
>>> if (flags & NOHZ_BALANCE_KICK)
>>> rebalance_domains(this_rq, CPU_IDLE);
>>>
>>> WRITE_ONCE(nohz.next_blocked,
>>> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>>>
>>> + /* The full idle balance loop has been done */
>>> + ret = true;
>>> +
>>> abort:
>>> /* There is still blocked load, enable periodic update */
>>> if (has_blocked_load)
>>> @@ -9491,6 +9528,35 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> if (likely(update_next_balance))
>>> nohz.next_balance = next_balance;
>>>
>>> + return ret;
>>> +}
>>> +
>>> +/*
>>> + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
>>> + * rebalancing for all the cpus for whom scheduler ticks are stopped.
>>> + */
>>> +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>> +{
>>> + int this_cpu = this_rq->cpu;
>>> + unsigned int flags;
>>> +
>>> + if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + if (idle != CPU_IDLE) {
>>> + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + return false;
>>> + }
>>> +
>>> + /*
>>> + * barrier, pairs with nohz_balance_enter_idle(), ensures ...
>>> + */
>>> + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
>>> + if (!(flags & NOHZ_KICK_MASK))
>>> + return false;
>>> +
>>> + _nohz_idle_balance(this_rq, flags, idle);
>>> +
>>> return true;
>>> }
>>> #else
>>>