Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.
Signed-off-by: Vincent Guittot <[email protected]>
---
include/linux/sched/nohz.h | 2 ++
kernel/sched/fair.c | 11 ++++++++---
kernel/sched/idle.c | 6 ++++++
3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 6d67e9a5af6b..74cdc4e87310 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -9,8 +9,10 @@
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_balance_enter_idle(int cpu);
extern int get_nohz_timer_target(void);
+extern void nohz_run_idle_balance(int cpu);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
+static inline void nohz_run_idle_balance(int cpu) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 935594cd5430..3d2ab28d5736 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10461,6 +10461,11 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return true;
}
+void nohz_run_idle_balance(int cpu)
+{
+ nohz_idle_balance(cpu_rq(cpu), CPU_IDLE);
+}
+
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
@@ -10482,10 +10487,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/*
- * Blocked load of idle CPUs need to be updated.
- * Kick an ILB to update statistics.
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
*/
- kick_ilb(NOHZ_STATS_KICK);
+ this_rq->nohz_idle_balance = NOHZ_STATS_KICK;
}
#else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 305727ea0677..52a4e9ce2f9b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
static void do_idle(void)
{
int cpu = smp_processor_id();
+
+ /*
+ * Check if we need to update some blocked load
+ */
+ nohz_run_idle_balance(cpu);
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
--
2.17.1
On 05/02/21 12:48, Vincent Guittot wrote:
> Instead of waking up a random and already idle CPU, we can take advantage
> of this_cpu being about to enter idle to run the ILB and update the
> blocked load.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> include/linux/sched/nohz.h | 2 ++
> kernel/sched/fair.c | 11 ++++++++---
> kernel/sched/idle.c | 6 ++++++
> 3 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
> index 6d67e9a5af6b..74cdc4e87310 100644
> --- a/include/linux/sched/nohz.h
> +++ b/include/linux/sched/nohz.h
> @@ -9,8 +9,10 @@
> #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
> extern void nohz_balance_enter_idle(int cpu);
> extern int get_nohz_timer_target(void);
> +extern void nohz_run_idle_balance(int cpu);
> #else
> static inline void nohz_balance_enter_idle(int cpu) { }
> +static inline void nohz_run_idle_balance(int cpu) { }
> #endif
>
> #ifdef CONFIG_NO_HZ_COMMON
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 935594cd5430..3d2ab28d5736 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10461,6 +10461,11 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> return true;
> }
>
> +void nohz_run_idle_balance(int cpu)
> +{
> + nohz_idle_balance(cpu_rq(cpu), CPU_IDLE);
> +}
> +
> static void nohz_newidle_balance(struct rq *this_rq)
> {
> int this_cpu = this_rq->cpu;
> @@ -10482,10 +10487,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
> return;
>
> /*
> - * Blocked load of idle CPUs need to be updated.
> - * Kick an ILB to update statistics.
> + * Set the need to trigger ILB in order to update blocked load
> + * before entering idle state.
> */
> - kick_ilb(NOHZ_STATS_KICK);
> + this_rq->nohz_idle_balance = NOHZ_STATS_KICK;
> }
>
> #else /* !CONFIG_NO_HZ_COMMON */
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 305727ea0677..52a4e9ce2f9b 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
> static void do_idle(void)
> {
> int cpu = smp_processor_id();
> +
> + /*
> + * Check if we need to update some blocked load
> + */
> + nohz_run_idle_balance(cpu);
> +
What do we gain from doing this here vs having a stats update in
newidle_balance()?
The current approach is to have a combined load_balance() + blocked load
update during newidle, and I get that this can take too long. But then,
we could still have what you're adding to do_idle() in the tail of
newidle_balance() itself, no? i.e.
newidle_balance()
...
for_each_domain(this_cpu, sd) {
...
pulled_task = load_balance(...);
...
}
...
if (!pulled_task && !this_rq->nr_running) {
this_rq->nohz_idle_balance = NOHZ_STATS_KICK;
_nohz_idle_balance();
}
or somesuch.
> /*
> * If the arch has a polling bit, we maintain an invariant:
> *
> --
> 2.17.1
On 05/02/2021 12:48, Vincent Guittot wrote:
> Instead of waking up a random and already idle CPU, we can take advantage
> of this_cpu being about to enter idle to run the ILB and update the
> blocked load.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> include/linux/sched/nohz.h | 2 ++
> kernel/sched/fair.c | 11 ++++++++---
> kernel/sched/idle.c | 6 ++++++
> 3 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
> index 6d67e9a5af6b..74cdc4e87310 100644
> --- a/include/linux/sched/nohz.h
> +++ b/include/linux/sched/nohz.h
> @@ -9,8 +9,10 @@
> #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
> extern void nohz_balance_enter_idle(int cpu);
> extern int get_nohz_timer_target(void);
> +extern void nohz_run_idle_balance(int cpu);
> #else
> static inline void nohz_balance_enter_idle(int cpu) { }
> +static inline void nohz_run_idle_balance(int cpu) { }
> #endif
(1) Since nohz_run_idle_balance() would be an interface one sched class
(fair) exports to another (idle) I wonder if kernel/sched/sched.h would
be the more appropriate include file to export/define it?
nohz_balance_exit_idle() is exported via kernel/sched/sched.h (used only
within the scheduler) whereas nohz_balance_enter_idle() is exported via
include/linux/sched/nohz.h (used in kernel/time/tick-sched.c).
Isn't include/linux/sched/nohz.h the interface between kernel/sched/ and
kernel/time?
There is one exception already though: calc_load_nohz_remote() defined
in kernel/sched/loadavg.c and (only) used in kernel/sched/core.c.
(2) Is there a need for an extra function nohz_run_idle_balance()?
do_idle() could call nohz_idle_balance() directly if it were
exported instead.
[...]
On Tue, 9 Feb 2021 at 14:09, Valentin Schneider
<[email protected]> wrote:
>
> On 05/02/21 12:48, Vincent Guittot wrote:
> > Instead of waking up a random and already idle CPU, we can take advantage
> > of this_cpu being about to enter idle to run the ILB and update the
> > blocked load.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > include/linux/sched/nohz.h | 2 ++
> > kernel/sched/fair.c | 11 ++++++++---
> > kernel/sched/idle.c | 6 ++++++
> > 3 files changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
> > index 6d67e9a5af6b..74cdc4e87310 100644
> > --- a/include/linux/sched/nohz.h
> > +++ b/include/linux/sched/nohz.h
> > @@ -9,8 +9,10 @@
> > #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
> > extern void nohz_balance_enter_idle(int cpu);
> > extern int get_nohz_timer_target(void);
> > +extern void nohz_run_idle_balance(int cpu);
> > #else
> > static inline void nohz_balance_enter_idle(int cpu) { }
> > +static inline void nohz_run_idle_balance(int cpu) { }
> > #endif
> >
> > #ifdef CONFIG_NO_HZ_COMMON
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 935594cd5430..3d2ab28d5736 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10461,6 +10461,11 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> > return true;
> > }
> >
> > +void nohz_run_idle_balance(int cpu)
> > +{
> > + nohz_idle_balance(cpu_rq(cpu), CPU_IDLE);
> > +}
> > +
> > static void nohz_newidle_balance(struct rq *this_rq)
> > {
> > int this_cpu = this_rq->cpu;
> > @@ -10482,10 +10487,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
> > return;
> >
> > /*
> > - * Blocked load of idle CPUs need to be updated.
> > - * Kick an ILB to update statistics.
> > + * Set the need to trigger ILB in order to update blocked load
> > + * before entering idle state.
> > */
> > - kick_ilb(NOHZ_STATS_KICK);
> > + this_rq->nohz_idle_balance = NOHZ_STATS_KICK;
> > }
> >
> > #else /* !CONFIG_NO_HZ_COMMON */
> > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> > index 305727ea0677..52a4e9ce2f9b 100644
> > --- a/kernel/sched/idle.c
> > +++ b/kernel/sched/idle.c
> > @@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
> > static void do_idle(void)
> > {
> > int cpu = smp_processor_id();
> > +
> > + /*
> > + * Check if we need to update some blocked load
> > + */
> > + nohz_run_idle_balance(cpu);
> > +
>
> What do we gain from doing this here vs having a stats update in
> newidle_balance()?
As mentioned by Joel, newidle_balance() is called in the schedule
context with preemption and IRQs off, which prevents any local activity
like irq/timer. Whereas in this new place, we have the same conditions
as during ILB, with only preemption off, and _nohz_idle_balance()
regularly checks if it has to abort because something has been
scheduled on the CPU.
>
> The current approach is to have a combined load_balance() + blocked load
> update during newidle, and I get that this can take too long. But then,
> we could still have what you're adding to do_idle() in the tail of
> newidle_balance() itself, no? i.e.
>
> newidle_balance()
> ...
> for_each_domain(this_cpu, sd) {
> ...
> pulled_task = load_balance(...);
> ...
> }
> ...
> if (!pulled_task && !this_rq->nr_running) {
> this_rq->nohz_idle_balance = NOHZ_STATS_KICK;
> _nohz_idle_balance();
> }
>
> or somesuch.
>
> > /*
> > * If the arch has a polling bit, we maintain an invariant:
> > *
> > --
> > 2.17.1
On Tue, 9 Feb 2021 at 14:47, Dietmar Eggemann <[email protected]> wrote:
>
> On 05/02/2021 12:48, Vincent Guittot wrote:
> > Instead of waking up a random and already idle CPU, we can take advantage
> > of this_cpu being about to enter idle to run the ILB and update the
> > blocked load.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > include/linux/sched/nohz.h | 2 ++
> > kernel/sched/fair.c | 11 ++++++++---
> > kernel/sched/idle.c | 6 ++++++
> > 3 files changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
> > index 6d67e9a5af6b..74cdc4e87310 100644
> > --- a/include/linux/sched/nohz.h
> > +++ b/include/linux/sched/nohz.h
> > @@ -9,8 +9,10 @@
> > #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
> > extern void nohz_balance_enter_idle(int cpu);
> > extern int get_nohz_timer_target(void);
> > +extern void nohz_run_idle_balance(int cpu);
> > #else
> > static inline void nohz_balance_enter_idle(int cpu) { }
> > +static inline void nohz_run_idle_balance(int cpu) { }
> > #endif
>
> (1) Since nohz_run_idle_balance() would be an interface one sched class
> (fair) exports to another (idle) I wonder if kernel/sched/sched.h would
> be the more appropriate include file to export/define it?
Yes, probably. I was influenced by the "nohz" filename, but
kernel/sched/sched.h is better.
>
> nohz_balance_exit_idle() is exported via kernel/sched/sched.h (used only
> within the scheduler) whereas nohz_balance_enter_idle() is exported via
> include/linux/sched/nohz.h (used in kernel/time/tick-sched.c).
>
> Isn't include/linux/sched/nohz.h the interface between kernel/sched/ and
> kernel/time?
>
> There is one exception already though: calc_load_nohz_remote() defined
> in kernel/sched/loadavg.c and (only) used in kernel/sched/core.c.
>
>
> (2) Is there a need for an extra function nohz_run_idle_balance()?
> do_idle() could call nohz_idle_balance() directly if it were
> exported instead.
I didn't want to expose the 2 parameters of nohz_idle_balance in
do_idle() and especially the enum cpu_idle_type but it seems that it
is already available so I can probably call
nohz_idle_balance(cpu_rq(cpu), CPU_IDLE); directly
>
> [...]
On 09/02/21 14:57, Vincent Guittot wrote:
> On Tue, 9 Feb 2021 at 14:09, Valentin Schneider
> <[email protected]> wrote:
>> On 05/02/21 12:48, Vincent Guittot wrote:
>> > @@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
>> > static void do_idle(void)
>> > {
>> > int cpu = smp_processor_id();
>> > +
>> > + /*
>> > + * Check if we need to update some blocked load
>> > + */
>> > + nohz_run_idle_balance(cpu);
>> > +
>>
>> What do we gain from doing this here vs having a stats update in
>> newidle_balance()?
>
> As mentioned by Joel, newidle_balance is called in the schedule
> context with preempt and irq off which prevent any local activity
> like irq/timer. Whereas in this new place, we have the same condition
> as during ILB with only preemptoff and _nohz_idle_balance() regularly
> checks if it has to abort because something has been scheduled on the
> cpu.
>
Gotcha, that's already hinted at in the cover letter. Could you point this
out in the changelog? Other than that, I don't see anything wrong with this
approach.