2021-04-28 23:29:47

by Crystal Wood

Subject: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

These patches mitigate latency caused by newidle_balance() on large
systems when PREEMPT_RT is enabled, by enabling interrupts when the lock
is dropped, and exiting early at various points if an RT task is runnable
on the current CPU.
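
Condensed, the two mitigations look like this (the helper comes from
patch 3/3 and the lock pattern from patch 2/3 below; the
newidle_balance_in_callback flag is set up by patch 1/3, which moves
newidle balancing into a balance callback on PREEMPT_RT):

    /* Patch 3/3: is a task of a higher-priority class (RT/DL) runnable? */
    static inline bool rq_has_higher_tasks(struct rq *rq)
    {
            return unlikely(rq->nr_running != rq->cfs.h_nr_running);
    }

    /* Patch 2/3: run the unlocked part of newidle balancing with IRQs on. */
    raw_spin_unlock(&this_rq->lock);
    if (newidle_balance_in_callback)
            local_irq_enable();
    /* ... potentially long-running balance work, interrupts serviceable ... */
    if (newidle_balance_in_callback)
            local_irq_disable();
    raw_spin_lock(&this_rq->lock);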

On a system with 128 CPUs, these patches dropped latency (as measured by
a 12 hour rteval run) from 1045us to 317us (when applied to
5.12.0-rc3-rt3).

I tried a couple of scheduler benchmarks (perf bench sched pipe, and
sysbench threads) to determine whether the overhead is measurable
on non-RT, but the results varied widely enough (with or without the patches)
that I couldn't draw any conclusions from them. So at least for now, I
limited the balance callback change to when PREEMPT_RT is enabled.

Link to v1 RFC patches:
https://lore.kernel.org/lkml/[email protected]/

Scott Wood (3):
sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT
sched/fair: Enable interrupts when dropping lock in newidle_balance()
sched/fair: break out of newidle balancing if an RT task appears

kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 6 ++++
2 files changed, 64 insertions(+), 8 deletions(-)

--
2.27.0


2021-04-28 23:30:27

by Crystal Wood

Subject: [PATCH v2 2/3] sched/fair: Enable interrupts when dropping lock in newidle_balance()

When combined with the next patch, which breaks out of rebalancing
when an RT task is runnable, significant latency reductions are seen
on systems with many CPUs.

Signed-off-by: Scott Wood <[email protected]>
---
kernel/sched/fair.c | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff369c38a5b5..aa8c87b6aff8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10521,6 +10521,8 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;

raw_spin_unlock(&this_rq->lock);
+ if (newidle_balance_in_callback)
+ local_irq_enable();
/*
* This CPU is going to be idle and blocked load of idle CPUs
* need to be updated. Run the ilb locally as it is a good
@@ -10529,6 +10531,8 @@ static void nohz_newidle_balance(struct rq *this_rq)
*/
if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);
+ if (newidle_balance_in_callback)
+ local_irq_disable();
raw_spin_lock(&this_rq->lock);
}

@@ -10599,6 +10603,8 @@ static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
}

raw_spin_unlock(&this_rq->lock);
+ if (newidle_balance_in_callback)
+ local_irq_enable();

update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10636,6 +10642,8 @@ static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
}
rcu_read_unlock();

+ if (newidle_balance_in_callback)
+ local_irq_disable();
raw_spin_lock(&this_rq->lock);

if (curr_cost > this_rq->max_idle_balance_cost)
--
2.27.0

2021-04-28 23:30:32

by Crystal Wood

Subject: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears

The CFS load balancer can take a little while, to the point of having
a special LBF_NEED_BREAK flag so that the task-moving code can take a
breather.

However, at that point it will jump right back into load balancing,
without checking whether the CPU has gained any runnable real time
(or deadline) tasks.

Break out of load balancing in the CPU_NEWLY_IDLE case, to allow the
scheduling of the RT task. Without this, latencies of over 1ms are
seen on large systems.

Signed-off-by: Rik van Riel <[email protected]>
Reported-by: Clark Williams <[email protected]>
Signed-off-by: Clark Williams <[email protected]>
[swood: Limit change to newidle]
Signed-off-by: Scott Wood <[email protected]>
---
v2: Only break out of newidle balancing

kernel/sched/fair.c | 24 ++++++++++++++++++++----
kernel/sched/sched.h | 6 ++++++
2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa8c87b6aff8..c3500c963af2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9502,10 +9502,21 @@ imbalanced_active_balance(struct lb_env *env)
return 0;
}

-static int need_active_balance(struct lb_env *env)
+static bool stop_balance_early(struct lb_env *env)
+{
+ return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
+}
+
+static int need_active_balance(struct lb_env *env, int *continue_balancing)
{
struct sched_domain *sd = env->sd;

+ /* Run the realtime task now; load balance later. */
+ if (stop_balance_early(env)) {
+ *continue_balancing = 0;
+ return 0;
+ }
+
if (asym_active_balance(env))
return 1;

@@ -9550,7 +9561,7 @@ static int should_we_balance(struct lb_env *env)
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE)
- return 1;
+ return !rq_has_higher_tasks(env->dst_rq);

/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
@@ -9660,6 +9671,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,

local_irq_restore(rf.flags);

+ if (stop_balance_early(&env)) {
+ *continue_balancing = 0;
+ goto out;
+ }
+
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
@@ -9743,7 +9759,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;

- if (need_active_balance(&env)) {
+ if (need_active_balance(&env, continue_balancing)) {
unsigned long flags;

raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -9787,7 +9803,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
sd->nr_balance_failed = 0;
}

- if (likely(!active_balance) || need_active_balance(&env)) {
+ if (likely(!active_balance) || need_active_balance(&env, continue_balancing)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..88be4ed58924 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1987,6 +1987,12 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)

return rq->idle_state;
}
+
+/* Is there a task of a high priority class? */
+static inline bool rq_has_higher_tasks(struct rq *rq)
+{
+ return unlikely(rq->nr_running != rq->cfs.h_nr_running);
+}
#else
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
--
2.27.0

2021-04-29 04:13:42

by kernel test robot

Subject: Re: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears

Hi Scott,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on v5.12]
[cannot apply to tip/sched/core tip/master linus/master next-20210428]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Scott-Wood/newidle_balance-PREEMPT_RT-latency-mitigations/20210429-073033
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 1fe5501ba1abf2b7e78295df73675423bd6899a0
config: x86_64-randconfig-a006-20210428 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 9131a078901b00e68248a27a4f8c4b11bb1db1ae)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# https://github.com/0day-ci/linux/commit/a88b1e73d8edba9a004ca170d136eccc8bcf7e9f
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Scott-Wood/newidle_balance-PREEMPT_RT-latency-mitigations/20210429-073033
git checkout a88b1e73d8edba9a004ca170d136eccc8bcf7e9f
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=x86_64

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> kernel/sched/fair.c:9507:40: error: implicit declaration of function 'rq_has_higher_tasks' [-Werror,-Wimplicit-function-declaration]
return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
^
kernel/sched/fair.c:9564:11: error: implicit declaration of function 'rq_has_higher_tasks' [-Werror,-Wimplicit-function-declaration]
return !rq_has_higher_tasks(env->dst_rq);
^
2 errors generated.


vim +/rq_has_higher_tasks +9507 kernel/sched/fair.c

9504
9505 static bool stop_balance_early(struct lb_env *env)
9506 {
> 9507 return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
9508 }
9509
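
A plausible fix (an assumption from reading the sched.h hunk in patch
3/3, which adds the helper next to idle_get_state() and thus inside the
#ifdef CONFIG_CPU_IDLE block, so !CONFIG_CPU_IDLE builds never see it)
would be to define the helper unconditionally in sched.h:

    /* Is there a task of a higher priority class? Defined outside the
     * CONFIG_CPU_IDLE conditional so every config gets a definition. */
    static inline bool rq_has_higher_tasks(struct rq *rq)
    {
            return unlikely(rq->nr_running != rq->cfs.h_nr_running);
    }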

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-04-29 06:41:08

by kernel test robot

Subject: Re: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears

Hi Scott,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on v5.12]
[cannot apply to tip/sched/core tip/master linus/master next-20210428]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Scott-Wood/newidle_balance-PREEMPT_RT-latency-mitigations/20210429-073033
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 1fe5501ba1abf2b7e78295df73675423bd6899a0
config: arc-allyesconfig (attached as .config)
compiler: arceb-elf-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/a88b1e73d8edba9a004ca170d136eccc8bcf7e9f
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Scott-Wood/newidle_balance-PREEMPT_RT-latency-mitigations/20210429-073033
git checkout a88b1e73d8edba9a004ca170d136eccc8bcf7e9f
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross W=1 ARCH=arc

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

kernel/sched/fair.c: In function 'stop_balance_early':
>> kernel/sched/fair.c:9507:40: error: implicit declaration of function 'rq_has_higher_tasks' [-Werror=implicit-function-declaration]
9507 | return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
| ^~~~~~~~~~~~~~~~~~~
In file included from include/linux/perf_event.h:25,
from include/linux/trace_events.h:10,
from include/trace/syscall.h:7,
from include/linux/syscalls.h:85,
from kernel/sched/sched.h:65,
from kernel/sched/fair.c:23:
At top level:
arch/arc/include/asm/perf_event.h:126:23: warning: 'arc_pmu_cache_map' defined but not used [-Wunused-const-variable=]
126 | static const unsigned arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
| ^~~~~~~~~~~~~~~~~
arch/arc/include/asm/perf_event.h:91:27: warning: 'arc_pmu_ev_hw_map' defined but not used [-Wunused-const-variable=]
91 | static const char * const arc_pmu_ev_hw_map[] = {
| ^~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors


vim +/rq_has_higher_tasks +9507 kernel/sched/fair.c

9504
9505 static bool stop_balance_early(struct lb_env *env)
9506 {
> 9507 return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
9508 }
9509

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-04-29 07:14:07

by Vincent Guittot

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

Hi Scott,

On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
>
> These patches mitigate latency caused by newidle_balance() on large
> systems when PREEMPT_RT is enabled, by enabling interrupts when the lock
> is dropped, and exiting early at various points if an RT task is runnable
> on the current CPU.
>
> On a system with 128 CPUs, these patches dropped latency (as measured by
> a 12 hour rteval run) from 1045us to 317us (when applied to
> 5.12.0-rc3-rt3).

The patch below has been queued for v5.13 and removes the update of
blocked load, which seemed to be the major reason for long preempt/irq-off
times during newly idle balance:
https://lore.kernel.org/lkml/[email protected]/

I would be curious to see how it impacts your cases

>
> I tried a couple of scheduler benchmarks (perf bench sched pipe, and
> sysbench threads) to determine whether the overhead is measurable
> on non-RT, but the results varied widely enough (with or without the patches)
> that I couldn't draw any conclusions from them. So at least for now, I
> limited the balance callback change to when PREEMPT_RT is enabled.
>
> Link to v1 RFC patches:
> https://lore.kernel.org/lkml/[email protected]/
>
> Scott Wood (3):
> sched/fair: Call newidle_balance() from balance_callback on PREEMPT_RT
> sched/fair: Enable interrupts when dropping lock in newidle_balance()
> sched/fair: break out of newidle balancing if an RT task appears
>
> kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++------
> kernel/sched/sched.h | 6 ++++
> 2 files changed, 64 insertions(+), 8 deletions(-)
>
> --
> 2.27.0
>

2021-05-01 22:06:24

by Crystal Wood

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Thu, 2021-04-29 at 09:12 +0200, Vincent Guittot wrote:
> Hi Scott,
>
> On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
> > These patches mitigate latency caused by newidle_balance() on large
> > systems when PREEMPT_RT is enabled, by enabling interrupts when the lock
> > is dropped, and exiting early at various points if an RT task is
> > runnable
> > on the current CPU.
> >
> > On a system with 128 CPUs, these patches dropped latency (as measured by
> > a 12 hour rteval run) from 1045us to 317us (when applied to
> > 5.12.0-rc3-rt3).
>
> The patch below has been queued for v5.13 and removes the update of
> blocked load, which seemed to be the major reason for long preempt/irq-off
> times during newly idle balance:
> https://lore.kernel.org/lkml/[email protected]/
>
> I would be curious to see how it impacts your cases

I still get 1000+ us latencies with those patches applied.

-Scott


2021-05-02 03:30:51

by Mike Galbraith

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Sat, 2021-05-01 at 17:03 -0500, Scott Wood wrote:
> On Thu, 2021-04-29 at 09:12 +0200, Vincent Guittot wrote:
> > Hi Scott,
> >
> > On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
> > > These patches mitigate latency caused by newidle_balance() on large
> > > systems when PREEMPT_RT is enabled, by enabling interrupts when the lock
> > > is dropped, and exiting early at various points if an RT task is
> > > runnable
> > > on the current CPU.
> > >
> > > On a system with 128 CPUs, these patches dropped latency (as measured by
> > > a 12 hour rteval run) from 1045us to 317us (when applied to
> > > 5.12.0-rc3-rt3).
> >
> > The patch below has been queued for v5.13 and removes the update of
> > blocked load, which seemed to be the major reason for long preempt/irq-off
> > times during newly idle balance:
> > https://lore.kernel.org/lkml/[email protected]/
> >
> > I would be curious to see how it impacts your cases
>
> I still get 1000+ us latencies with those patches applied.

If NEWIDLE balancing migrates one task, how does that manage to consume
a full *millisecond*, and why would that only be a problem for RT?

-Mike

(rt tasks don't play !rt balancer here, if CPU goes idle, tough titty)

2021-05-03 18:11:30

by Crystal Wood

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Sun, 2021-05-02 at 05:25 +0200, Mike Galbraith wrote:
> On Sat, 2021-05-01 at 17:03 -0500, Scott Wood wrote:
> > On Thu, 2021-04-29 at 09:12 +0200, Vincent Guittot wrote:
> > > Hi Scott,
> > >
> > > On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
> > > > These patches mitigate latency caused by newidle_balance() on large
> > > > systems when PREEMPT_RT is enabled, by enabling interrupts when the
> > > > lock
> > > > is dropped, and exiting early at various points if an RT task is
> > > > runnable
> > > > on the current CPU.
> > > >
> > > > On a system with 128 CPUs, these patches dropped latency (as
> > > > measured by
> > > > a 12 hour rteval run) from 1045us to 317us (when applied to
> > > > 5.12.0-rc3-rt3).
> > >
> > > The patch below has been queued for v5.13 and removes the update of
> > > blocked load, which seemed to be the major reason for long preempt/irq-off
> > > times during newly idle balance:
> > > https://lore.kernel.org/lkml/[email protected]/
> > >
> > > I would be curious to see how it impacts your cases
> >
> > I still get 1000+ us latencies with those patches applied.
>
> If NEWIDLE balancing migrates one task, how does that manage to consume
> a full *millisecond*, and why would that only be a problem for RT?
>
> -Mike
>
> (rt tasks don't play !rt balancer here, if CPU goes idle, tough titty)

Determining which task to pull is apparently taking that long (again, this
is on a 128-cpu system). RT is singled out because that is the config that
makes significant tradeoffs to keep latencies down (I expect this would be
far from the only possible 1ms+ latency on a non-RT kernel), and there was
concern about the overhead of a double context switch when pulling a task to
a newidle cpu.

-Scott


2021-05-03 18:54:45

by Mike Galbraith

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Mon, 2021-05-03 at 11:33 -0500, Scott Wood wrote:
> On Sun, 2021-05-02 at 05:25 +0200, Mike Galbraith wrote:
> > On Sat, 2021-05-01 at 17:03 -0500, Scott Wood wrote:
> > > On Thu, 2021-04-29 at 09:12 +0200, Vincent Guittot wrote:
> > > > Hi Scott,
> > > >
> > > > On Thu, 29 Apr 2021 at 01:28, Scott Wood <[email protected]> wrote:
> > > > > These patches mitigate latency caused by newidle_balance() on large
> > > > > systems when PREEMPT_RT is enabled, by enabling interrupts when the
> > > > > lock
> > > > > is dropped, and exiting early at various points if an RT task is
> > > > > runnable
> > > > > on the current CPU.
> > > > >
> > > > > On a system with 128 CPUs, these patches dropped latency (as
> > > > > measured by
> > > > > a 12 hour rteval run) from 1045us to 317us (when applied to
> > > > > 5.12.0-rc3-rt3).
> > > >
> > > > The patch below has been queued for v5.13 and removed the update of
> > > > blocked load what seemed to be the major reason for long preempt/irq
> > > > off during newly idle balance:
> > > > https://lore.kernel.org/lkml/[email protected]/
> > > >
> > > > I would be curious to see how it impacts your cases
> > >
> > > I still get 1000+ us latencies with those patches applied.
> >
> > If NEWIDLE balancing migrates one task, how does that manage to consume
> > a full *millisecond*, and why would that only be a problem for RT?
> >
> > -Mike
> >
> > (rt tasks don't play !rt balancer here, if CPU goes idle, tough titty)
>
> Determining which task to pull is apparently taking that long (again, this
> is on a 128-cpu system). RT is singled out because that is the config that
> makes significant tradeoffs to keep latencies down (I expect this would be
> far from the only possible 1ms+ latency on a non-RT kernel), and there was
> concern about the overhead of a double context switch when pulling a task to
> a newidle cpu.

What I think has to be going on is that you're running a synchronized RT
load, many CPUs go idle as a thundering herd, and meet at the focal point,
busiest. What I was alluding to was that preventing pile-ups at that
scale would be way better than poking holes in them for RT to try to
sneak through. If pile-up it is, while not particularly likely, the
same should happen with normal tasks, wasting cycles generating heat.

The main issue I see with these patches is that the resulting number is
still so gawd awful as to mean "nope, not rt ready", making the whole
exercise look a bit like a noop.

-Mike

2021-05-03 21:58:51

by Crystal Wood

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Mon, 2021-05-03 at 20:52 +0200, Mike Galbraith wrote:
> On Mon, 2021-05-03 at 11:33 -0500, Scott Wood wrote:
> > On Sun, 2021-05-02 at 05:25 +0200, Mike Galbraith wrote:
> > > If NEWIDLE balancing migrates one task, how does that manage to
> > > consume
> > > a full *millisecond*, and why would that only be a problem for RT?
> > >
> > > -Mike
> > >
> > > (rt tasks don't play !rt balancer here, if CPU goes idle, tough titty)
> >
> > Determining which task to pull is apparently taking that long (again,
> > this is on a 128-cpu system). RT is singled out because that is the
> > config that makes significant tradeoffs to keep latencies down (I
> > expect this would be far from the only possible 1ms+ latency on a
> > non-RT kernel), and there was concern about the overhead of a double
> > context switch when pulling a task to a newidle cpu.
>
> What I think has to be going on is that you're running a synchronized RT
> load, many CPUs go idle as a thundering herd, and meet at the focal point,
> busiest. What I was alluding to was that preventing pile-ups at that
> scale would be way better than poking holes in them for RT to try to
> sneak through. If pile-up it is, while not particularly likely, the
> same should happen with normal tasks, wasting cycles generating heat.
>
> The main issue I see with these patches is that the resulting number is
> still so gawd awful as to mean "nope, not rt ready", making the whole
> exercise look a bit like a noop.

It doesn't look like rteval asks cyclictest to synchronize, but
regardless, how is this "poking holes"? Making sure interrupts are
enabled during potentially long-running activities is pretty fundamental
to PREEMPT_RT. What specifically is your suggestion?

And yes, 317 us is still not a very good number for PREEMPT_RT, but
progress is progress. It's hard to address the moderate latency spikes
if they're obscured by large latency spikes. One also needs to have
realistic expectations when it comes to RT on large systems, particularly
when not isolating the latency-sensitive CPUs.

-Scott


2021-05-04 04:09:39

by Mike Galbraith

Subject: Re: [PATCH v2 0/3] newidle_balance() PREEMPT_RT latency mitigations

On Mon, 2021-05-03 at 16:57 -0500, Scott Wood wrote:
> On Mon, 2021-05-03 at 20:52 +0200, Mike Galbraith wrote:
> > On Mon, 2021-05-03 at 11:33 -0500, Scott Wood wrote:
> > > On Sun, 2021-05-02 at 05:25 +0200, Mike Galbraith wrote:
> > > > If NEWIDLE balancing migrates one task, how does that manage to
> > > > consume
> > > > a full *millisecond*, and why would that only be a problem for RT?
> > > >
> > > > -Mike
> > > >
> > > > (rt tasks don't play !rt balancer here, if CPU goes idle, tough titty)
> > >
> > > Determining which task to pull is apparently taking that long (again,
> > > this is on a 128-cpu system). RT is singled out because that is the
> > > config that makes significant tradeoffs to keep latencies down (I
> > > expect this would be far from the only possible 1ms+ latency on a
> > > non-RT kernel), and there was concern about the overhead of a double
> > > context switch when pulling a task to a newidle cpu.
> >
> > What I think has to be going on is that you're running a synchronized RT
> > load, many CPUs go idle as a thundering herd, and meet at the focal point,
> > busiest. What I was alluding to was that preventing pile-ups at that
> > scale would be way better than poking holes in them for RT to try to
> > sneak through. If pile-up it is, while not particularly likely, the
> > same should happen with normal tasks, wasting cycles generating heat.
> >
> > The main issue I see with these patches is that the resulting number is
> > still so gawd awful as to mean "nope, not rt ready", making the whole
> > exercise look a bit like a noop.
>
> It doesn't look like rteval asks cyclictest to synchronize, but
> regardless, how is this "poking holes"?

Pulling a single task is taking _a full millisecond_, which I see as a
mountain of cycles, directly through which you open a path for wakeups.
That "poking holes" isn't meant to be some kind of crude derogatory
remark, it's just the way I see what was done. The mountain still
stands, you didn't remove it.

> Making sure interrupts are
> enabled during potentially long-running activities is pretty fundamental
> to PREEMPT_RT. What specifically is your suggestion?

Try to include fair class in any LB improvement if at all possible,
because that's where most of the real world benefit is to be had.

> And yes, 317 us is still not a very good number for PREEMPT_RT, but
> progress is progress. It's hard to address the moderate latency spikes
> if they're obscured by large latency spikes. One also needs to have
> realistic expectations when it comes to RT on large systems, particularly
> when not isolating the latency-sensitive CPUs.

Agreed. But. Specifically because the result remains intolerable to
anything remotely sensitive, users running such on their big boxen are
not going to be doing mixed load, that flat does not work, which is why
I said the patch set looks a bit like a noop: it excludes the audience
that stands to gain.. nearly anything. Big box HPC (acronym includes
RT) gains absolutely nothing, as does big box general case with its not
particularly prevalent, but definitely existent RT tasks. Big box
users who are NOT running anything they care deeply about do receive
some love.. but don't care deeply, and certainly don't care any more
deeply than general case users do about these collision-induced
latency spikes.

-Mike

2021-05-07 15:07:56

by Peter Zijlstra

Subject: Re: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears


I'm going to pretend to have never seen the prior two patches. They do
absolutely horrible things for unspecified reasons. You've utterly
failed to explain what exactly is taking that 1ms+.

newidle_balance() already has 'stop, you're spending too much time'
controls; you've failed to explain how those are falling short and why
they cannot be improved.
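
For reference, those controls amount to cutoffs like the following (a
loose paraphrase of the v5.12-era newidle_balance(), not code quoted in
this thread):

    /* Bail out up front if the CPU is not expected to stay idle long
     * enough for a balance pass to pay off, or nothing is overloaded. */
    if (this_rq->avg_idle < sysctl_sched_migration_cost ||
        !READ_ONCE(this_rq->rd->overload))
            goto out;

    for_each_domain(this_cpu, sd) {
            /* Stop walking domains once the cost spent so far plus this
             * domain's worst-case cost exceeds the expected idle time. */
            if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
                    break;
            /* ... load_balance(..., CPU_NEWLY_IDLE, ...); update curr_cost ... */
    }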

On Wed, Apr 28, 2021 at 06:28:21PM -0500, Scott Wood wrote:
> The CFS load balancer can take a little while, to the point of having
> a special LBF_NEED_BREAK flag so that the task-moving code can take a
> breather.
>
> However, at that point it will jump right back into load balancing,
> without checking whether the CPU has gained any runnable real time
> (or deadline) tasks.
>
> Break out of load balancing in the CPU_NEWLY_IDLE case, to allow the
> scheduling of the RT task. Without this, latencies of over 1ms are
> seen on large systems.
>
> Signed-off-by: Rik van Riel <[email protected]>
> Reported-by: Clark Williams <[email protected]>
> Signed-off-by: Clark Williams <[email protected]>
> [swood: Limit change to newidle]
> Signed-off-by: Scott Wood <[email protected]>
> ---
> v2: Only break out of newidle balancing
>
> kernel/sched/fair.c | 24 ++++++++++++++++++++----
> kernel/sched/sched.h | 6 ++++++
> 2 files changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index aa8c87b6aff8..c3500c963af2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9502,10 +9502,21 @@ imbalanced_active_balance(struct lb_env *env)
> return 0;
> }
>
> -static int need_active_balance(struct lb_env *env)
> +static bool stop_balance_early(struct lb_env *env)
> +{
> + return env->idle == CPU_NEWLY_IDLE && rq_has_higher_tasks(env->dst_rq);
> +}
> +
> +static int need_active_balance(struct lb_env *env, int *continue_balancing)
> {
> struct sched_domain *sd = env->sd;
>
> + /* Run the realtime task now; load balance later. */
> + if (stop_balance_early(env)) {
> + *continue_balancing = 0;
> + return 0;
> + }

This placement doesn't make any sense. You very much want this to return
true for the sd->balance_interval = sd->min_interval block for example.

And the other callsite already has an if (idle != CPU_NEWLY_IDLE)
condition to use.

Also, I don't think we care about the higher thing here (either);
newidle is about getting *any* work here, if there's something to do, we
don't need to do more.

> +
> if (asym_active_balance(env))
> return 1;
>
> @@ -9550,7 +9561,7 @@ static int should_we_balance(struct lb_env *env)
> * to do the newly idle load balance.
> */
> if (env->idle == CPU_NEWLY_IDLE)
> - return 1;
> + return !rq_has_higher_tasks(env->dst_rq);

has_higher_task makes no sense here, newidle can stop the moment
nr_running != 0.
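
That is, roughly (a sketch of the simpler check being suggested,
assuming dst_rq is the newly idle CPU's runqueue, as it is in this
path):

    if (env->idle == CPU_NEWLY_IDLE)
            return !env->dst_rq->nr_running;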

>
> /* Try to find first idle CPU */
> for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
> @@ -9660,6 +9671,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>
> local_irq_restore(rf.flags);
>
> + if (stop_balance_early(&env)) {
> + *continue_balancing = 0;
> + goto out;
> + }

Same thing.

> +
> if (env.flags & LBF_NEED_BREAK) {
> env.flags &= ~LBF_NEED_BREAK;
> goto more_balance;

2021-05-15 11:36:52

by Mike Galbraith

Subject: Re: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears

P.S. The largest I spotted before boredom won:

kworker/3:1-22161 [003] d...2.. 11559.111261: load_balance: NEWIDLE rq:4 lock acquired
kworker/0:0-21137 [000] d...1.. 11559.111262: load_balance: NEWIDLE rq:4 locked - aborting
kworker/7:1-20870 [007] d...1.. 11559.111262: load_balance: NEWIDLE rq:4 locked - aborting
<...>-22133 [002] d...1.. 11559.111263: load_balance: NEWIDLE rq:4 locked - aborting
kworker/6:1-22213 [006] d...1.. 11559.111263: load_balance: NEWIDLE rq:4 locked - aborting

5/8 of a 128 rq box meeting like that is.. hopefully impossible :)

-Mike


2021-05-15 17:37:27

by Mike Galbraith

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] sched/fair: break out of newidle balancing if an RT task appears

On Fri, 2021-05-07 at 13:03 +0200, Peter Zijlstra wrote:
>
> newidle_balance() already has 'stop, you're spending too much time'
> controls; you've failed to explain how those are falling short and why
> they cannot be improved.

Greetings,

I bet a nickel he's meeting contention on a more painful scale than the
2212.429350 sample from my little i4790 desktop box, as master.today+rt
does.. nothing remotely RT like, just plays youtube as I instead watch
trace output.

homer:..debug/tracing # tail -20 trace
MediaPl~back #1-17837 [003] d...2.. 2212.292335: load_balance: NEWIDLE rq:2 lock acquired
Gecko_IOThread-4001 [000] d...2.. 2212.315030: load_balance: NEWIDLE rq:3 lock acquired
firefox-3980 [007] d...1.. 2212.315030: load_balance: NEWIDLE rq:3 locked - aborting
Timer-4089 [007] d...2.. 2212.317260: load_balance: NEWIDLE rq:0 lock acquired
Timer-4089 [007] d...2.. 2212.317319: load_balance: NEWIDLE rq:0 lock acquired
ksoftirqd/6-51 [006] d...2.. 2212.317358: load_balance: NEWIDLE rq:0 lock acquired
Timer-4089 [007] d...2.. 2212.317474: load_balance: NEWIDLE rq:0 lock acquired
MediaPl~back #2-17839 [002] d...1.. 2212.345438: load_balance: NEWIDLE rq:3 locked - aborting
Chrome_~dThread-16830 [006] d...2.. 2212.345438: load_balance: NEWIDLE rq:3 lock acquired
AudioIP~ent RPC-17665 [003] d...2.. 2212.404999: load_balance: NEWIDLE rq:5 lock acquired
ImageBridgeChld-16855 [001] d...2.. 2212.429350: load_balance: NEWIDLE rq:6 lock acquired
MediaPl~back #1-17837 [000] d...1.. 2212.429351: load_balance: NEWIDLE rq:6 locked - aborting
Chrome_~dThread-16830 [002] d...1.. 2212.429356: load_balance: NEWIDLE rq:6 locked - aborting
X-2157 [003] d...2.. 2212.461351: load_balance: NEWIDLE rq:6 lock acquired
<...>-4043 [006] d...2.. 2212.480451: load_balance: NEWIDLE rq:2 lock acquired
ksoftirqd/0-12 [000] d...2.. 2212.505545: load_balance: NEWIDLE rq:3 lock acquired
ksoftirqd/0-12 [000] d...2.. 2212.505550: load_balance: NEWIDLE rq:3 lock acquired
threaded-ml-4399 [001] d...2.. 2212.517943: load_balance: NEWIDLE rq:7 lock acquired
pulseaudio-1917 [002] d...2.. 2212.575245: load_balance: NEWIDLE rq:6 lock acquired
IPDL Background-4028 [004] d...2.. 2212.581085: load_balance: NEWIDLE rq:5 lock acquired
homer:..debug/tracing #

homer:..debug/tracing # cat trace | grep lock | wc -l
218165
homer:..debug/tracing # cat trace | grep abort | wc -l
40081

I still carry the below in my local RT trees, to which I added those
trace_printk()s. It was born some years ago on a 64 core box.

---
kernel/sched/fair.c | 11 +++++++++++
kernel/sched/features.h | 4 ++++
kernel/sched/sched.h | 10 ++++++++++
3 files changed, 25 insertions(+)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7200,6 +7200,12 @@ done: __maybe_unused;
if (!rf)
return NULL;

+#ifdef CONFIG_PREEMPT_RT
+ /* RT tasks have better things to do than load balancing. */
+ if (prev && prev->sched_class != &fair_sched_class)
+ return NULL;
+#endif
+
new_tasks = newidle_balance(rq, rf);

/*
@@ -9669,7 +9675,12 @@ static int load_balance(int this_cpu, st
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
+#ifdef CONFIG_PREEMPT_RT
+ if (!rq_lock_trylock_irqsave(busiest, &rf))
+ goto out;
+#else
rq_lock_irqsave(busiest, &rf);
+#endif
update_rq_clock(busiest);

/*
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -86,7 +86,11 @@ SCHED_FEAT(RT_PUSH_IPI, true)
#endif

SCHED_FEAT(RT_RUNTIME_SHARE, false)
+#ifndef CONFIG_PREEMPT_RT
SCHED_FEAT(LB_MIN, false)
+#else
+SCHED_FEAT(LB_MIN, true)
+#endif
SCHED_FEAT(ATTACH_AGE_LOAD, true)

SCHED_FEAT(WA_IDLE, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1322,6 +1322,16 @@ rq_lock_irqsave(struct rq *rq, struct rq
rq_pin_lock(rq, rf);
}

+static inline int
+rq_lock_trylock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ if (!raw_spin_trylock_irqsave(&rq->lock, rf->flags))
+ return 0;
+ rq_pin_lock(rq, rf);
+ return 1;
+}
+
static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)