2021-07-01 05:55:10

by Nicholas Piggin

Subject: [PATCH] nohz: nohz idle balancing per node

Currently a single nohz idle CPU is designated to perform balancing on
behalf of all other nohz idle CPUs in the system. Implement a per node
nohz balancer to minimize cross-node memory accesses and runqueue lock
acquisitions.

On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
with 32 clients/jobs (which is about where throughput maxes out due to
IO and contention in postgres).

Performance consistency and drop-off after peak is much improved:

https://user-images.githubusercontent.com/19833002/124070985-0e923a80-da82-11eb-81c8-6788be27dfdb.png

Signed-off-by: Nicholas Piggin <[email protected]>
---
kernel/sched/core.c | 3 ++
kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++--------------
2 files changed, 65 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cf16f8fda9a6..52681f11e98c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9067,6 +9067,9 @@ void __init sched_init(void)
*/
init_idle(current, smp_processor_id());

+ /* Must run before init_sched_fair_class(), which tests housekeeping */
+ housekeeping_init();
+
calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fb469b26b00a..832f8673bba1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5722,13 +5722,27 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);

#ifdef CONFIG_NO_HZ_COMMON

-static struct {
+struct nohz {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
-} nohz ____cacheline_aligned;
+} ____cacheline_aligned;
+
+static struct nohz **nohz_nodes __ro_after_init;
+
+static struct nohz *get_nohz(void)
+{
+#ifdef CONFIG_CPU_ISOLATION
+ /*
+ * May not have a house keeping CPU per node, do global idle balancing.
+ */
+ if (static_branch_unlikely(&housekeeping_overridden))
+ return nohz_nodes[0];
+#endif
+ return nohz_nodes[numa_node_id()];
+}

#endif /* CONFIG_NO_HZ_COMMON */

@@ -10217,7 +10231,7 @@ static inline int find_new_ilb(void)
{
int ilb;

- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ for_each_cpu_and(ilb, get_nohz()->idle_cpus_mask,
housekeeping_cpumask(HK_FLAG_MISC)) {

if (ilb == smp_processor_id())
@@ -10243,7 +10257,7 @@ static void kick_ilb(unsigned int flags)
* not if we only update stats.
*/
if (flags & NOHZ_BALANCE_KICK)
- nohz.next_balance = jiffies+1;
+ get_nohz()->next_balance = jiffies+1;

ilb_cpu = find_new_ilb();

@@ -10291,14 +10305,14 @@ static void nohz_balancer_kick(struct rq *rq)
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
- if (likely(!atomic_read(&nohz.nr_cpus)))
+ if (likely(!atomic_read(&get_nohz()->nr_cpus)))
return;

- if (READ_ONCE(nohz.has_blocked) &&
- time_after(now, READ_ONCE(nohz.next_blocked)))
+ if (READ_ONCE(get_nohz()->has_blocked) &&
+ time_after(now, READ_ONCE(get_nohz()->next_blocked)))
flags = NOHZ_STATS_KICK;

- if (time_before(now, nohz.next_balance))
+ if (time_before(now, get_nohz()->next_balance))
goto out;

if (rq->nr_running >= 2) {
@@ -10328,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
* currently idle; in which case, kick the ILB to move tasks
* around.
*/
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+ for_each_cpu_and(i, sched_domain_span(sd), get_nohz()->idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
@@ -10405,8 +10419,8 @@ void nohz_balance_exit_idle(struct rq *rq)
return;

rq->nohz_tick_stopped = 0;
- cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ cpumask_clear_cpu(rq->cpu, get_nohz()->idle_cpus_mask);
+ atomic_dec(&get_nohz()->nr_cpus);

set_cpu_sd_state_busy(rq->cpu);
}
@@ -10467,8 +10481,8 @@ void nohz_balance_enter_idle(int cpu)

rq->nohz_tick_stopped = 1;

- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
+ cpumask_set_cpu(cpu, get_nohz()->idle_cpus_mask);
+ atomic_inc(&get_nohz()->nr_cpus);

/*
* Ensures that if nohz_idle_balance() fails to observe our
@@ -10484,7 +10498,7 @@ void nohz_balance_enter_idle(int cpu)
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle cpus
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(get_nohz()->has_blocked, 1);
}

static bool update_nohz_stats(struct rq *rq)
@@ -10494,7 +10508,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!rq->has_blocked_load)
return false;

- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ if (!cpumask_test_cpu(cpu, get_nohz()->idle_cpus_mask))
return false;

if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
@@ -10532,7 +10546,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(get_nohz()->has_blocked, 0);

/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10544,10 +10558,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* Start with the next CPU after this_cpu so we will end with this_cpu and let a
* chance for other idle cpu to pull load.
*/
- for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
- if (!idle_cpu(balance_cpu))
- continue;
-
+ for_each_cpu_wrap(balance_cpu, get_nohz()->idle_cpus_mask, this_cpu+1) {
/*
* If this CPU gets work to do, stop the load balancing
* work being done for other CPUs. Next load
@@ -10558,6 +10569,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
goto abort;
}

+ if (!idle_cpu(balance_cpu))
+ continue;
+
rq = cpu_rq(balance_cpu);

has_blocked_load |= update_nohz_stats(rq);
@@ -10589,15 +10603,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* updated.
*/
if (likely(update_next_balance))
- nohz.next_balance = next_balance;
+ get_nohz()->next_balance = next_balance;

- WRITE_ONCE(nohz.next_blocked,
+ WRITE_ONCE(get_nohz()->next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));

abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(get_nohz()->has_blocked, 1);
}

/*
@@ -10655,8 +10669,8 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;

/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
- time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ if (!READ_ONCE(get_nohz()->has_blocked) ||
+ time_before(jiffies, READ_ONCE(get_nohz()->next_blocked)))
return;

/*
@@ -11573,9 +11587,29 @@ __init void init_sched_fair_class(void)
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
- nohz.next_balance = jiffies;
- nohz.next_blocked = jiffies;
- zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ struct nohz *nohz;
+
+ nohz_nodes = kcalloc(1, sizeof(struct nohz *), GFP_NOWAIT);
+ nohz = kmalloc(sizeof(struct nohz), GFP_NOWAIT);
+ nohz->next_balance = jiffies;
+ nohz->next_blocked = jiffies;
+ zalloc_cpumask_var(&nohz->idle_cpus_mask, GFP_NOWAIT);
+ nohz_nodes[0] = nohz;
+ } else {
+ int n;
+
+ nohz_nodes = kcalloc(nr_node_ids, sizeof(struct nohz *), GFP_NOWAIT);
+ for_each_node(n) {
+ struct nohz *nohz;
+
+ nohz = kmalloc_node(sizeof(struct nohz), GFP_NOWAIT, n);
+ nohz->next_balance = jiffies;
+ nohz->next_blocked = jiffies;
+ zalloc_cpumask_var_node(&nohz->idle_cpus_mask, GFP_NOWAIT, n);
+ nohz_nodes[n] = nohz;
+ }
+ }
#endif
#endif /* SMP */

--
2.23.0


2021-07-01 16:51:33

by kernel test robot

Subject: Re: [PATCH] nohz: nohz idle balancing per node

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.13]
[cannot apply to tip/sched/core next-20210701]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/nohz-nohz-idle-balancing-per-node/20210701-135507
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git c54b245d011855ea91c5beff07f1db74143ce614
config: x86_64-randconfig-c022-20210630 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/1bede18329ddbaed4099e980fdf62092ff881932
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Nicholas-Piggin/nohz-nohz-idle-balancing-per-node/20210701-135507
git checkout 1bede18329ddbaed4099e980fdf62092ff881932
# save the attached .config to linux build tree
mkdir build_dir
make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

kernel/sched/fair.c:5437:6: warning: no previous prototype for 'init_cfs_bandwidth' [-Wmissing-prototypes]
5437 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
| ^~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:10937:6: warning: no previous prototype for 'task_vruntime_update' [-Wmissing-prototypes]
10937 | void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
| ^~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11470:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
11470 | void free_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11472:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
11472 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11477:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
11477 | void online_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11479:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
11479 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from arch/x86/include/asm/current.h:5,
from include/linux/sched.h:12,
from kernel/sched/sched.h:5,
from kernel/sched/fair.c:23:
kernel/sched/fair.c: In function 'init_sched_fair_class':
>> kernel/sched/fair.c:11590:30: error: 'housekeeping_overridden' undeclared (first use in this function); did you mean 'housekeeping_affine'?
11590 | if (static_branch_unlikely(&housekeeping_overridden)) {
| ^~~~~~~~~~~~~~~~~~~~~~~
include/linux/compiler.h:78:42: note: in definition of macro 'unlikely'
78 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
include/linux/jump_label.h:508:35: note: in expansion of macro 'unlikely_notrace'
508 | #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key))
| ^~~~~~~~~~~~~~~~
include/linux/jump_label.h:508:52: note: in expansion of macro 'static_key_enabled'
508 | #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key))
| ^~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11590:6: note: in expansion of macro 'static_branch_unlikely'
11590 | if (static_branch_unlikely(&housekeeping_overridden)) {
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11590:30: note: each undeclared identifier is reported only once for each function it appears in
11590 | if (static_branch_unlikely(&housekeeping_overridden)) {
| ^~~~~~~~~~~~~~~~~~~~~~~
include/linux/compiler.h:78:42: note: in definition of macro 'unlikely'
78 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
include/linux/jump_label.h:508:35: note: in expansion of macro 'unlikely_notrace'
508 | #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key))
| ^~~~~~~~~~~~~~~~
include/linux/jump_label.h:508:52: note: in expansion of macro 'static_key_enabled'
508 | #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key))
| ^~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:11590:6: note: in expansion of macro 'static_branch_unlikely'
11590 | if (static_branch_unlikely(&housekeeping_overridden)) {
| ^~~~~~~~~~~~~~~~~~~~~~


vim +11590 kernel/sched/fair.c

11583
11584 __init void init_sched_fair_class(void)
11585 {
11586 #ifdef CONFIG_SMP
11587 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11588
11589 #ifdef CONFIG_NO_HZ_COMMON
11590 if (static_branch_unlikely(&housekeeping_overridden)) {
11591 struct nohz *nohz;
11592
11593 nohz_nodes = kcalloc(1, sizeof(struct nohz *), GFP_NOWAIT);
11594 nohz = kmalloc(sizeof(struct nohz), GFP_NOWAIT);
11595 nohz->next_balance = jiffies;
11596 nohz->next_blocked = jiffies;
11597 zalloc_cpumask_var(&nohz->idle_cpus_mask, GFP_NOWAIT);
11598 nohz_nodes[0] = nohz;
11599 } else {
11600 int n;
11601
11602 nohz_nodes = kcalloc(nr_node_ids, sizeof(struct nohz *), GFP_NOWAIT);
11603 for_each_node(n) {
11604 struct nohz *nohz;
11605
11606 nohz = kmalloc_node(sizeof(struct nohz), GFP_NOWAIT, n);
11607 nohz->next_balance = jiffies;
11608 nohz->next_blocked = jiffies;
11609 zalloc_cpumask_var_node(&nohz->idle_cpus_mask, GFP_NOWAIT, n);
11610 nohz_nodes[n] = nohz;
11611 }
11612 }
11613 #endif
11614 #endif /* SMP */
11615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-07-01 16:56:29

by kernel test robot

Subject: Re: [PATCH] nohz: nohz idle balancing per node

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.13]
[cannot apply to tip/sched/core next-20210701]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/nohz-nohz-idle-balancing-per-node/20210701-135507
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git c54b245d011855ea91c5beff07f1db74143ce614
config: x86_64-randconfig-a016-20210630 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project e7e71e9454ed76c1b3d8140170b5333c28bef1be)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# https://github.com/0day-ci/linux/commit/1bede18329ddbaed4099e980fdf62092ff881932
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Nicholas-Piggin/nohz-nohz-idle-balancing-per-node/20210701-135507
git checkout 1bede18329ddbaed4099e980fdf62092ff881932
# save the attached .config to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
if (static_branch_unlikely(&housekeeping_overridden)) {
^
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
>> kernel/sched/fair.c:11590:30: error: use of undeclared identifier 'housekeeping_overridden'
fatal error: too many errors emitted, stopping now [-ferror-limit=]
20 errors generated.


vim +/housekeeping_overridden +11590 kernel/sched/fair.c

11583
11584 __init void init_sched_fair_class(void)
11585 {
11586 #ifdef CONFIG_SMP
11587 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11588
11589 #ifdef CONFIG_NO_HZ_COMMON
11590 if (static_branch_unlikely(&housekeeping_overridden)) {
11591 struct nohz *nohz;
11592
11593 nohz_nodes = kcalloc(1, sizeof(struct nohz *), GFP_NOWAIT);
11594 nohz = kmalloc(sizeof(struct nohz), GFP_NOWAIT);
11595 nohz->next_balance = jiffies;
11596 nohz->next_blocked = jiffies;
11597 zalloc_cpumask_var(&nohz->idle_cpus_mask, GFP_NOWAIT);
11598 nohz_nodes[0] = nohz;
11599 } else {
11600 int n;
11601
11602 nohz_nodes = kcalloc(nr_node_ids, sizeof(struct nohz *), GFP_NOWAIT);
11603 for_each_node(n) {
11604 struct nohz *nohz;
11605
11606 nohz = kmalloc_node(sizeof(struct nohz), GFP_NOWAIT, n);
11607 nohz->next_balance = jiffies;
11608 nohz->next_blocked = jiffies;
11609 zalloc_cpumask_var_node(&nohz->idle_cpus_mask, GFP_NOWAIT, n);
11610 nohz_nodes[n] = nohz;
11611 }
11612 }
11613 #endif
11614 #endif /* SMP */
11615

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-07-01 23:37:09

by Nicholas Piggin

Subject: Re: [PATCH] nohz: nohz idle balancing per node

Excerpts from Mel Gorman's message of July 1, 2021 11:11 pm:
> On Thu, Jul 01, 2021 at 12:18:18PM +0200, Peter Zijlstra wrote:
>> On Thu, Jul 01, 2021 at 03:53:23PM +1000, Nicholas Piggin wrote:
>> > Currently a single nohz idle CPU is designated to perform balancing on
>> > behalf of all other nohz idle CPUs in the system. Implement a per node
>> > nohz balancer to minimize cross-node memory accesses and runqueue lock
>> > acquisitions.
>> >
>> > On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
>> > with 32 clients/jobs (which is about where throughput maxes out due to
>> > IO and contention in postgres).
>>
>> Hmm, Suresh tried something like this around 2010 and then we ran into
>> trouble that when one node went completely idle and another node was
>> fully busy, the completely idle node would not run ILB and the node
>> would forever stay idle.
>>
>
> An effect like that *might* be visible at
> https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/hardy2/
> at the CPU usage heatmaps ordered by topology at the very bottom of
> the page.
>
> The heatmap covers all client counts so there are "blocks" of activity for
> each client count tested. The third block is for 8 thread counts so a node
> is not fully busy yet.

I'm not sure what I'm looking at. Where are these blocks? Along the x
axis?

> However, with the vanilla kernel, there is some
> load on each node but with the patch all the load is on one node. This
> did not happen on the two other test machines so the observation is not
> reliable and could be a total coincidence.

tbench is pretty finicky so it could be.

>
> That said, there were some gains but large losses depending on the client
> count across the 3 machines for tbench which is a concern. Other results,
> like pgbench mentioned in the changelog, will not complete until tomorrow
> to see if it is a general pattern or tbench-specific.
>
> https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/bing2/
> https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/hardy2/
> https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/marvin2/

All 2-node. How many runs does it do at each client count? There's a big
regression at one client count with one of them, but the other two have small
gains.

Thanks,
Nick

2021-07-01 23:41:20

by Nicholas Piggin

Subject: Re: [PATCH] nohz: nohz idle balancing per node

Excerpts from Peter Zijlstra's message of July 1, 2021 8:18 pm:
> On Thu, Jul 01, 2021 at 03:53:23PM +1000, Nicholas Piggin wrote:
>> Currently a single nohz idle CPU is designated to perform balancing on
>> behalf of all other nohz idle CPUs in the system. Implement a per node
>> nohz balancer to minimize cross-node memory accesses and runqueue lock
>> acquisitions.
>>
>> On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
>> with 32 clients/jobs (which is about where throughput maxes out due to
>> IO and contention in postgres).
>
> Hmm, Suresh tried something like this around 2010 and then we ran into
> trouble that when one node went completely idle and another node was
> fully busy, the completely idle node would not run ILB and the node
> would forever stay idle.

I've only changed which CPU runs ilb on behalf of others, not how the
balancing is done.

If one node is idle, then one of the CPUs in that node will be the nohz
balancer for the others in that node. In any other respect, the
balancing will be done the same as if the nohz balancer was on a
different node. So I don't see how it could get into that situation
unless I have a bug. Maybe Suresh's patch was doing something different.

> I've not gone through the code to see if that could still happen -- lots
> has changed since, but it is something to consider.
>
> Some further nits below..
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index fb469b26b00a..832f8673bba1 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5722,13 +5722,27 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>>
>> -static struct {
>> +struct nohz {
>> cpumask_var_t idle_cpus_mask;
>> atomic_t nr_cpus;
>> int has_blocked; /* Idle CPUS has blocked load */
>> unsigned long next_balance; /* in jiffy units */
>> unsigned long next_blocked; /* Next update of blocked load in jiffies */
>> -} nohz ____cacheline_aligned;
>> +} ____cacheline_aligned;
>> +
>> +static struct nohz **nohz_nodes __ro_after_init;
>> +
>> +static struct nohz *get_nohz(void)
>> +{
>> +#ifdef CONFIG_CPU_ISOLATION
>> + /*
>> + * May not have a house keeping CPU per node, do global idle balancing.
>> + */
>> + if (static_branch_unlikely(&housekeeping_overridden))
>> + return nohz_nodes[0];
>> +#endif
>> + return nohz_nodes[numa_node_id()];
>> +}
>
> *urgh* I'm not a fan of that isolation/hk behaviour. Even when the HK
> mask allows for a CPU per node, this code won't DTRT. Do we want a
> KERN_INFO message that performance will suffer here?

IIRC it turned out to be non-trivial to work out the housekeeping vs
online CPUs per node, but it might be doable later.
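
If it becomes worth doing later, a rough init-time check for "is there at
least one housekeeping CPU on every node" could look something like this
(untested sketch, not part of the patch; assumes cpumask_of_node() and
housekeeping_cpumask(HK_FLAG_MISC) are usable at that point, and glosses
over nodes that have memory but no CPUs):

/* Sketch only: does every node have at least one housekeeping CPU? */
static bool __init housekeeping_cpu_per_node(void)
{
	int n;

	for_each_node(n) {
		if (!cpumask_intersects(cpumask_of_node(n),
					housekeeping_cpumask(HK_FLAG_MISC)))
			return false;
	}
	return true;
}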

I'm not sure about a printk; that would just fire every time. Maybe a
generic printk or something in the docs to say housekeeping can affect
performance of non-isolated CPUs so make sure you measure.

>
>> #endif /* CONFIG_NO_HZ_COMMON */
>>
>> @@ -10291,14 +10305,14 @@ static void nohz_balancer_kick(struct rq *rq)
>> * None are in tickless mode and hence no need for NOHZ idle load
>> * balancing.
>> */
>> - if (likely(!atomic_read(&nohz.nr_cpus)))
>> + if (likely(!atomic_read(&get_nohz()->nr_cpus)))
>> return;
>>
>> - if (READ_ONCE(nohz.has_blocked) &&
>> - time_after(now, READ_ONCE(nohz.next_blocked)))
>> + if (READ_ONCE(get_nohz()->has_blocked) &&
>> + time_after(now, READ_ONCE(get_nohz()->next_blocked)))
>> flags = NOHZ_STATS_KICK;
>>
>> - if (time_before(now, nohz.next_balance))
>> + if (time_before(now, get_nohz()->next_balance))
>> goto out;
>>
>> if (rq->nr_running >= 2) {
>
> Also, stuff like the above, will result in horrific code-gen, because
> it cannot CSE get_nohz().
>

It was a somewhat mechanical conversion. I could use a local variable for
a couple of these cases.
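
For example, nohz_balancer_kick() could load the pointer once at the top
(sketch only, reusing the lines from the hunk quoted above):

	struct nohz *nohz = get_nohz();

	if (likely(!atomic_read(&nohz->nr_cpus)))
		return;

	if (READ_ONCE(nohz->has_blocked) &&
	    time_after(now, READ_ONCE(nohz->next_blocked)))
		flags = NOHZ_STATS_KICK;

	if (time_before(now, nohz->next_balance))
		goto out;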

Thanks,
Nick

2021-07-02 08:51:27

by Mel Gorman

Subject: Re: [PATCH] nohz: nohz idle balancing per node

On Fri, Jul 02, 2021 at 09:33:45AM +1000, Nicholas Piggin wrote:
> Excerpts from Mel Gorman's message of July 1, 2021 11:11 pm:
> > On Thu, Jul 01, 2021 at 12:18:18PM +0200, Peter Zijlstra wrote:
> >> On Thu, Jul 01, 2021 at 03:53:23PM +1000, Nicholas Piggin wrote:
> >> > Currently a single nohz idle CPU is designated to perform balancing on
> >> > behalf of all other nohz idle CPUs in the system. Implement a per node
> >> > nohz balancer to minimize cross-node memory accesses and runqueue lock
> >> > acquisitions.
> >> >
> >> > On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
> >> > with 32 clients/jobs (which is about where throughput maxes out due to
> >> > IO and contention in postgres).
> >>
> >> Hmm, Suresh tried something like this around 2010 and then we ran into
> >> trouble that when one node went completely idle and another node was
> >> fully busy, the completely idle node would not run ILB and the node
> >> would forever stay idle.
> >>
> >
> > An effect like that *might* be visible at
> > https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/hardy2/
> > at the CPU usage heatmaps ordered by topology at the very bottom of
> > the page.
> >
> > The heatmap covers all client counts so there are "blocks" of activity for
> > each client count tested. The third block is for 8 thread counts so a node
> > is not fully busy yet.
>
> I'm not sure what I'm looking at. Where are these blocks? Along the x
> axis?
>

The X axis is time. Each row is a CPU with a vertical line colored based on
the utilisation (white for idle, green for low utilisation, red for higher
utilisation). Along the Y axis, for a 2-socket machine, the top half is
one node, the bottom half is the second node. Each "pair" of rows, where
pairs are indicated on the left with the CPU number, are SMT siblings.

The "blocks" along the xaxis represent 3 minutes running the benchmark
for a given client which is why the pattern changes as it starts at 1
client and increases the client count over time.

> > However, with the vanilla kernel, there is some
> > load on each node but with the patch all the load is on one node. This
> > did not happen on the two other test machines so the observation is not
> > reliable and could be a total coincidence.
>
> tbench is pretty finicky so it could be.
>

It's even likely. It's also not triggering the situation Peter described --
"once node went completely idle and another node was fully busy", tbench
doesn't do this except by accident.

> >
> > That said, there were some gains but large losses depending on the client
> > count across the 3 machines for tbench which is a concern. Other results,
> > like pgbench mentioned in the changelog, will not complete until tomorrow
> > to see if it is a general pattern or tbench-specific.
> >
> > https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/bing2/
> > https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/hardy2/
> > https://beta.suse.com/private/mgorman/melt/v5.13/3-perf-test/sched/sched-nohznuma-v1r1/html/network-tbench/marvin2/
>
> All 2-node.

Yes, I only use a limited set of machines initially. It's only when
2-node passes that a series may get evaluated on large machines, zen*
generations etc.

> How many runs does it do at each client count? There's a big
> regression at one client count with one of them, but the other two have small
> gains.
>

tbench can be finicky so I treat it with caution even though I generally
use it as a sniff test. Only one iteration is run per thread count. What is
graphed is the reported throughput over time which is a dangerous metric
because each datapoint is average throughput since the test started. A more
robust metric would be to run the benchmark multiple times taking the final
throughput for each iteration. There are a few reasons why I didn't do that:

If the reported throughput is highly variable over time, that is an
interesting result in itself as it can imply that steady progress is
not being made.

Looking at how the workload behaves during an iteration is useful.

It takes an unreasonable amount of time to run it multiple times and
the full mix of scheduler tests I ran takes almost a day per tested
kernel. For example, these tests are not even complete yet and
probably won't be until late tonight.

I'm well aware of the limitations of testing tbench like this. dbench for
example is a similiarly designed workload except dbench gets patched to
report the time taken to process the loadfile once and uses that as a
primary metric instead of estimated throughput.

--
Mel Gorman
SUSE Labs

2021-07-07 07:51:41

by Vincent Guittot

Subject: Re: [PATCH] nohz: nohz idle balancing per node

On Thu, 1 Jul 2021 at 07:53, Nicholas Piggin <[email protected]> wrote:
>
> Currently a single nohz idle CPU is designated to perform balancing on
> behalf of all other nohz idle CPUs in the system. Implement a per node
> nohz balancer to minimize cross-node memory accesses and runqueue lock
> acquisitions.
>
> On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
> with 32 clients/jobs (which is about where throughput maxes out due to
> IO and contention in postgres).
>
> Performance consistency and drop-off after peak is much improved:
>
> https://user-images.githubusercontent.com/19833002/124070985-0e923a80-da82-11eb-81c8-6788be27dfdb.png
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> kernel/sched/core.c | 3 ++
> kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++--------------
> 2 files changed, 65 insertions(+), 28 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index cf16f8fda9a6..52681f11e98c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9067,6 +9067,9 @@ void __init sched_init(void)
> */
> init_idle(current, smp_processor_id());
>
> + /* Must run before init_sched_fair_class(), which tests housekeeping */
> + housekeeping_init();

You keep calling it after sched_init() in start_kernel()

> +
> calc_load_update = jiffies + LOAD_FREQ;
>
> #ifdef CONFIG_SMP
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index fb469b26b00a..832f8673bba1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5722,13 +5722,27 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
>
> #ifdef CONFIG_NO_HZ_COMMON
>
> -static struct {
> +struct nohz {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> int has_blocked; /* Idle CPUS has blocked load */
> unsigned long next_balance; /* in jiffy units */
> unsigned long next_blocked; /* Next update of blocked load in jiffies */
> -} nohz ____cacheline_aligned;
> +} ____cacheline_aligned;
> +
> +static struct nohz **nohz_nodes __ro_after_init;
> +
> +static struct nohz *get_nohz(void)
> +{
> +#ifdef CONFIG_CPU_ISOLATION
> + /*
> + * May not have a house keeping CPU per node, do global idle balancing.
> + */
> + if (static_branch_unlikely(&housekeeping_overridden))
> + return nohz_nodes[0];
> +#endif
> + return nohz_nodes[numa_node_id()];

This assumes that we call get_nohz() from the local cpu, or at least the local
node. Have you checked that this is always the case?

> +}
>
> #endif /* CONFIG_NO_HZ_COMMON */
>
> @@ -10217,7 +10231,7 @@ static inline int find_new_ilb(void)
> {
> int ilb;
>
> - for_each_cpu_and(ilb, nohz.idle_cpus_mask,
> + for_each_cpu_and(ilb, get_nohz()->idle_cpus_mask,
> housekeeping_cpumask(HK_FLAG_MISC)) {
>
> if (ilb == smp_processor_id())
> @@ -10243,7 +10257,7 @@ static void kick_ilb(unsigned int flags)
> * not if we only update stats.
> */
> if (flags & NOHZ_BALANCE_KICK)
> - nohz.next_balance = jiffies+1;
> + get_nohz()->next_balance = jiffies+1;
>
> ilb_cpu = find_new_ilb();
>
> @@ -10291,14 +10305,14 @@ static void nohz_balancer_kick(struct rq *rq)
> * None are in tickless mode and hence no need for NOHZ idle load
> * balancing.
> */
> - if (likely(!atomic_read(&nohz.nr_cpus)))
> + if (likely(!atomic_read(&get_nohz()->nr_cpus)))
> return;
>
> - if (READ_ONCE(nohz.has_blocked) &&
> - time_after(now, READ_ONCE(nohz.next_blocked)))
> + if (READ_ONCE(get_nohz()->has_blocked) &&
> + time_after(now, READ_ONCE(get_nohz()->next_blocked)))
> flags = NOHZ_STATS_KICK;
>
> - if (time_before(now, nohz.next_balance))
> + if (time_before(now, get_nohz()->next_balance))
> goto out;
>
> if (rq->nr_running >= 2) {
> @@ -10328,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
> * currently idle; in which case, kick the ILB to move tasks
> * around.
> */
> - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
> + for_each_cpu_and(i, sched_domain_span(sd), get_nohz()->idle_cpus_mask) {
> if (sched_asym_prefer(i, cpu)) {
> flags = NOHZ_KICK_MASK;
> goto unlock;
> @@ -10405,8 +10419,8 @@ void nohz_balance_exit_idle(struct rq *rq)
> return;
>
> rq->nohz_tick_stopped = 0;
> - cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
> - atomic_dec(&nohz.nr_cpus);
> + cpumask_clear_cpu(rq->cpu, get_nohz()->idle_cpus_mask);
> + atomic_dec(&get_nohz()->nr_cpus);
>
> set_cpu_sd_state_busy(rq->cpu);
> }
> @@ -10467,8 +10481,8 @@ void nohz_balance_enter_idle(int cpu)
>
> rq->nohz_tick_stopped = 1;
>
> - cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
> - atomic_inc(&nohz.nr_cpus);
> + cpumask_set_cpu(cpu, get_nohz()->idle_cpus_mask);
> + atomic_inc(&get_nohz()->nr_cpus);
>
> /*
> * Ensures that if nohz_idle_balance() fails to observe our
> @@ -10484,7 +10498,7 @@ void nohz_balance_enter_idle(int cpu)
> * Each time a cpu enter idle, we assume that it has blocked load and
> * enable the periodic update of the load of idle cpus
> */
> - WRITE_ONCE(nohz.has_blocked, 1);
> + WRITE_ONCE(get_nohz()->has_blocked, 1);
> }
>
> static bool update_nohz_stats(struct rq *rq)
> @@ -10494,7 +10508,7 @@ static bool update_nohz_stats(struct rq *rq)
> if (!rq->has_blocked_load)
> return false;
>
> - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> + if (!cpumask_test_cpu(cpu, get_nohz()->idle_cpus_mask))
> return false;
>
> if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
> @@ -10532,7 +10546,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * setting the flag, we are sure to not clear the state and not
> * check the load of an idle cpu.
> */
> - WRITE_ONCE(nohz.has_blocked, 0);
> + WRITE_ONCE(get_nohz()->has_blocked, 0);
>
> /*
> * Ensures that if we miss the CPU, we must see the has_blocked
> @@ -10544,10 +10558,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * Start with the next CPU after this_cpu so we will end with this_cpu and let a
> * chance for other idle cpu to pull load.
> */
> - for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
> - if (!idle_cpu(balance_cpu))
> - continue;
> -
> + for_each_cpu_wrap(balance_cpu, get_nohz()->idle_cpus_mask, this_cpu+1) {
> /*
> * If this CPU gets work to do, stop the load balancing
> * work being done for other CPUs. Next load
> @@ -10558,6 +10569,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> goto abort;
> }
>
> + if (!idle_cpu(balance_cpu))
> + continue;

The move of this test seems to be out of the scope of the patch subject

> +
> rq = cpu_rq(balance_cpu);
>
> has_blocked_load |= update_nohz_stats(rq);
> @@ -10589,15 +10603,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> * updated.
> */
> if (likely(update_next_balance))
> - nohz.next_balance = next_balance;
> + get_nohz()->next_balance = next_balance;
>
> - WRITE_ONCE(nohz.next_blocked,
> + WRITE_ONCE(get_nohz()->next_blocked,
> now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>
> abort:
> /* There is still blocked load, enable periodic update */
> if (has_blocked_load)
> - WRITE_ONCE(nohz.has_blocked, 1);
> + WRITE_ONCE(get_nohz()->has_blocked, 1);
> }
>
> /*
> @@ -10655,8 +10669,8 @@ static void nohz_newidle_balance(struct rq *this_rq)
> return;
>
> /* Don't need to update blocked load of idle CPUs*/
> - if (!READ_ONCE(nohz.has_blocked) ||
> - time_before(jiffies, READ_ONCE(nohz.next_blocked)))
> + if (!READ_ONCE(get_nohz()->has_blocked) ||
> + time_before(jiffies, READ_ONCE(get_nohz()->next_blocked)))
> return;
>
> /*
> @@ -11573,9 +11587,29 @@ __init void init_sched_fair_class(void)
> open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
>
> #ifdef CONFIG_NO_HZ_COMMON
> - nohz.next_balance = jiffies;
> - nohz.next_blocked = jiffies;
> - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> + if (static_branch_unlikely(&housekeeping_overridden)) {
> + struct nohz *nohz;
> +
> + nohz_nodes = kcalloc(1, sizeof(struct nohz *), GFP_NOWAIT);
> + nohz = kmalloc(sizeof(struct nohz), GFP_NOWAIT);
> + nohz->next_balance = jiffies;
> + nohz->next_blocked = jiffies;
> + zalloc_cpumask_var(&nohz->idle_cpus_mask, GFP_NOWAIT);
> + nohz_nodes[0] = nohz;
> + } else {
> + int n;
> +
> + nohz_nodes = kcalloc(nr_node_ids, sizeof(struct nohz *), GFP_NOWAIT);
> + for_each_node(n) {
> + struct nohz *nohz;
> +
> + nohz = kmalloc_node(sizeof(struct nohz), GFP_NOWAIT, n);
> + nohz->next_balance = jiffies;
> + nohz->next_blocked = jiffies;
> + zalloc_cpumask_var_node(&nohz->idle_cpus_mask, GFP_NOWAIT, n);
> + nohz_nodes[n] = nohz;
> + }
> + }
> #endif
> #endif /* SMP */
>
> --
> 2.23.0
>

2021-07-12 14:00:12

by Vincent Guittot

Subject: Re: [PATCH] nohz: nohz idle balancing per node

On Wed, 7 Jul 2021 at 09:50, Vincent Guittot <[email protected]> wrote:
>
> On Thu, 1 Jul 2021 at 07:53, Nicholas Piggin <[email protected]> wrote:
> >
> > Currently a single nohz idle CPU is designated to perform balancing on
> > behalf of all other nohz idle CPUs in the system. Implement a per node
> > nohz balancer to minimize cross-node memory accesses and runqueue lock
> > acquisitions.
> >
> > On a 4 node system, this improves performance by 9.3% on a 'pgbench -N'
> > with 32 clients/jobs (which is about where throughput maxes out due to
> > IO and contention in postgres).
> >
> > Performance consistency and drop-off after peak is much improved:
> >
> > https://user-images.githubusercontent.com/19833002/124070985-0e923a80-da82-11eb-81c8-6788be27dfdb.png

I ran some perf tests with the patch.

For hackbench, I see a significant improvement with only 1 group, and
other tests with more groups don't show any difference. I don't know
if the root cause of the improvement is that the ilb happens on the
local node or if we can run some ilb in parallel on each node:

                      tip                  tip + patch
hackbench 1 group:    13.683 (+/- 3.91%)   12.306 (+/- 1.59%)   +10.06%

I was about to say that I have a regression for tbench with a high
number of clients, but one iteration failed during the test and a new
run doesn't show any regression.

> >
> > Signed-off-by: Nicholas Piggin <[email protected]>
> > ---
> > kernel/sched/core.c | 3 ++
> > kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++--------------
> > 2 files changed, 65 insertions(+), 28 deletions(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index cf16f8fda9a6..52681f11e98c 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -9067,6 +9067,9 @@ void __init sched_init(void)
> > */
> > init_idle(current, smp_processor_id());
> >
> > + /* Must run before init_sched_fair_class(), which tests housekeeping */
> > + housekeeping_init();
>
> You keep calling it after sched_init() in start_kernel()
>
> > +
> > calc_load_update = jiffies + LOAD_FREQ;
> >
> > #ifdef CONFIG_SMP
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index fb469b26b00a..832f8673bba1 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5722,13 +5722,27 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
> >
> > #ifdef CONFIG_NO_HZ_COMMON
> >
> > -static struct {
> > +struct nohz {
> > cpumask_var_t idle_cpus_mask;
> > atomic_t nr_cpus;
> > int has_blocked; /* Idle CPUS has blocked load */
> > unsigned long next_balance; /* in jiffy units */
> > unsigned long next_blocked; /* Next update of blocked load in jiffies */
> > -} nohz ____cacheline_aligned;
> > +} ____cacheline_aligned;
> > +
> > +static struct nohz **nohz_nodes __ro_after_init;
> > +
> > +static struct nohz *get_nohz(void)
> > +{
> > +#ifdef CONFIG_CPU_ISOLATION
> > + /*
> > + * May not have a house keeping CPU per node, do global idle balancing.
> > + */
> > + if (static_branch_unlikely(&housekeeping_overridden))
> > + return nohz_nodes[0];
> > +#endif
> > + return nohz_nodes[numa_node_id()];
>
> This assumes that we call get_nohz() from the local cpu, or at least the local
> node. Have you checked that this is always the case?
>
> > +}
> >
> > #endif /* CONFIG_NO_HZ_COMMON */
> >
> > @@ -10217,7 +10231,7 @@ static inline int find_new_ilb(void)
> > {
> > int ilb;
> >
> > - for_each_cpu_and(ilb, nohz.idle_cpus_mask,
> > + for_each_cpu_and(ilb, get_nohz()->idle_cpus_mask,
> > housekeeping_cpumask(HK_FLAG_MISC)) {
> >
> > if (ilb == smp_processor_id())
> > @@ -10243,7 +10257,7 @@ static void kick_ilb(unsigned int flags)
> > * not if we only update stats.
> > */
> > if (flags & NOHZ_BALANCE_KICK)
> > - nohz.next_balance = jiffies+1;
> > + get_nohz()->next_balance = jiffies+1;
> >
> > ilb_cpu = find_new_ilb();
> >
> > @@ -10291,14 +10305,14 @@ static void nohz_balancer_kick(struct rq *rq)
> > * None are in tickless mode and hence no need for NOHZ idle load
> > * balancing.
> > */
> > - if (likely(!atomic_read(&nohz.nr_cpus)))
> > + if (likely(!atomic_read(&get_nohz()->nr_cpus)))
> > return;
> >
> > - if (READ_ONCE(nohz.has_blocked) &&
> > - time_after(now, READ_ONCE(nohz.next_blocked)))
> > + if (READ_ONCE(get_nohz()->has_blocked) &&
> > + time_after(now, READ_ONCE(get_nohz()->next_blocked)))
> > flags = NOHZ_STATS_KICK;
> >
> > - if (time_before(now, nohz.next_balance))
> > + if (time_before(now, get_nohz()->next_balance))
> > goto out;
> >
> > if (rq->nr_running >= 2) {
> > @@ -10328,7 +10342,7 @@ static void nohz_balancer_kick(struct rq *rq)
> > * currently idle; in which case, kick the ILB to move tasks
> > * around.
> > */
> > - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
> > + for_each_cpu_and(i, sched_domain_span(sd), get_nohz()->idle_cpus_mask) {
> > if (sched_asym_prefer(i, cpu)) {
> > flags = NOHZ_KICK_MASK;
> > goto unlock;
> > @@ -10405,8 +10419,8 @@ void nohz_balance_exit_idle(struct rq *rq)
> > return;
> >
> > rq->nohz_tick_stopped = 0;
> > - cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
> > - atomic_dec(&nohz.nr_cpus);
> > + cpumask_clear_cpu(rq->cpu, get_nohz()->idle_cpus_mask);
> > + atomic_dec(&get_nohz()->nr_cpus);
> >
> > set_cpu_sd_state_busy(rq->cpu);
> > }
> > @@ -10467,8 +10481,8 @@ void nohz_balance_enter_idle(int cpu)
> >
> > rq->nohz_tick_stopped = 1;
> >
> > - cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
> > - atomic_inc(&nohz.nr_cpus);
> > + cpumask_set_cpu(cpu, get_nohz()->idle_cpus_mask);
> > + atomic_inc(&get_nohz()->nr_cpus);
> >
> > /*
> > * Ensures that if nohz_idle_balance() fails to observe our
> > @@ -10484,7 +10498,7 @@ void nohz_balance_enter_idle(int cpu)
> > * Each time a cpu enter idle, we assume that it has blocked load and
> > * enable the periodic update of the load of idle cpus
> > */
> > - WRITE_ONCE(nohz.has_blocked, 1);
> > + WRITE_ONCE(get_nohz()->has_blocked, 1);
> > }
> >
> > static bool update_nohz_stats(struct rq *rq)
> > @@ -10494,7 +10508,7 @@ static bool update_nohz_stats(struct rq *rq)
> > if (!rq->has_blocked_load)
> > return false;
> >
> > - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> > + if (!cpumask_test_cpu(cpu, get_nohz()->idle_cpus_mask))
> > return false;
> >
> > if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
> > @@ -10532,7 +10546,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> > * setting the flag, we are sure to not clear the state and not
> > * check the load of an idle cpu.
> > */
> > - WRITE_ONCE(nohz.has_blocked, 0);
> > + WRITE_ONCE(get_nohz()->has_blocked, 0);
> >
> > /*
> > * Ensures that if we miss the CPU, we must see the has_blocked
> > @@ -10544,10 +10558,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> > * Start with the next CPU after this_cpu so we will end with this_cpu and let a
> > * chance for other idle cpu to pull load.
> > */
> > - for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
> > - if (!idle_cpu(balance_cpu))
> > - continue;
> > -
> > + for_each_cpu_wrap(balance_cpu, get_nohz()->idle_cpus_mask, this_cpu+1) {
> > /*
> > * If this CPU gets work to do, stop the load balancing
> > * work being done for other CPUs. Next load
> > @@ -10558,6 +10569,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> > goto abort;
> > }
> >
> > + if (!idle_cpu(balance_cpu))
> > + continue;
>
> The move of this test seems to be out of the scope of the patch subject
>
> > +
> > rq = cpu_rq(balance_cpu);
> >
> > has_blocked_load |= update_nohz_stats(rq);
> > @@ -10589,15 +10603,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
> > * updated.
> > */
> > if (likely(update_next_balance))
> > - nohz.next_balance = next_balance;
> > + get_nohz()->next_balance = next_balance;
> >
> > - WRITE_ONCE(nohz.next_blocked,
> > + WRITE_ONCE(get_nohz()->next_blocked,
> > now + msecs_to_jiffies(LOAD_AVG_PERIOD));
> >
> > abort:
> > /* There is still blocked load, enable periodic update */
> > if (has_blocked_load)
> > - WRITE_ONCE(nohz.has_blocked, 1);
> > + WRITE_ONCE(get_nohz()->has_blocked, 1);
> > }
> >
> > /*
> > @@ -10655,8 +10669,8 @@ static void nohz_newidle_balance(struct rq *this_rq)
> > return;
> >
> > /* Don't need to update blocked load of idle CPUs*/
> > - if (!READ_ONCE(nohz.has_blocked) ||
> > - time_before(jiffies, READ_ONCE(nohz.next_blocked)))
> > + if (!READ_ONCE(get_nohz()->has_blocked) ||
> > + time_before(jiffies, READ_ONCE(get_nohz()->next_blocked)))
> > return;
> >
> > /*
> > @@ -11573,9 +11587,29 @@ __init void init_sched_fair_class(void)
> > open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
> >
> > #ifdef CONFIG_NO_HZ_COMMON
> > - nohz.next_balance = jiffies;
> > - nohz.next_blocked = jiffies;
> > - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> > + if (static_branch_unlikely(&housekeeping_overridden)) {
> > + struct nohz *nohz;
> > +
> > + nohz_nodes = kcalloc(1, sizeof(struct nohz *), GFP_NOWAIT);
> > + nohz = kmalloc(sizeof(struct nohz), GFP_NOWAIT);
> > + nohz->next_balance = jiffies;
> > + nohz->next_blocked = jiffies;
> > + zalloc_cpumask_var(&nohz->idle_cpus_mask, GFP_NOWAIT);
> > + nohz_nodes[0] = nohz;
> > + } else {
> > + int n;
> > +
> > + nohz_nodes = kcalloc(nr_node_ids, sizeof(struct nohz *), GFP_NOWAIT);
> > + for_each_node(n) {
> > + struct nohz *nohz;
> > +
> > + nohz = kmalloc_node(sizeof(struct nohz), GFP_NOWAIT, n);
> > + nohz->next_balance = jiffies;
> > + nohz->next_blocked = jiffies;
> > + zalloc_cpumask_var_node(&nohz->idle_cpus_mask, GFP_NOWAIT, n);
> > + nohz_nodes[n] = nohz;
> > + }
> > + }
> > #endif
> > #endif /* SMP */
> >
> > --
> > 2.23.0
> >