2009-12-11 01:39:10

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

Having one idle CPU do the rebalancing for all the idle CPUs in
nohz mode does not scale well with an increasing number of cores and
sockets. Make the nohz_tracker per NUMA node. This results in multiple
idle load balancers, at most one per NUMA node, and each idle load
balancer only rebalances domains on behalf of the other nohz CPUs in
its own NUMA node.

This addresses the following problem with the current nohz ilb logic:
* The lone balancer may end up spending a lot of time doing the
  balancing on behalf of nohz CPUs, especially with an increasing
  number of sockets and cores in the platform.

Signed-off-by: Venkatesh Pallipadi <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>
---
kernel/sched.c | 177 +++++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 136 insertions(+), 41 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index aea2e32..1cc1485 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4535,22 +4535,90 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
* With new logic, idle load balancer tracks the rq->next_balance for all
* the idle CPUs and does idle load balancing only when needed.
*/
-static struct {
+struct nohz_tracker {
atomic_t load_balancer;
atomic_t first_pick_cpu;
atomic_t second_pick_cpu;
cpumask_var_t idle_cpus_mask;
cpumask_var_t tmp_nohz_mask;
unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
- .first_pick_cpu = ATOMIC_INIT(-1),
- .second_pick_cpu = ATOMIC_INIT(-1),
};

+static DEFINE_PER_CPU(struct nohz_tracker *, cpu_node_nohz_ptr);
+static struct nohz_tracker **nohz_tracker_ptrs;
+
+static int alloc_node_nohz_tracker(void)
+{
+ int i, j;
+
+ /* Do all the allocations only once per boot */
+ if (nohz_tracker_ptrs)
+ return 0;
+
+ nohz_tracker_ptrs = kzalloc(nr_node_ids * sizeof(struct nohz_tracker *),
+ GFP_KERNEL);
+ if (!nohz_tracker_ptrs) {
+ printk(KERN_WARNING "Can not alloc nohz trackers\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_node_ids; i++) {
+ nohz_tracker_ptrs[i] = kzalloc_node(sizeof(struct nohz_tracker),
+ GFP_KERNEL, i);
+ if (!nohz_tracker_ptrs[i]) {
+ printk(KERN_WARNING "Can not alloc domain group for "
+ "node %d\n", i);
+ goto free_ret;
+ }
+
+ if (!zalloc_cpumask_var_node(&nohz_tracker_ptrs[i]->idle_cpus_mask,
+ GFP_KERNEL, i)) {
+ kfree(nohz_tracker_ptrs[i]);
+ goto free_ret;
+ }
+
+ if (!zalloc_cpumask_var_node(&nohz_tracker_ptrs[i]->tmp_nohz_mask,
+ GFP_KERNEL, i)) {
+ free_cpumask_var(nohz_tracker_ptrs[i]->idle_cpus_mask);
+ kfree(nohz_tracker_ptrs[i]);
+ goto free_ret;
+ }
+ atomic_set(&nohz_tracker_ptrs[i]->load_balancer, -1);
+ atomic_set(&nohz_tracker_ptrs[i]->first_pick_cpu, -1);
+ atomic_set(&nohz_tracker_ptrs[i]->second_pick_cpu, -1);
+ }
+
+ return 0;
+
+free_ret:
+ for (j = 0; j < i; j++) {
+ free_cpumask_var(nohz_tracker_ptrs[j]->tmp_nohz_mask);
+ free_cpumask_var(nohz_tracker_ptrs[j]->idle_cpus_mask);
+ kfree(nohz_tracker_ptrs[j]);
+ }
+
+ kfree(nohz_tracker_ptrs);
+
+ for_each_online_cpu(i)
+ per_cpu(cpu_node_nohz_ptr, i) = NULL;
+
+ nohz_tracker_ptrs = NULL;
+ return -ENOMEM;
+}
+
+static int get_nohz_load_balancer_node(struct nohz_tracker *node_nohz)
+{
+ if (!node_nohz)
+ return -1;
+
+ return atomic_read(&node_nohz->load_balancer);
+}
+
int get_nohz_load_balancer(void)
{
- return atomic_read(&nohz.load_balancer);
+ int cpu = smp_processor_id();
+ struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+ return get_nohz_load_balancer_node(node_nohz);
}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -4591,6 +4659,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
/**
* is_semi_idle_group - Checks if the given sched_group is semi-idle.
* @ilb_group: group to be checked for semi-idleness
+ * @node_nohz: nohz_tracker for the node
*
* Returns: 1 if the group is semi-idle. 0 otherwise.
*
@@ -4598,26 +4667,30 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
* and atleast one non-idle CPU. This helper function checks if the given
* sched_group is semi-idle or not.
*/
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
+static inline int is_semi_idle_group(struct sched_group *ilb_group,
+ struct nohz_tracker *node_nohz)
{
- cpumask_and(nohz.tmp_nohz_mask, nohz.idle_cpus_mask,
+ cpumask_and(node_nohz->tmp_nohz_mask, node_nohz->idle_cpus_mask,
sched_group_cpus(ilb_group));

/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.tmp_nohz_mask))
+ if (cpumask_empty(node_nohz->tmp_nohz_mask))
return 0;

- if (cpumask_equal(nohz.tmp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(node_nohz->tmp_nohz_mask,
+ sched_group_cpus(ilb_group))) {
return 0;
+ }

return 1;
}
/**
* find_new_ilb - Finds the optimum idle load balancer for nomination.
* @cpu: The cpu which is nominating a new idle_load_balancer.
+ * @node_nohz: nohz_tracker for the node
*
* Returns: Returns the id of the idle load balancer if it exists,
* Else, returns >= nr_cpu_ids.
@@ -4627,7 +4700,7 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
* completely idle packages/cores just for the purpose of idle load balancing
* when there are other idle cpu's which are better suited for that job.
*/
-static int find_new_ilb(int cpu)
+static int find_new_ilb(int cpu, struct nohz_tracker *node_nohz)
{
struct sched_domain *sd;
struct sched_group *ilb_group;
@@ -4643,15 +4716,15 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.idle_cpus_mask) < 2)
+ if (cpumask_weight(node_nohz->idle_cpus_mask) < 2)
goto out_done;

for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups;

do {
- if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.tmp_nohz_mask);
+ if (is_semi_idle_group(ilb_group, node_nohz))
+ return cpumask_first(node_nohz->tmp_nohz_mask);

ilb_group = ilb_group->next;

@@ -4662,7 +4735,8 @@ out_done:
return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
+static inline int find_new_ilb(int call_cpu,
+ struct nohz_tracker *node_nohz)
{
return nr_cpu_ids;
}
@@ -4676,12 +4750,16 @@ static inline int find_new_ilb(int call_cpu)
static void nohz_balancer_kick(int cpu)
{
int ilb_cpu;
+ struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+
+ if (unlikely(!node_nohz))
+ return;

- nohz.next_balance++;
+ node_nohz->next_balance++;

- ilb_cpu = get_nohz_load_balancer();
+ ilb_cpu = get_nohz_load_balancer_node(node_nohz);
if (ilb_cpu < 0) {
- ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ ilb_cpu = cpumask_first(node_nohz->idle_cpus_mask);
if (ilb_cpu >= nr_cpu_ids)
return;
}
@@ -4709,51 +4787,55 @@ static void nohz_balancer_kick(int cpu)
void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
+ struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+
+ if (unlikely(!node_nohz))
+ return;

if (stop_tick) {
if (!cpu_active(cpu)) {
- if (atomic_read(&nohz.load_balancer) != cpu)
+ if (atomic_read(&node_nohz->load_balancer) != cpu)
return;

/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&node_nohz->load_balancer, cpu, -1) != cpu)
BUG();

return;
}

- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_cmpxchg(&nohz.first_pick_cpu, cpu, -1);
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, -1);
+ cpumask_set_cpu(cpu, node_nohz->idle_cpus_mask);
+ atomic_cmpxchg(&node_nohz->first_pick_cpu, cpu, -1);
+ atomic_cmpxchg(&node_nohz->second_pick_cpu, cpu, -1);

- if (atomic_read(&nohz.load_balancer) == -1) {
+ if (atomic_read(&node_nohz->load_balancer) == -1) {
int new_ilb;

/* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) != -1)
+ if (atomic_cmpxchg(&node_nohz->load_balancer, -1, cpu) != -1)
return;

/*
* Check to see if there is a more power-efficient
* ilb.
*/
- new_ilb = find_new_ilb(cpu);
+ new_ilb = find_new_ilb(cpu, node_nohz);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&node_nohz->load_balancer, -1);
resched_cpu(new_ilb);
}
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ if (!cpumask_test_cpu(cpu, node_nohz->idle_cpus_mask))
return;

- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ cpumask_clear_cpu(cpu, node_nohz->idle_cpus_mask);

- if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_read(&node_nohz->load_balancer) == cpu)
+ if (atomic_cmpxchg(&node_nohz->load_balancer, cpu, -1) != cpu)
BUG();
}
}
@@ -4857,8 +4939,13 @@ static void run_rebalance_domains(struct softirq_action *h)
*/
static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
{
+ struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, this_cpu);
+
rebalance_domains(this_cpu, CPU_IDLE);

+ if (unlikely(!node_nohz))
+ return;
+
/*
* If this cpu is the owner for idle load balancing, then do the
* balancing on behalf of the other idle cpus whose ticks are
@@ -4868,7 +4955,7 @@ static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
struct rq *rq;
int balance_cpu;

- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ for_each_cpu(balance_cpu, node_nohz->idle_cpus_mask) {
if (balance_cpu == this_cpu)
continue;

@@ -4886,7 +4973,7 @@ static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
- nohz.next_balance = this_rq->next_balance;
+ node_nohz->next_balance = this_rq->next_balance;
this_rq->nohz_balance_kick = 0;
}
}
@@ -4912,20 +4999,24 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
int ret;
+ struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);

- if (time_before(now, nohz.next_balance))
+ if (unlikely(!node_nohz))
+ return 0;
+
+ if (time_before(now, node_nohz->next_balance))
return 0;

if (!rq->nr_running)
return 0;

- ret = atomic_cmpxchg(&nohz.first_pick_cpu, -1, cpu);
+ ret = atomic_cmpxchg(&node_nohz->first_pick_cpu, -1, cpu);
if (ret == -1 || ret == cpu) {
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, -1);
+ atomic_cmpxchg(&node_nohz->second_pick_cpu, cpu, -1);
if (rq->nr_running > 1)
return 1;
} else {
- ret = atomic_cmpxchg(&nohz.second_pick_cpu, -1, cpu);
+ ret = atomic_cmpxchg(&node_nohz->second_pick_cpu, -1, cpu);
if (ret == -1 || ret == cpu) {
if (rq->nr_running)
return 1;
@@ -8878,6 +8969,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
goto error;
#endif

+ if (alloc_node_nohz_tracker())
+ goto error;
+
+ for_each_cpu(i, cpu_map) {
+ per_cpu(cpu_node_nohz_ptr, i) =
+ nohz_tracker_ptrs[cpu_to_node(i)];
+ }
+
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) {
@@ -9625,10 +9724,6 @@ void __init sched_init(void)
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
-#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.tmp_nohz_mask, GFP_NOWAIT);
-#endif
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

--
1.6.0.6

--


2009-12-14 22:21:37

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Thu, 2009-12-10 at 17:27 -0800, [email protected] wrote:
> Having one idle CPU doing the rebalancing for all the idle CPUs in
> nohz mode does not scale well with increasing number of cores and
> sockets. Make the nohz_tracker per NUMA node. This results in multiple
> idle load balancing happening at NUMA node level and idle load balancer
> only does the rebalance domain among all the other nohz CPUs in that
> NUMA node.
>
> This addresses the below problem with the current nohz ilb logic
> * The lone balancer may end up spending a lot of time doing the
> * balancing on
> behalf of nohz CPUs, especially with increasing number of sockets and
> cores in the platform.

If the purpose is to keep sockets idle, doing things per node doesn't
seem like a fine plan, since we're having nodes <= socket machines these
days.


2009-12-14 22:32:56

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Mon, 2009-12-14 at 14:21 -0800, Peter Zijlstra wrote:
> On Thu, 2009-12-10 at 17:27 -0800, [email protected] wrote:
> > Having one idle CPU doing the rebalancing for all the idle CPUs in
> > nohz mode does not scale well with increasing number of cores and
> > sockets. Make the nohz_tracker per NUMA node. This results in multiple
> > idle load balancing happening at NUMA node level and idle load balancer
> > only does the rebalance domain among all the other nohz CPUs in that
> > NUMA node.
> >
> > This addresses the below problem with the current nohz ilb logic
> > * The lone balancer may end up spending a lot of time doing the
> > * balancing on
> > behalf of nohz CPUs, especially with increasing number of sockets and
> > cores in the platform.
>
> If the purpose is to keep sockets idle, doing things per node doesn't
> seem like a fine plan, since we're having nodes <= socket machines these
> days.

The idea is to do idle balance only within the nodes.
E.g.: consider a 4-node (and 4-socket) system with each socket having
4 cores, and a single active thread on that system, say on socket 3.
Without this change we have 1 idle load balancer (which may be in socket
0) which has periodic ticks, and the remaining 14 cores will be tickless.
But this one idle load balancer does load balancing on behalf of itself +
14 other idle cores.

With the change proposed in this patch, we will have 3 completely idle
nodes/sockets. We will not do load balancing on these cores at all.
The one remaining active socket will have one idle load balancer, which,
when needed, will do idle load balancing on behalf of itself + 2 other
idle cores in that socket.

If all sockets have at least one busy core, then we may have more
than one idle load balancer, but each will only do idle load balancing
on behalf of the idle processors in its own node, so the total amount of
idle load balancing will be the same as now.

Thanks,
Venki

2009-12-14 22:58:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Mon, 2009-12-14 at 14:32 -0800, Pallipadi, Venkatesh wrote:
> On Mon, 2009-12-14 at 14:21 -0800, Peter Zijlstra wrote:
> > On Thu, 2009-12-10 at 17:27 -0800, [email protected] wrote:
> > > Having one idle CPU doing the rebalancing for all the idle CPUs in
> > > nohz mode does not scale well with increasing number of cores and
> > > sockets. Make the nohz_tracker per NUMA node. This results in multiple
> > > idle load balancing happening at NUMA node level and idle load balancer
> > > only does the rebalance domain among all the other nohz CPUs in that
> > > NUMA node.
> > >
> > > This addresses the below problem with the current nohz ilb logic
> > > * The lone balancer may end up spending a lot of time doing the
> > > * balancing on
> > > behalf of nohz CPUs, especially with increasing number of sockets and
> > > cores in the platform.
> >
> > If the purpose is to keep sockets idle, doing things per node doesn't
> > seem like a fine plan, since we're having nodes <= socket machines these
> > days.
>
> The idea is to do idle balance only within the nodes.
> Eg: 4 node (and 4 socket) system with each socket having 4 cores.
> If there is a single active thread on such a system, say on socket 3.
> Without this change we have 1 idle load balancer (which may be in socket
> 0) which has periodic ticks and remaining 14 cores will be tickless.
> But this one idle load balancer does load balance on behalf of itself +
> 14 other idle cores.
>
> With the change proposed in this patch, we will have 3 completely idle
> nodes/sockets. We will not do load balance on these cores at all.

That seems like a behavioural change, not balancing these 3 nodes at all
could lead to overload scenarios on the one active node, right?

> Remaining one active socket will have one idle load balancer, which when
> needed will do idle load balancing on behalf of itself + 2 other idle
> cores in that socket.

> If there all sockets have atleast one busy core, then we may have more
> than one idle load balancer, but each will only do idle load balance on
> behalf of idle processors in its own node, so total idle load balance
> will be same as now.

How about things like Magny-Cours which will have multiple nodes per
socket, wouldn't that be best served by having the total socket idle,
instead of just half of it?

2009-12-15 01:00:19

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Mon, 2009-12-14 at 14:58 -0800, Peter Zijlstra wrote:
> On Mon, 2009-12-14 at 14:32 -0800, Pallipadi, Venkatesh wrote:
> >
> > The idea is to do idle balance only within the nodes.
> > Eg: 4 node (and 4 socket) system with each socket having 4 cores.
> > If there is a single active thread on such a system, say on socket 3.
> > Without this change we have 1 idle load balancer (which may be in socket
> > 0) which has periodic ticks and remaining 14 cores will be tickless.
> > But this one idle load balancer does load balance on behalf of itself +
> > 14 other idle cores.
> >
> > With the change proposed in this patch, we will have 3 completely idle
> > nodes/sockets. We will not do load balance on these cores at all.
>
> That seems like a behavioural change, not balancing these 3 nodes at all
> could lead to overload scenarios on the one active node, right?
>

Yes. You are right. This can result in some node-level imbalance. The
main problem that we were trying to solve is the over-aggressive attempt
to load balance idle CPUs. On a system with 64 logical CPUs, if there is
only one active thread, we have seen one other CPU (the idle load
balancer) spending 3-5% of its time being non-idle just trying to do
load balancing on behalf of 63 idle CPUs on a continuous basis. It tries
an idle rebalance every jiffy across all nodes, even though balancing
across nodes has an interval of 8 or 16 jiffies. There are other forms
of rebalancing, like fork and exec, that will still balance across
nodes. But if there are no forks/execs, we will have the overload
scenario you pointed out.

I guess we need to look at other alternatives to make this cross-node
idle load balancing more intelligent. However, the first patch in this
series has its share of advantages in avoiding unneeded idle balancing.
And with the first patch, cross-node issues will be no worse than the
current state. So that patch is worthwhile as a stand-alone change as
well.

> > Remaining one active socket will have one idle load balancer, which when
> > needed will do idle load balancing on behalf of itself + 2 other idle
> > cores in that socket.
>
> > If there all sockets have atleast one busy core, then we may have more
> > than one idle load balancer, but each will only do idle load balance on
> > behalf of idle processors in its own node, so total idle load balance
> > will be same as now.
>
> How about things like Magny-Cours which will have multiple nodes per
> socket, wouldn't that be best served by having the total socket idle,
> instead of just half of it?
>

Yes. But that will be the same for general load balancing behavior, not
just idle load balancing. That would probably need another level in the
scheduler domains?

Thanks,
Venki

2009-12-15 10:22:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Mon, 2009-12-14 at 17:00 -0800, Pallipadi, Venkatesh wrote:
> On Mon, 2009-12-14 at 14:58 -0800, Peter Zijlstra wrote:
> > On Mon, 2009-12-14 at 14:32 -0800, Pallipadi, Venkatesh wrote:
> > >
> > > The idea is to do idle balance only within the nodes.
> > > Eg: 4 node (and 4 socket) system with each socket having 4 cores.
> > > If there is a single active thread on such a system, say on socket 3.
> > > Without this change we have 1 idle load balancer (which may be in socket
> > > 0) which has periodic ticks and remaining 14 cores will be tickless.
> > > But this one idle load balancer does load balance on behalf of itself +
> > > 14 other idle cores.
> > >
> > > With the change proposed in this patch, we will have 3 completely idle
> > > nodes/sockets. We will not do load balance on these cores at all.
> >
> > That seems like a behavioural change, not balancing these 3 nodes at all
> > could lead to overload scenarios on the one active node, right?
> >
>
> Yes. You are right. This can result in some node level imbalance. The
> main problem that we were trying to solve is over-aggressive attempt to
> load balance idle CPUs. We have seen on a system with 64 logical CPUs,
> if there is only active thread, we have seen one other CPU (the idle
> load balancer) spending 3-5% time being non-idle just trying to do load
> balance on behalf of 63 idle CPUs on a continuous basis. Trying idle
> rebalance every jiffy across all nodes when balance across nodes has
> interval of 8 or 16 jiffies. There are other forms of rebalancing like
> fork and exec that will still balance across nodes. But, if there are no
> forks/execs, we will have the overload scenario you pointed out.
>
> I guess we need to look at other alternatives to make this cross node
> idle load balancing more intelligent. However, first patch in this
> series has its share of advantages in avoiding unneeded idle balancing.
> And with first patch, cross node issues will be no worse than current
> state. So, that is worth as a stand alone change as well.

OK, I'll actually have a look at the patch now that I understand what
we're trying to do here ;-)

Thanks!

> > > Remaining one active socket will have one idle load balancer, which when
> > > needed will do idle load balancing on behalf of itself + 2 other idle
> > > cores in that socket.
> >
> > > If there all sockets have atleast one busy core, then we may have more
> > > than one idle load balancer, but each will only do idle load balance on
> > > behalf of idle processors in its own node, so total idle load balance
> > > will be same as now.
> >
> > How about things like Magny-Cours which will have multiple nodes per
> > socket, wouldn't that be best served by having the total socket idle,
> > instead of just half of it?
> >
>
> Yes. But, that will be same with general load balancing behavior and not
> just idle load balancing. That would probably need another level in
> scheduler domain?

Right, Andreas was supposed to look at doing that, not sure if he ever
got around to it though.

2009-12-21 13:12:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 2/2] sched: Scale the nohz_tracker logic by making it per NUMA node

On Thu, 2009-12-10 at 17:27 -0800, [email protected] wrote:
> plain text document attachment
> (0002-sched-Scale-the-nohz_tracker-logic-by-making-it-per.patch)
> Having one idle CPU doing the rebalancing for all the idle CPUs in
> nohz mode does not scale well with increasing number of cores and
> sockets. Make the nohz_tracker per NUMA node. This results in multiple
> idle load balancing happening at NUMA node level and idle load balancer
> only does the rebalance domain among all the other nohz CPUs in that
> NUMA node.
>
> This addresses the below problem with the current nohz ilb logic
> * The lone balancer may end up spending a lot of time doing the
> * balancing on
> behalf of nohz CPUs, especially with increasing number of sockets and
> cores in the platform.

Right, so I think the whole NODE idea here is wrong, it all seems to
work out properly if you simply pick one sched domain larger than the
one that contains all of the current socket and contains an idle unit.

Except that the sched domain stuff is not properly aware of bigger
topology things atm.

The sched domain tree should not view node as the largest structure and
we should remove that current random node split crap we have.

Instead the sched domains should continue to express the topology, like
nodes within 1 hop, nodes within 2 hops, etc.

Then this nohz idle balancing should pick the socket level (which might
be larger than the node level), and walk up the domain tree until we
reach a level that has a whole idle group.

This means that we'll always span at least 2 sockets, which means we'll
gracefully deal with the overload scenario.