2011-05-05 12:53:23

by Hillf Danton

Subject: [PATCH] sched: fix constructing the span cpu mask of sched domain

For a given node, when constructing the cpumask for its sched_domain
to span, if there is no best node available after searching, further
effort could be saved, based on a small change in the return value of
find_next_best_node().

Signed-off-by: Hillf Danton <[email protected]>
---

--- a/kernel/sched.c 2011-04-27 11:48:50.000000000 +0800
+++ b/kernel/sched.c 2011-05-05 20:44:52.000000000 +0800
@@ -6787,7 +6787,7 @@ init_sched_build_groups(const struct cpu
*/
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
- int i, n, val, min_val, best_node = 0;
+ int i, n, val, min_val, best_node = -1;

min_val = INT_MAX;

@@ -6811,7 +6811,8 @@ static int find_next_best_node(int node,
}
}

- node_set(best_node, *used_nodes);
+ if (best_node != -1)
+ node_set(best_node, *used_nodes);
return best_node;
}

@@ -6837,7 +6838,8 @@ static void sched_domain_node_span(int n

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes);
-
+ if (next_node < 0)
+ break;
cpumask_or(span, span, cpumask_of_node(next_node));
}
}
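
For readers skimming the thread, a minimal stand-alone sketch of the patched
logic may help; note this is not the kernel code: the node count, the distance
table and the bitmask bookkeeping below are invented for illustration, and the
!nr_cpus_node() check is omitted.

/* sketch of the greedy span construction with the -1 sentinel */
#include <limits.h>
#include <stdio.h>

#define NR_NODES		4
#define SD_NODES_PER_DOMAIN	16

/* hypothetical symmetric distance table (SLIT-style, invented) */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

/* closest node not yet in *used; returns -1 once nothing is left */
static int find_next_best_node(int node, unsigned int *used)
{
	int i, n, val, min_val = INT_MAX, best_node = -1;

	for (i = 0; i < NR_NODES; i++) {
		n = (node + i) % NR_NODES;
		if (*used & (1u << n))	/* skip already used nodes */
			continue;
		val = dist[node][n];
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	if (best_node != -1)
		*used |= 1u << best_node;
	return best_node;
}

int main(void)
{
	unsigned int used = 1u << 0;	/* the span starts with node 0 itself */
	int i;

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next_node = find_next_best_node(0, &used);
		if (next_node < 0)
			break;	/* all nodes consumed: stop instead of looping on */
		printf("add node %d to the span\n", next_node);
	}
	return 0;
}

Without the sentinel, the old code keeps returning node 0 (and setting it in
used_nodes again) for the remaining iterations; with it, the span loop stops
as soon as every node has been consumed.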


2011-05-06 07:12:38

by Ingo Molnar

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain


* Hillf Danton <[email protected]> wrote:

> For a given node, when constructing the cpumask for its sched_domain
> to span, if there is no best node available after searching, further
> efforts could be saved, based on small change in the return value of
> find_next_best_node().
>
> Signed-off-by: Hillf Danton <[email protected]>
> ---
>
> --- a/kernel/sched.c 2011-04-27 11:48:50.000000000 +0800
> +++ b/kernel/sched.c 2011-05-05 20:44:52.000000000 +0800
> @@ -6787,7 +6787,7 @@ init_sched_build_groups(const struct cpu
> */
> static int find_next_best_node(int node, nodemask_t *used_nodes)
> {
> - int i, n, val, min_val, best_node = 0;
> + int i, n, val, min_val, best_node = -1;
>
> min_val = INT_MAX;
>
> @@ -6811,7 +6811,8 @@ static int find_next_best_node(int node,
> }
> }
>
> - node_set(best_node, *used_nodes);
> + if (best_node != -1)
> + node_set(best_node, *used_nodes);
> return best_node;
> }
>
> @@ -6837,7 +6838,8 @@ static void sched_domain_node_span(int n
>
> for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
> int next_node = find_next_best_node(node, &used_nodes);
> -
> + if (next_node < 0)
> + break;
> cpumask_or(span, span, cpumask_of_node(next_node));
> }
> }

Looks good. I changed the title from:

sched: fix constructing the span cpu mask of sched domain

to:

sched: Shorten the construction of the span cpu mask of sched domain

Because unless i missed some side effect it really does not 'fix' anything, it
avoids repetitive (and ultimately fruitless) extra work, right?

Thanks,

Ingo

2011-05-06 07:40:48

by Hillf Danton

Subject: [tip:sched/core] sched: Shorten the construction of the span cpu mask of sched domain

Commit-ID: 7142d17e8f935fa842e9f6eece2281b6d41625d6
Gitweb: http://git.kernel.org/tip/7142d17e8f935fa842e9f6eece2281b6d41625d6
Author: Hillf Danton <[email protected]>
AuthorDate: Thu, 5 May 2011 20:53:20 +0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 6 May 2011 09:13:05 +0200

sched: Shorten the construction of the span cpu mask of sched domain

For a given node, when constructing the cpumask for its
sched_domain to span, if there is no best node available after
searching, further effort could be saved, based on a small change
in the return value of find_next_best_node().

Signed-off-by: Hillf Danton <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Yong Zhang <[email protected]>
Link: http://lkml.kernel.org/r/BANLkTi%3DqPWxRAa6%2BdT3ohEP6Z%3D0v%[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3d8a1b2..da93381 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6807,7 +6807,7 @@ __setup("isolcpus=", isolated_cpu_setup);
*/
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
- int i, n, val, min_val, best_node = 0;
+ int i, n, val, min_val, best_node = -1;

min_val = INT_MAX;

@@ -6831,7 +6831,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
}
}

- node_set(best_node, *used_nodes);
+ if (best_node != -1)
+ node_set(best_node, *used_nodes);
return best_node;
}

@@ -6857,7 +6858,8 @@ static void sched_domain_node_span(int node, struct cpumask *span)

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes);
-
+ if (next_node < 0)
+ break;
cpumask_or(span, span, cpumask_of_node(next_node));
}
}

2011-05-10 08:29:10

by Peter Zijlstra

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Thu, 2011-05-05 at 20:53 +0800, Hillf Danton wrote:
> For a given node, when constructing the cpumask for its sched_domain
> to span, if there is no best node available after searching, further
> efforts could be saved, based on small change in the return value of
> find_next_best_node().
>
> Signed-off-by: Hillf Danton <[email protected]>
> ---
>
> --- a/kernel/sched.c 2011-04-27 11:48:50.000000000 +0800
> +++ b/kernel/sched.c 2011-05-05 20:44:52.000000000 +0800
> @@ -6787,7 +6787,7 @@ init_sched_build_groups(const struct cpu
> */
> static int find_next_best_node(int node, nodemask_t *used_nodes)
> {
> - int i, n, val, min_val, best_node = 0;
> + int i, n, val, min_val, best_node = -1;
>
> min_val = INT_MAX;
>
> @@ -6811,7 +6811,8 @@ static int find_next_best_node(int node,
> }
> }
>
> - node_set(best_node, *used_nodes);
> + if (best_node != -1)
> + node_set(best_node, *used_nodes);
> return best_node;
> }
>
> @@ -6837,7 +6838,8 @@ static void sched_domain_node_span(int n
>
> for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
> int next_node = find_next_best_node(node, &used_nodes);
> -
> + if (next_node < 0)
> + break;
> cpumask_or(span, span, cpumask_of_node(next_node));
> }
> }


If you're interested in this area of the scheduler, you might want to
have a poke at:

http://marc.info/?l=linux-kernel&m=130218515520540

That tries to rewrite the CONFIG_NUMA support for the sched_domain stuff
to create domains based on the node_distance() to better reflect the
actual machine topology.

As stated, that patch is currently very broken, mostly because the
topologies encountered don't map to non-overlapping trees. I've not yet
come up with how to deal with that, but we sure need to do something
like it: the current scheme of grouping 16 nodes plus a group of all
nodes simply doesn't work well for today's machines, now that NUMA is
both common and inter-node latencies are more relevant.
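
As a rough illustration of the idea (a user-space sketch with an invented
distance table, not the actual rewrite), the distinct node_distance() values
seen by a node can be collected into "levels", with one domain span per level:

/* sketch: derive sched_domain levels from distinct node distances */
#include <stdio.h>

#define NR_NODES	4

/* hypothetical distance table (invented for illustration) */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES], nr_levels = 0;
	int i, j, k;

	/* collect the distinct distances seen from node 0, sorted ascending */
	for (j = 0; j < NR_NODES; j++) {
		int d = dist[0][j];

		for (i = 0; i < nr_levels && levels[i] != d; i++)
			;
		if (i < nr_levels)
			continue;	/* distance already known */
		for (i = nr_levels++; i > 0 && levels[i - 1] > d; i--)
			levels[i] = levels[i - 1];
		levels[i] = d;
	}

	/* one span per level: every node within that distance of node 0 */
	for (i = 0; i < nr_levels; i++) {
		printf("level %d (distance <= %d):", i, levels[i]);
		for (k = 0; k < NR_NODES; k++)
			if (dist[0][k] <= levels[i])
				printf(" node%d", k);
		printf("\n");
	}
	return 0;
}

On this table the spans grow from {node0} through {node0,node1,node2} to all
four nodes, which is the kind of nesting the rewrite aims for instead of the
fixed 16-node grouping.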

2011-05-10 12:29:39

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Tue, May 10, 2011 at 4:32 PM, Peter Zijlstra <[email protected]> wrote:
>
> If you're interested in this area of the scheduler, you might want to
> have a poke at:
>
> http://marc.info/?l=linux-kernel&m=130218515520540
>
> That tries to rewrite the CONFIG_NUMA support for the sched_domain stuff
> to create domains based on the node_distance() to better reflect the
> actual machine topology.
>
Hi Peter

Thank you very much for sharing the info.

Hillf

2011-05-11 16:07:01

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Tue, May 10, 2011 at 4:32 PM, Peter Zijlstra <[email protected]> wrote:
> If you're interested in this area of the scheduler, you might want to
> have a poke at:
>
> http://marc.info/?l=linux-kernel&m=130218515520540
>
> That tries to rewrite the CONFIG_NUMA support for the sched_domain stuff
> to create domains based on the node_distance() to better reflect the
> actual machine topology.
>
> As stated, that patch is currently very broken, mostly because the
> topologies encountered don't map to non-overlapping trees. I've not yet
> come up with how to deal with that, but we sure need to do something
> like that, the current group 16 nodes and a group of all simply doesn't
> work well for today's machines now that NUMA is both common and the
> inter-node latencies are more relevant.
>

Hi Peter

Your work on rewriting NUMA support, published at
http://marc.info/?l=linux-kernel&m=130218515520540
is patched below by changing how the levels are computed and how they are
used to build the masks.

When computing the levels, some valid levels are lost in your version.

When building a mask, nodes are selected only if they are at exactly the
same distance, so nodes at smaller distances are also masked out, since
the computation of the levels is now exhaustive.

Without NUMA hardware, I did not test the patch :(

Hillf
---

--- numa_by_peter.c 2011-05-11 20:22:10.000000000 +0800
+++ numa_by_hillf.c 2011-05-11 21:06:26.000000000 +0800
@@ -1,6 +1,5 @@
static void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
int level = 0;
int i, j, k;
@@ -11,21 +10,34 @@ static void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;

- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- int distance = node_distance(0, j);
- printk("distance(0,%d): %d\n", j, distance);
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ printk("distance(0,%d): %d\n", j, distance);
+ if (j == 0) {
+ sched_domains_numa_distance[j] = distance;
+ sched_domains_numa_levels = ++level;
+ continue;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
+ for (i = 0; i < level; i++) {
+ /* check if already exist */
+ if (distance == sched_domains_numa_distance[i])
+ goto next_node;
+ /* sort and insert it */
+ if (distance < sched_domains_numa_distance[i])
+ break;
+ }
+ if (i == level) {
+ sched_domains_numa_distance[level++] = distance;
sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+ continue;
+ }
+ for (k = level -1; k >= i; k--)
+ sched_domains_numa_distance[k+1] =
+ sched_domains_numa_distance[k];
+ sched_domains_numa_distance[i] = distance;
+ sched_domains_numa_levels = ++level;
+next_node:
+ ;
}

sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
@@ -44,8 +56,9 @@ static void sched_init_numa(void)
struct cpumask *mask =
per_cpu_ptr(sched_domains_numa_masks[i], j);

+ cpumask_clear(mask);
for (k = 0; k < nr_node_ids; k++) {
- if (node_distance(cpu_to_node(j), k) >
+ if (node_distance(cpu_to_node(j), k) !=
sched_domains_numa_distance[i])
continue;

2011-05-11 15:59:20

by Peter Zijlstra

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
> Your work for rewriting NUMA support, published at
> http://marc.info/?l=linux-kernel&m=130218515520540
> is patched by changing how level is computed and by changing how it is
> used to build the mask.
>
> When computing, some valid levels are lost in your work.
>
> When building mask, nodes are selected only if they have same distance,
> thus nodes of less distance are also masked out since the computation of
> level now is tough.
>
> Without NUMA hardware, I did not test the patch:(

I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
old code too; Andreas Herrmann from AMD (CC'ed) is usually willing to
test such patches on somewhat larger systems. Please send a full patch
against tip/master for him to apply.

2011-05-11 16:07:45

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
>> Your work for rewriting NUMA support, published at
>>          http://marc.info/?l=linux-kernel&m=130218515520540
>> is patched by changing how level is computed and by changing how it is
>> used to build the mask.
>>
>> When computing, some valid levels are lost in your work.
>>
>> When building mask, nodes are selected only if they have same distance,
>> thus nodes of less distance are also masked out since the computation of
>> level now is tough.
>>
>> Without NUMA hardware, I did not test the patch:(
>
> I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
> old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
> test such patches on somewhat larger systems. Please send a full patch
> against tip/master for him to apply.
>
Oh my God, I am not good at playing games with git, and have already been
laughed at by many guys on LKML. As you can see, I deliver all my work as
plain diffs over a 512K ADSL line, so things like "git clone" sound really
monstrous.

Would you please, Peter, teach me how to play with git, since I want to do more?

And the patch is delivered to Andreas this time.

Hillf
---

--- numa_by_peter.c 2011-05-11 20:22:10.000000000 +0800
+++ numa_by_hillf.c 2011-05-11 21:06:26.000000000 +0800
@@ -1,6 +1,5 @@
static void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
int level = 0;
int i, j, k;
@@ -11,21 +10,34 @@ static void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;

- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- int distance = node_distance(0, j);
- printk("distance(0,%d): %d\n", j, distance);
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ printk("distance(0,%d): %d\n", j, distance);
+ if (j == 0) {
+ sched_domains_numa_distance[j] = distance;
+ sched_domains_numa_levels = ++level;
+ continue;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
+ for (i = 0; i < level; i++) {
+ /* check if already exist */
+ if (distance == sched_domains_numa_distance[i])
+ goto next_node;
+ /* sort and insert it */
+ if (distance < sched_domains_numa_distance[i])
+ break;
+ }
+ if (i == level) {
+ sched_domains_numa_distance[level++] = distance;
sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+ continue;
+ }
+ for (k = level -1; k >= i; k--)
+ sched_domains_numa_distance[k+1] =
+ sched_domains_numa_distance[k];
+ sched_domains_numa_distance[i] = distance;
+ sched_domains_numa_levels = ++level;
+next_node:
+ ;
}

sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
@@ -44,8 +56,9 @@ static void sched_init_numa(void)
struct cpumask *mask =
per_cpu_ptr(sched_domains_numa_masks[i], j);

+ cpumask_clear(mask);
for (k = 0; k < nr_node_ids; k++) {
- if (node_distance(cpu_to_node(j), k) >
+ if (node_distance(cpu_to_node(j), k) !=
sched_domains_numa_distance[i])
continue;

2011-05-11 17:08:20

by Ingo Molnar

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain


* Hillf Danton <[email protected]> wrote:

> On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
> > On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
> >> Your work for rewriting NUMA support, published at
> >>          http://marc.info/?l=linux-kernel&m=130218515520540
> >> is patched by changing how level is computed and by changing how it is
> >> used to build the mask.
> >>
> >> When computing, some valid levels are lost in your work.
> >>
> >> When building mask, nodes are selected only if they have same distance,
> >> thus nodes of less distance are also masked out since the computation of
> >> level now is tough.
> >>
> >> Without NUMA hardware, I did not test the patch:(
> >
> > I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
> > old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
> > test such patches on somewhat larger systems. Please send a full patch
> > against tip/master for him to apply.
> >
> Oh my God, I am not good at playing game with git, already laughed by many
> guys on LKML. As you see, I delivered all works based on diff through 512K ADSL,
> things like "git clone" sounds really monstrous.
>
> Would you please, Peter, teach me how to play git, since I want to do more?

Here's how you fetch the scheduler tree:

http://people.redhat.com/mingo/tip.git/README

and here's the Kernel Hacker's Guide to Git:

http://linux.yyz.us/git-howto.html

Thanks,

Ingo

2011-05-11 16:03:29

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Wed, May 11, 2011 at 10:15 PM, Ingo Molnar <[email protected]> wrote:
>
> Here's how you fetch the scheduler tree:
>
>  http://people.redhat.com/mingo/tip.git/README
>
> and here's the Kernel Hacker's Guide to Git:
>
>  http://linux.yyz.us/git-howto.html
>

Thank you, Ingo, very much for guiding git and scheduler tree.

I will try git clone the scheduler tree this weekend.

Thanks,
Hillf

2011-05-13 13:06:05

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
>> Your work for rewriting NUMA support, published at
>>          http://marc.info/?l=linux-kernel&m=130218515520540
>> is patched by changing how level is computed and by changing how it is
>> used to build the mask.
>>
>> When computing, some valid levels are lost in your work.
>>
>> When building mask, nodes are selected only if they have same distance,
>> thus nodes of less distance are also masked out since the computation of
>> level now is tough.
>>
>> Without NUMA hardware, I did not test the patch:(
>
> I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
> old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
> test such patches on somewhat larger systems. Please send a full patch
> against tip/master for him to apply.
>
>

Hi Peter

In your original work, there is also a mismatch between how the levels are
computed and how they are used to build the cpu masks.

When computing the levels, the distance is taken from node_distance(0, j)
instead of node_distance(i, j), and I do not think it is a typo, since the
printk also uses distance(0,%d).

When building the cpu masks, however, the distance is computed per
for_each_possible_cpu(j) as node_distance(cpu_to_node(j), k).

Thus I am concerned there may be distances not covered by any level; see below.

If the test result from Andreas is positive, I am over-concerned; otherwise
please take the mismatch into your consideration.
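
To make the concern concrete, here is a tiny sketch of the coverage check that
the patch below adds at the end; the distance table is invented, and real SLIT
tables may or may not look like this:

/* sketch: a pairwise distance that the node-0 levels do not cover */
#include <stdio.h>

#define NR_NODES	4

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 40, 20 },	/* dist(1,2) == 40 never shows up in row 0 */
	{ 20, 40, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES], nr_levels = 0;
	int i, j, k;

	/* levels are derived from node 0 only, as in the patch */
	for (j = 0; j < NR_NODES; j++) {
		int d = dist[0][j];

		for (i = 0; i < nr_levels && levels[i] != d; i++)
			;
		if (i == nr_levels)
			levels[nr_levels++] = d;
	}

	/* coverage check over all node pairs */
	for (j = 0; j < NR_NODES; j++)
		for (k = 0; k < NR_NODES; k++) {
			int d = dist[j][k];

			for (i = 0; i < nr_levels && levels[i] != d; i++)
				;
			if (i == nr_levels)
				printf("distance(%d,%d): %d not covered by level\n",
				       j, k, d);
		}
	return 0;
}

With such a table, nodes 1 and 2 are never grouped at their real distance,
which is exactly what the added printk would report.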


thanks
Hillf
---

--- numa_by_peter.c 2011-05-11 20:22:10.000000000 +0800
+++ numa_by_hillf.c 2011-05-13 20:36:46.000000000 +0800
@@ -1,6 +1,5 @@
static void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
int level = 0;
int i, j, k;
@@ -11,21 +10,29 @@ static void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;

- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- int distance = node_distance(0, j);
- printk("distance(0,%d): %d\n", j, distance);
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ printk("distance(0,%d): %d\n", j, distance);
+ for (i = 0; i < level; i++) {
+ /* check if already exist */
+ if (distance == sched_domains_numa_distance[i])
+ goto next_node;
+ /* sort and insert it */
+ if (distance < sched_domains_numa_distance[i])
+ break;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
+ if (i == level) {
+ sched_domains_numa_distance[level++] = distance;
sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+ continue;
+ }
+ for (k = level -1; k >= i; k--)
+ sched_domains_numa_distance[k+1] =
+ sched_domains_numa_distance[k];
+ sched_domains_numa_distance[i] = distance;
+ sched_domains_numa_levels = ++level;
+next_node:
+ ;
}

sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
@@ -44,8 +51,9 @@ static void sched_init_numa(void)
struct cpumask *mask =
per_cpu_ptr(sched_domains_numa_masks[i], j);

+ cpumask_clear(mask);
for (k = 0; k < nr_node_ids; k++) {
- if (node_distance(cpu_to_node(j), k) >
+ if (node_distance(cpu_to_node(j), k) !=
sched_domains_numa_distance[i])
continue;

@@ -57,6 +65,17 @@ static void sched_init_numa(void)
}
}

+ for (j = 0; j < nr_node_ids; j++)
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(j, k);
+ for (i = 0; i < level; i++)
+ if (distance == sched_domains_numa_distance[i])
+ goto covered;
+ printk("distance(%d,%d): %d not covered by level\n",
+ j, k, distance);
+ covered:
+ ;
+ }
tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)

2011-05-15 05:50:45

by Hillf Danton

[permalink] [raw]
Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
>> Your work for rewriting NUMA support, published at
>>          http://marc.info/?l=linux-kernel&m=130218515520540
>> is patched by changing how level is computed and by changing how it is
>> used to build the mask.
>>
>> When computing, some valid levels are lost in your work.
>>
>> When building mask, nodes are selected only if they have same distance,
>> thus nodes of less distance are also masked out since the computation of
>> level now is tough.
>>
>> Without NUMA hardware, I did not test the patch:(
>
> I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
> old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
> test such patches on somewhat larger systems. Please send a full patch
> against tip/master for him to apply.
>

Hi Peter

With the guidance from Ingo on fetching tip/master with git, the work is
now finished :)

Hopefully it is not too late for Andreas.

In the patch below, a check is also added for distances not covered by any
level; please review again.

thanks
Hillf
---
include/linux/topology.h | 25 -----
kernel/sched.c | 220 ++++++++++++++++++++++++++--------------------
2 files changed, 126 insertions(+), 119 deletions(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index b91a40e..fce56c8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -176,31 +176,6 @@ int arch_update_cpu_topology(void);
}
#endif

-/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) { \
- .min_interval = 64, \
- .max_interval = 64*num_online_cpus(), \
- .busy_factor = 128, \
- .imbalance_pct = 133, \
- .cache_nice_tries = 1, \
- .busy_idx = 3, \
- .idle_idx = 3, \
- .flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
- | 0*SD_BALANCE_EXEC \
- | 0*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
- | 0*SD_WAKE_AFFINE \
- | 0*SD_SHARE_CPUPOWER \
- | 0*SD_POWERSAVINGS_BALANCE \
- | 0*SD_SHARE_PKG_RESOURCES \
- | 1*SD_SERIALIZE \
- | 0*SD_PREFER_SIBLING \
- , \
- .last_balance = jiffies, \
- .balance_interval = 64, \
-}
-
#ifdef CONFIG_SCHED_BOOK
#ifndef SD_BOOK_INIT
#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index f9778c0..5845815 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6791,94 +6791,6 @@ static int __init isolated_cpu_setup(char *str)

__setup("isolcpus=", isolated_cpu_setup);

-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
@@ -6911,6 +6823,7 @@ typedef const struct cpumask
*(*sched_domain_mask_f)(int cpu);
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int numa_level;
struct sd_data data;
};

@@ -7029,7 +6942,6 @@ sd_init_##type(struct
sched_domain_topology_level *tl, int cpu) \

SD_INIT_FUNC(CPU)
#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
SD_INIT_FUNC(NODE)
#endif
#ifdef CONFIG_SCHED_SMT
@@ -7153,15 +7065,135 @@ static struct sched_domain_topology_level
default_topology[] = {
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
default_topology;

+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ** __percpu sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static struct sched_domain *
+sd_init_NUMA(struct sched_domain_topology_level *tl, int cpu)
+{
+ sched_domains_curr_level = tl->numa_level;
+ return sd_init_NODE(tl, cpu);
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return per_cpu_ptr(sched_domains_numa_masks[sched_domains_curr_level], cpu);
+}
+
+static void sched_init_numa(void)
+{
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+ char str[256];
+
+ sched_domains_numa_distance =
+ kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ printk("distance(0,%d): %d\n", j, distance);
+ for (i = 0; i < level; i++) {
+ /* check if already exist */
+ if (distance == sched_domains_numa_distance[i])
+ goto next_node;
+ /* sort and insert distance */
+ if (distance < sched_domains_numa_distance[i])
+ break;
+ }
+ if (i == level) {
+ sched_domains_numa_distance[level++] = distance;
+ sched_domains_numa_levels = level;
+ continue;
+ }
+ for (k = level -1; k >= i; k--)
+ sched_domains_numa_distance[k+1] =
+ sched_domains_numa_distance[k];
+
+ sched_domains_numa_distance[i] = distance;
+ sched_domains_numa_levels = ++level;
+next_node:
+ ;
+ }
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ printk("numa levels: %d\n", level);
+ for (i = 0; i < level; i++) {
+ printk("numa distance(%d): %d\n",
+ i, sched_domains_numa_distance[i]);
+
+ sched_domains_numa_masks[i] = alloc_percpu(cpumask_t);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for_each_possible_cpu(j) {
+ struct cpumask *mask =
+ per_cpu_ptr(sched_domains_numa_masks[i], j);
+
+ cpumask_clear(mask);
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(cpu_to_node(j), k) !=
+ sched_domains_numa_distance[i])
+ continue;
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+
+ cpulist_scnprintf(str, sizeof(str), mask);
+ printk("numa cpu(%d) mask: %s\n", j, str);
+ }
+ }
+
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(j, k);
+ for (i = 0; i < level; i++)
+ if (distance == sched_domains_numa_distance[i])
+ goto covered;
+
+ printk("distance(%d,%d): %d not covered by level\n",
+ j, k, distance);
+ covered:
+ ;
+ }
+ }
+
+ tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ sched_domain_topology = tl;
+ for (i = 0; default_topology[i].init; i++)
+ tl[i] = default_topology[i];
+
+ for (j = 0; j < level; i++, j++)
+ tl[i] = (struct sched_domain_topology_level) {
+ .init = sd_init_NUMA,
+ .mask = sd_numa_mask,
+ .numa_level = j,
+ };
+
+
+ for (tl = sched_domain_topology; tl->init; tl++)
+ printk("Topology: %pF\n", tl->init);
+}
+#else
+static inline void sched_init_numa(void) {}
+#endif /* CONFIG_NUMA */
+
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
@@ -7647,7 +7679,7 @@ void __init sched_init_smp(void)

alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-
+ sched_init_numa();
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);

by Andreas Herrmann

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Sun, May 15, 2011 at 01:50:42AM -0400, Hillf Danton wrote:
> On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
> > On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
> >> Your work for rewriting NUMA support, published at
> >> http://marc.info/?l=linux-kernel&m=130218515520540
> >> is patched by changing how level is computed and by changing how it is
> >> used to build the mask.
> >>
> >> When computing, some valid levels are lost in your work.
> >>
> >> When building mask, nodes are selected only if they have same distance,
> >> thus nodes of less distance are also masked out since the computation of
> >> level now is tough.
> >>
> >> Without NUMA hardware, I did not test the patch:(
> >
> > I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
> > old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
> > test such patches on somewhat larger systems. Please send a full patch
> > against tip/master for him to apply.
> >
>
> Hi Peter
>
> With the guiding from Ingo on git fetch the tip/master, the work is
> now finished:)
>
> Hopefully it is not too late for Andreas.
>
> In the following, the concern is also added for distances not covered by level,
> please review again.

Sorry, can't test your stuff. What git-tree and branch did you use to
build this patch?

Your patch seems to be corrupted/broken:

# scripts/checkpatch.pl your_patch.diff
ERROR: patch seems to be corrupt (line wrapped?)
#175: FILE: kernel/sched.c:6822:
*(*sched_domain_mask_f)(int cpu);

WARNING: line over 80 characters
#222: FILE: kernel/sched.c:7088:
+ return per_cpu_ptr(sched_domains_numa_masks[sched_domains_curr_level], cpu);

WARNING: printk() should include KERN_ facility level
#239: FILE: kernel/sched.c:7105:
+ printk("distance(0,%d): %d\n", j, distance);

ERROR: need consistent spacing around '-' (ctx:WxV)
#253: FILE: kernel/sched.c:7119:
+ for (k = level -1; k >= i; k--)
^
...

(I assume that it wasn't corrupted on my end.)


Andreas

2011-05-17 14:36:23

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Tue, May 17, 2011 at 5:23 PM, Andreas Herrmann
<[email protected]> wrote:
> On Sun, May 15, 2011 at 01:50:42AM -0400, Hillf Danton wrote:
>> On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
>> > On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
>> >> Your work for rewriting NUMA support, published at
>> >>          http://marc.info/?l=linux-kernel&m=130218515520540
>> >> is patched by changing how level is computed and by changing how it is
>> >> used to build the mask.
>> >>
>> >> When computing, some valid levels are lost in your work.
>> >>
>> >> When building mask, nodes are selected only if they have same distance,
>> >> thus nodes of less distance are also masked out since the computation of
>> >> level now is tough.
>> >>
>> >> Without NUMA hardware, I did not test the patch:(
>> >
>> > I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
>> > old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
>> > test such patches on somewhat larger systems. Please send a full patch
>> > against tip/master for him to apply.
>> >
>>
>> Hi Peter
>>
>> With the guiding from Ingo on git fetch the tip/master, the work is
>> now finished:)
>>
>> Hopefully it is not too late for Andreas.
>>
>> In the following, the concern is also added for distances not covered by level,
>> please review again.
>
> Sorry, can't test your stuff. What git-tree and branch did you use to
> build this patch?
>
> Your patch seems to be corrupted/broken:
>
>  # scripts/checkpatch.pl your_patch.diff
>  ERROR: patch seems to be corrupt (line wrapped?)
>  #175: FILE: kernel/sched.c:6822:
>  *(*sched_domain_mask_f)(int cpu);
>
>  WARNING: line over 80 characters
>  #222: FILE: kernel/sched.c:7088:
>  +     return per_cpu_ptr(sched_domains_numa_masks[sched_domains_curr_level], cpu);
>
>  WARNING: printk() should include KERN_ facility level
>  #239: FILE: kernel/sched.c:7105:
>  +                 printk("distance(0,%d): %d\n", j, distance);
>
>  ERROR: need consistent spacing around '-' (ctx:WxV)
>  #253: FILE: kernel/sched.c:7119:
>  +                 for (k = level -1; k >= i; k--)
>                               ^
>  ...
>
> (I assume that it wasn't corrupted on my end.)
>
>
Hi Andreas

First, I am sorry for the not-well-prepared patch.

Since I am not good at using git, I am not sure the patch was against the
latest tip/master, and I have no idea what caused the ERROR above.

So it looks better for Peter to prepare the patch again.

thanks
Hillf

2011-05-18 02:46:04

by Yong Zhang

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

On Tue, May 17, 2011 at 10:36 PM, Hillf Danton <[email protected]> wrote:
> On Tue, May 17, 2011 at 5:23 PM, Andreas Herrmann
> <[email protected]> wrote:
>> On Sun, May 15, 2011 at 01:50:42AM -0400, Hillf Danton wrote:
>>> On Wed, May 11, 2011 at 9:54 PM, Peter Zijlstra <[email protected]> wrote:
>>> > On Wed, 2011-05-11 at 21:26 +0800, Hillf Danton wrote:
>>> >> Your work for rewriting NUMA support, published at
>>> >>          http://marc.info/?l=linux-kernel&m=130218515520540
>>> >> is patched by changing how level is computed and by changing how it is
>>> >> used to build the mask.
>>> >>
>>> >> When computing, some valid levels are lost in your work.
>>> >>
>>> >> When building mask, nodes are selected only if they have same distance,
>>> >> thus nodes of less distance are also masked out since the computation of
>>> >> level now is tough.
>>> >>
>>> >> Without NUMA hardware, I did not test the patch:(
>>> >
>>> > I do have a (tiny) NUMA box (2 nodes) but that actually booted with the
>>> > old code too, Andreas Hermann from AMD (CC'ed) is usually willing to
>>> > test such patches on somewhat larger systems. Please send a full patch
>>> > against tip/master for him to apply.
>>> >
>>>
>>> Hi Peter
>>>
>>> With the guiding from Ingo on git fetch the tip/master, the work is
>>> now finished:)
>>>
>>> Hopefully it is not too late for Andreas.
>>>
>>> In the following, the concern is also added for distances not covered by level,
>>> please review again.
>>
>> Sorry, can't test your stuff. What git-tree and branch did you use to
>> build this patch?
>>
>> Your patch seems to be corrupted/broken:
>>
>>  # scripts/checkpatch.pl your_patch.diff
>>  ERROR: patch seems to be corrupt (line wrapped?)
>>  #175: FILE: kernel/sched.c:6822:
>>  *(*sched_domain_mask_f)(int cpu);
>>
>>  WARNING: line over 80 characters
>>  #222: FILE: kernel/sched.c:7088:
>>  +     return per_cpu_ptr(sched_domains_numa_masks[sched_domains_curr_level], cpu);
>>
>>  WARNING: printk() should include KERN_ facility level
>>  #239: FILE: kernel/sched.c:7105:
>>  +                 printk("distance(0,%d): %d\n", j, distance);
>>
>>  ERROR: need consistent spacing around '-' (ctx:WxV)
>>  #253: FILE: kernel/sched.c:7119:
>>  +                 for (k = level -1; k >= i; k--)
>>                               ^
>>  ...
>>
>> (I assume that it wasn't corrupted on my end.)
>>
>>
> Hi Andreas
>
> First I say sorry to you for the not well prepared patch.
>
> Since I am not good at playing git, I am not sure the patch was
> against the latest tip/master,
> and I have no idea about the ERROR above.

Hint: before sending your patch out, you could run
kernel-dir/scripts/checkpatch.pl on your patch, then fix the warning/error.

Thanks,
Yong




--
Only stand for myself

2011-05-18 15:12:47

by Hillf Danton

Subject: Re: [PATCH] sched: fix constructing the span cpu mask of sched domain

Hi all

I took another try, as follows:

Administrator@WWW-24D9C4C6548 ~/linux.tree.git (tip_latest)
$ git add include/linux/topology.h kernel/sched.c

Administrator@WWW-24D9C4C6548 ~/linux.tree.git (tip_latest)
$ git commit -a -m 'for peter and andreas'

Administrator@WWW-24D9C4C6548 ~/linux.tree.git (tip_latest)
$ git commit -a -m 'for peter and andreas'

Administrator@WWW-24D9C4C6548 ~/linux.tree.git (tip_latest)
$ git diff -p --stat HEAD~1 HEAD > ../patch-dhillf/numa-git.diff

Administrator@WWW-24D9C4C6548 ~/linux.tree.git (tip_latest)
$ ./scripts/checkpatch.pl ../patch-dhillf/numa-git.diff
ERROR: Missing Signed-off-by: line(s)

total: 1 errors, 0 warnings, 290 lines checked

../patch-dhillf/numa-git.diff has style problems, please review.  If any
of these errors are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
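
As an aside, the missing Signed-off-by error above is normally avoided by
committing with git's -s switch and exporting the patch with git format-patch;
a sketch (the commit message and output directory here are just placeholders):

$ git commit -a -s -m 'sched: rewrite CONFIG_NUMA support'
$ git format-patch -1 -o ../patch-dhillf/
$ ./scripts/checkpatch.pl ../patch-dhillf/0001-*.patch

Sending the result with git send-email, rather than pasting the diff into a
mail client, also usually avoids the wrapped-line corruption that checkpatch
complained about earlier in the thread.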

and please blame me as much as you can if it is broken again :(

I have to go to bed now, who am I?

btw, the following Signed-off-by lines are there purely to silence the above
error, in particular without Peter's approval.

Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Hillf Danton <[email protected]>
---
include/linux/topology.h | 25 -----
kernel/sched.c | 224 +++++++++++++++++++++++++++-------------------
2 files changed, 130 insertions(+), 119 deletions(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index b91a40e..fce56c8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -176,31 +176,6 @@ int arch_update_cpu_topology(void);
}
#endif

-/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) { \
- .min_interval = 64, \
- .max_interval = 64*num_online_cpus(), \
- .busy_factor = 128, \
- .imbalance_pct = 133, \
- .cache_nice_tries = 1, \
- .busy_idx = 3, \
- .idle_idx = 3, \
- .flags = 1*SD_LOAD_BALANCE \
- | 1*SD_BALANCE_NEWIDLE \
- | 0*SD_BALANCE_EXEC \
- | 0*SD_BALANCE_FORK \
- | 0*SD_BALANCE_WAKE \
- | 0*SD_WAKE_AFFINE \
- | 0*SD_SHARE_CPUPOWER \
- | 0*SD_POWERSAVINGS_BALANCE \
- | 0*SD_SHARE_PKG_RESOURCES \
- | 1*SD_SERIALIZE \
- | 0*SD_PREFER_SIBLING \
- , \
- .last_balance = jiffies, \
- .balance_interval = 64, \
-}
-
#ifdef CONFIG_SCHED_BOOK
#ifndef SD_BOOK_INIT
#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index f9778c0..aa089a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6791,94 +6791,6 @@ static int __init isolated_cpu_setup(char *str)

__setup("isolcpus=", isolated_cpu_setup);

-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
@@ -6911,6 +6823,7 @@ typedef const struct cpumask
*(*sched_domain_mask_f)(int cpu);
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int numa_level;
struct sd_data data;
};

@@ -7029,7 +6942,6 @@ sd_init_##type(struct
sched_domain_topology_level *tl, int cpu) \

SD_INIT_FUNC(CPU)
#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
SD_INIT_FUNC(NODE)
#endif
#ifdef CONFIG_SCHED_SMT
@@ -7153,15 +7065,139 @@ static struct sched_domain_topology_level
default_topology[] = {
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
default_topology;

+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ** __percpu sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static struct sched_domain *
+sd_init_NUMA(struct sched_domain_topology_level *tl, int cpu)
+{
+ sched_domains_curr_level = tl->numa_level;
+ return sd_init_NODE(tl, cpu);
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return
+ per_cpu_ptr(sched_domains_numa_masks[sched_domains_curr_level], cpu);
+}
+
+static void sched_init_numa(void)
+{
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+ char str[256];
+
+ sched_domains_numa_distance =
+ kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(0, j);
+ printk(KERN_INFO "distance(0,%d): %d\n", j, distance);
+ for (i = 0; i < level; i++) {
+ /* check if already exist */
+ if (distance == sched_domains_numa_distance[i])
+ goto next_node;
+ /* sort and insert distance */
+ if (distance < sched_domains_numa_distance[i])
+ break;
+ }
+ if (i == level) {
+ sched_domains_numa_distance[level++] = distance;
+ sched_domains_numa_levels = level;
+ continue;
+ }
+ for (k = level - 1; k >= i; k--)
+ sched_domains_numa_distance[k+1] =
+ sched_domains_numa_distance[k];
+
+ sched_domains_numa_distance[i] = distance;
+ sched_domains_numa_levels = ++level;
+next_node:
+ ;
+ }
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ printk(KERN_INFO "numa levels: %d\n", level);
+ for (i = 0; i < level; i++) {
+ printk(KERN_INFO "numa distance(%d): %d\n",
+ i, sched_domains_numa_distance[i]);
+
+ sched_domains_numa_masks[i] = alloc_percpu(cpumask_t);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for_each_possible_cpu(j) {
+ struct cpumask *mask =
+ per_cpu_ptr(sched_domains_numa_masks[i], j);
+
+ cpumask_clear(mask);
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(cpu_to_node(j), k) !=
+ sched_domains_numa_distance[i])
+ continue;
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+
+ cpulist_scnprintf(str, sizeof(str), mask);
+ printk(KERN_INFO "numa cpu(%d) mask: %s\n", j, str);
+ }
+ }
+
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(j, k);
+ for (i = 0; i < level; i++)
+ if (distance ==
+ sched_domains_numa_distance[i])
+ goto covered;
+
+ printk(KERN_INFO
+ "distance(%d,%d): %d not covered by level\n",
+ j, k, distance);
+covered:
+ ;
+ }
+ }
+
+ tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ sizeof(struct sched_domain_topology_level),
+ GFP_KERNEL);
+ if (!tl)
+ return;
+
+ sched_domain_topology = tl;
+ for (i = 0; default_topology[i].init; i++)
+ tl[i] = default_topology[i];
+
+ for (j = 0; j < level; i++, j++)
+ tl[i] = (struct sched_domain_topology_level) {
+ .init = sd_init_NUMA,
+ .mask = sd_numa_mask,
+ .numa_level = j,
+ };
+
+
+ for (tl = sched_domain_topology; tl->init; tl++)
+ printk(KERN_INFO "Topology: %pF\n", tl->init);
+}
+#else
+static inline void sched_init_numa(void) {}
+#endif /* CONFIG_NUMA */
+
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
@@ -7647,7 +7683,7 @@ void __init sched_init_smp(void)

alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-
+ sched_init_numa();
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);