IA64 already has their own version of SD_NODE_INIT, tuned for their
extremely large machines. I think that all arches would benefit from
having their own, arch-specific SD_NODE_INIT initializer, rather than
the one-size-fits-all variant we've got now.
This patch just creates one instance of SD_NODE_INIT per architecture in
the arch's include/asm/topology.h file. IA64's wasn't defined there, so
for consistency I moved it. Also, in each topology.h file I touched, I
removed the NODE_BALANCE_RATE definition since a grep of the -mm tree
revealed that it is defined all over the place, but no longer used.
This patch does NOT attempt any actual tuning of the values. Every
architecture has the same values as the current one-size-fits-all
version. Anyone who is interested in the 4 main NUMA arches (i386,
ia64, x86_64 & ppc64) please test this and feel free to send me any
"tweaked" values that might help performance for your arch.
Compiled and booted on i386 and x86_64.
[mcd@arrakis source]$ diffstat ~/linux/patches/sched_domains/per_arch-SD_INIT.patch
arch/ia64/kernel/domain.c | 1
include/asm-i386/topology.h | 49 +++++++++++++++++++++++++++++++++++-------
include/asm-ia64/processor.h | 21 ------------------
include/asm-ia64/topology.h | 23 +++++++++++++++++--
include/asm-ppc64/topology.h | 21 ++++++++++++++++--
include/asm-x86_64/topology.h | 22 ++++++++++++++++++
include/linux/sched.h | 21 ++----------------
7 files changed, 104 insertions(+), 54 deletions(-)
-Matt
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c
--- linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c 2004-09-27 15:57:19.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c 2004-09-27 17:42:59.000000000 -0700
@@ -11,7 +11,6 @@
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
-#include <asm/processor.h>
#define SD_NODES_PER_DOMAIN 6
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-i386/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-i386/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-i386/topology.h 2004-09-16 15:02:45.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-i386/topology.h 2004-09-27 17:38:45.000000000 -0700
@@ -69,17 +69,50 @@ static inline cpumask_t pcibus_to_cpumas
/* Node-to-Node distance */
#define node_distance(from, to) ((from) != (to))
-/* Cross-node load balancing interval. */
-#define NODE_BALANCE_RATE 100
+#ifdef CONFIG_X86_NUMAQ
+/* sched_domains SD_NODE_INIT for IBM/Sequent NUMAQ machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#else
+/* sched_domains SD_NODE_INIT for other i386 NUMA machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif /* CONFIG_X86_NUMAQ */
#else /* !CONFIG_NUMA */
-/*
- * Other i386 platforms should define their own version of the
- * above macros here.
- */
-
#include <asm-generic/topology.h>
-
#endif /* CONFIG_NUMA */
#endif /* _ASM_I386_TOPOLOGY_H */
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h 2004-09-27 15:57:51.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h 2004-09-27 17:40:05.000000000 -0700
@@ -337,27 +337,6 @@ struct task_struct;
/* Prepare to copy thread state - unlazy all lazy status */
#define prepare_to_copy(tsk) do { } while (0)
-#ifdef CONFIG_NUMA
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 80, \
- .max_interval = 320, \
- .busy_factor = 320, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 10, \
- .nr_balance_failed = 0, \
-}
-#endif
-
/*
* This is the mechanism for creating a new kernel thread.
*
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h 2004-08-13 22:36:11.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h 2004-09-27 17:40:55.000000000 -0700
@@ -40,11 +40,28 @@
*/
#define node_to_first_cpu(node) (__ffs(node_to_cpumask(node)))
-/* Cross-node load balancing interval. */
-#define NODE_BALANCE_RATE 10
-
void build_cpu_to_node_map(void);
+/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 80, \
+ .max_interval = 320, \
+ .busy_factor = 320, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 10, \
+ .nr_balance_failed = 0, \
+}
+
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ppc64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ppc64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-ppc64/topology.h 2004-08-13 22:38:08.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ppc64/topology.h 2004-09-27 17:56:06.000000000 -0700
@@ -37,8 +37,25 @@ static inline int node_to_first_cpu(int
#define nr_cpus_node(node) (nr_cpus_in_node[node])
-/* Cross-node load balancing interval. */
-#define NODE_BALANCE_RATE 10
+/* sched_domains SD_NODE_INIT for PPC64 machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
#else /* !CONFIG_NUMA */
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-x86_64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-x86_64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-x86_64/topology.h 2004-09-16 15:02:46.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-x86_64/topology.h 2004-09-28 15:45:38.000000000 -0700
@@ -32,7 +32,27 @@ static inline cpumask_t __pcibus_to_cpum
/* broken generic file uses #ifndef later on this */
#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
-#define NODE_BALANCE_RATE 30 /* CHECKME */
+#ifdef CONFIG_NUMA
+/* sched_domains SD_NODE_INIT for X86_64 machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif /* CONFIG_NUMA */
#endif
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/linux/sched.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h
--- linux-2.6.9-rc2-mm4/include/linux/sched.h 2004-09-27 15:57:56.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h 2004-09-28 15:48:32.000000000 -0700
@@ -30,6 +30,7 @@
#include <linux/completion.h>
#include <linux/pid.h>
#include <linux/percpu.h>
+#include <linux/topology.h>
struct exec_domain;
@@ -538,25 +539,9 @@ extern void cpu_attach_domain(struct sch
}
#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
+#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h
#endif
+
#endif /* ARCH_HAS_SCHED_TUNE */
#endif /* CONFIG_SMP */
Matthew Dobson wrote:
> IA64 already has their own version of SD_NODE_INIT, tuned for their
> extremely large machines. I think that all arches would benefit from
> having their own, arch-specific SD_NODE_INIT initializer, rather than
> the one-size-fits-all variant we've got now.
>
I suppose the patch is pretty good (IIRC Martin liked the idea).
I guess it will at least increase the incidence of copy+paste,
if not getting people to think harder ;)
Can I be lame and ask that you keep this around until closer
to 2.6.10? I have a few possible scheduler performance
improvements that I'd like to get tested in -mm after 2.6.9
and this would make things a bit harder :P
I don't think anyone is looking at getting any tweaks in before
then...
On Thu, 2004-09-30 at 01:15, Nick Piggin wrote:
> Matthew Dobson wrote:
> > IA64 already has their own version of SD_NODE_INIT, tuned for their
> > extremely large machines. I think that all arches would benefit from
> > having their own, arch-specific SD_NODE_INIT initializer, rather than
> > the one-size-fits-all variant we've got now.
> >
>
> I suppose the patch is pretty good (IIRC Martin liked the idea).
> I guess it will at least increase the incidence of copy+paste,
> if not getting people to think harder ;)
Thanks! Martin does like the idea, and I think Andi Kleen likes the
idea of being able to tune sched_domains for x86_64, too. Any comments,
Andi?
The patch is pretty simple. I don't think it will increase any
copy+pasting because I don't believe anyone has modified SD_NODE_INIT at
all since it's been implemented, and certainly not for many kernel
releases. I think part of the reason for that is that it is currently
impossible to tweak the values for your architecture of choice because
modifying the values now will change EVERYONE's sched_domains timings.
Which is bad. :( If anyone wants to tweak SD_NODE_INIT, they shouldn't
be copying+pasting those values to all architectures. Besides, IA64
already gets their own SD_NODE_INIT to play with, why shouldn't everyone
else! ;)
> Can I be lame and ask that you keep this around until closer
> to 2.6.10? I have a few possible scheduler performance
> improvments that I'd like to get tested in -mm after 2.6.9
> and this would make things a bit harder :P
>
> I don't think anyone is looking at getting any tweaks in before
> then...
I would like to try to get this in before then, unless this will really
make things difficult for you. 2.6.9 is looking to be a pickup point
for distros, so getting this patch in now (pre 2.6.9) means that distros
can add tiny patches to their builds to simply tweak individual
architecture values without having to diverge from mainline by
implementing this patch on their own. It also means that any tuning
work that the distros do can easily be pushed back to mainline by
simply sending a patch per architecture with new values. It makes
those patches safe and minimizes conflicts.
Will this patch really make your scheduler improvements that much harder
to test/implement? I don't think it should, unless your improvements
are tweaking the values in SD_NODE_INIT, since that is all this
touches... Even after this patch, SD_NODE_INIT is still picked up in
include/linux/sched.h, so the changes required to cope with this patch
should be minimal...
-Matt
Matthew Dobson <[email protected]> wrote:
>
> I would like to try to get this in before then, unless this will really
> make things difficult for you.
It's about three weeks late for 2.6.9. I already have a string of CPU
scheduler patches awaiting the 2.6.10 stream and once we're at -rc2 we
really should only be looking at bugfixes.
Grumble, mutter.. it looks like one of those "if it compiled, it works"
things. Problem is, any time anyone touches that particular piece of the
kernel, half the architectures stop compiling.
On Thu, 2004-09-30 at 12:23, Andrew Morton wrote:
> Matthew Dobson <[email protected]> wrote:
> >
> > I would like to try to get this in before then, unless this will really
> > make things difficult for you.
>
> It's about three weeks late for 2.6.9. I already have a string of CPU
> scheduler patches awaiting the 2.6.10 stream and once we're at -rc2 we
> really should only be looking at bugfixes.
Yeah, that's entirely my fault for slacking on sending this out... I
should have sent this a while ago. It is a small portion of some larger
sched_domains changes that I am working on, but at some point I realized
my larger changeset will be far more controversial and have a much
larger impact than some of the smaller bits, as well as not being ready
for prime time yet. Plus, like I said earlier, this allows
arch-specific tweaking with minimal intrusiveness from the application
of this patch forward.
> Grumble, mutter.. it looks like one of those "if it compiled, it works"
> things. Problem is, any time anyone touches that particular piece of the
> kernel, half the architectures stop compiing.
It *should* be. I'd be quite happy if you just picked it up in -mm to
assure it far wider testing. I've compiled and booted it on x86, x86_64
& ppc64. I've got no access to ia64 right now, or I'd test it there.
But the patch *will* spit out #errors for any arch that doesn't have
SD_NODE_INIT defined if they also have NUMA defined. I don't know of
anyone else (ie: *not* x86, x86_64, ppc64 & ia64) that is building NUMA
kernels, but if they are, it's a trivial patch to their
include/asm/topology.h to make the arch build.
Of course, the ultimate decision is yours, Andrew...
-Matt
On Thu, Sep 30, 2004 at 11:36:52AM -0700, Matthew Dobson wrote:
> On Thu, 2004-09-30 at 01:15, Nick Piggin wrote:
> > Matthew Dobson wrote:
> > > IA64 already has their own version of SD_NODE_INIT, tuned for their
> > > extremely large machines. I think that all arches would benefit from
> > > having their own, arch-specific SD_NODE_INIT initializer, rather than
> > > the one-size-fits-all variant we've got now.
> > >
> >
> > I suppose the patch is pretty good (IIRC Martin liked the idea).
> > I guess it will at least increase the incidence of copy+paste,
> > if not getting people to think harder ;)
>
> Thanks! Martin does like the idea, and I think Andi Kleen likes the
> idea of being able to tune sched_domains for x86_64, too. Any comments,
> Andi?
It doesn't help me directly - what i need is the same thing
for SD_SIBLING_INIT for the CMP changes.
But it seems I need to do some other work to properly support the K8
CMP first, so I'm deferring attacking this a bit.
> The patch is pretty simple. I don't think it will increase any
> copy+pasting because I don't believe anyone has modified SD_NODE_INIT at
> all since it's been implemented, and certainly not for many kernel
> releases. I think part of the reason for that is that it is currently
> impossible to tweak the values for your architecture of choice because
> modifying the values now will change EVERYONE's sched_domains timings.
> Which is bad. :( If anyone wants to tweak SD_NODE_INIT, they shouldn't
> be copying+pasting those values to all architectures. Besides, IA64
> already gets their own SD_NODE_INIT to play with, why shouldn't everyone
> else! ;)
It would be nice if there was a SD_DEFAULT_NODE_INIT and a
SD_DEFAULT_SIBLING_INIT in some generic
file that architecture code can use as a base for tweaking.
For the CMP change I currently only want to remove SD_SHAREPOWER
from SIBLING_INIT to get rid of SMT nice.
Later we'll probably want a SD_DEFAULT_CMP_INIT too that gives
generic values for a dual core. Dual cores should be soon pretty
common and tuning for them will be needed on several architectures
(ppc64, ia64, x86, x86-64, sparc, parisc? ...). But figuring out good
values for this will require a lot of benchmarking first.
-Andi
On Thu, 2004-09-30 at 13:45, Andi Kleen wrote:
> On Thu, Sep 30, 2004 at 11:36:52AM -0700, Matthew Dobson wrote:
> > On Thu, 2004-09-30 at 01:15, Nick Piggin wrote:
> > > Matthew Dobson wrote:
> > > > IA64 already has their own version of SD_NODE_INIT, tuned for their
> > > > extremely large machines. I think that all arches would benefit from
> > > > having their own, arch-specific SD_NODE_INIT initializer, rather than
> > > > the one-size-fits-all variant we've got now.
> > > >
> > >
> > > I suppose the patch is pretty good (IIRC Martin liked the idea).
> > > I guess it will at least increase the incidence of copy+paste,
> > > if not getting people to think harder ;)
> >
> > Thanks! Martin does like the idea, and I think Andi Kleen likes the
> > idea of being able to tune sched_domains for x86_64, too. Any comments,
> > Andi?
>
> It doesn't help me directly - what i need is the same thing
> for SD_SIBLING_INIT for the CMP changes.
>
> But it seems I need to do some other work to properly support the K8
> CMP first, so I'm defering attacking this a bit.
I see... Martin was under the impression you were looking to tweak the
SD_NODE_INIT values. I'd really like to see all 3 initializers become
per-arch. Siblings and CPUs are going to behave differently on
different platforms. The idea that a P3 in a NUMAQ box will perform
optimally with the same SD_CPU_INIT values as a Power5 CPU or an Opteron
is just silly. But, I figured this would be a baby step in the right
direction, and doing it only for NUMA architectures minimizes the number of
affected machines. If this works well, I would do the same with
SD_SIBLING_INIT and SD_CPU_INIT.
> > The patch is pretty simple. I don't think it will increase any
> > copy+pasting because I don't believe anyone has modified SD_NODE_INIT at
> > all since it's been implemented, and certainly not for many kernel
> > releases. I think part of the reason for that is that it is currently
> > impossible to tweak the values for your architecture of choice because
> > modifying the values now will change EVERYONE's sched_domains timings.
> > Which is bad. :( If anyone wants to tweak SD_NODE_INIT, they shouldn't
> > be copying+pasting those values to all architectures. Besides, IA64
> > already gets their own SD_NODE_INIT to play with, why shouldn't everyone
> > else! ;)
>
> It would be nice if there was a SD_DEFAULT_NODE_INIT and a
> SD_DEFAULT_SIBLING_INIT in some generic
> file that architecture code can use as a base for tweaking.
> For the CMP change I currently only want to remove SD_SHAREPOWER
> from SIBLING_INIT to get rid of SMT nice.
Well, you can certainly base the x86_64 CMP values on the current
SD_SIBLING_INIT values. Those are well publicized, see
include/linux/sched.h! ;)
> Later we'll probably want a SD_DEFAULT_CMP_INIT too that gives
> generic values for a dual core. Dual cores should be soon pretty
> common and tuning for them will be needed on several architectures
> (ppc64, ia64, x86, x86-64, sparc, parisc? ...). But figuring out good
> values for this will require a lot of benchmarking first.
>
> -Andi
I suppose it would be pretty trivial to define defaults in
include/asm-generic/topology.h, and allow arches that care to define
their own SD_*_INITs without disrupting anyone else. Actually, that's
far better than what I've got now. I'll run that patch up after the
meeting I'm currently late for and post it in a couple hours.
And I agree that LOTS of benchmarking will be required to find the
optimal values for these fields.
-Matt
> Well, you can certainly base the x86_64 CMP values on the current
> SD_SIBLING_INIT values. Those are well publicized, see
> include/linux/sched.h! ;)
Current BK has it in kernel/sched.c.
And it also broke NUMA kernels on UP, but that's a different issue.
> I suppose it would be pretty trivial to define defaults in
> include/asm-generic/topology.h, and allow arches that care to define
> their own SD_*_INITs without disrupting anyone else. Actually, that's
> far better than what I've got now. I'll run that patch up after the
> meeting I'm currently late for and post it in a couple hours.
Full override isn't good imho because it could lead to bit rot,
better is to have defaults that can be used as a base, but tweaked.
-Andi
On Thu, 2004-09-30 at 14:12, Andi Kleen wrote:
> > Well, you can certainly base the x86_64 CMP values on the current
> > SD_SIBLING_INIT values. Those are well publicized, see
> > include/linux/sched.h! ;)
>
> Current BK has it in kernel/sched.c.
Fair enough. I was thinking about the -mm tree. :)
> And it also broke NUMA kernels on UP, but that's a different issue.
What broke NUMA kernels on UP? Are you talking about the cpu_online_map
vs. cpu_possible_map thing from a little bit ago?
> > I suppose it would be pretty trivial to define defaults in
> > include/asm-generic/topology.h, and allow arches that care to define
> > their own SD_*_INITs without disrupting anyone else. Actually, that's
> > far better than what I've got now. I'll run that patch up after the
> > meeting I'm currently late for and post it in a couple hours.
>
> Full override isn't good imho because it could lead to bit rot,
> better is to have defaults that can be used as a base, but tweaked.
I'm not sure why it would lead to bit rot? Because every arch would
define their own initializers and not use the generic ones? If so, we
could always rip them out... I doubt that will happen, since I don't
foresee most arches caring enough to set up custom initializers.
Especially since no one has done it yet, and some of the bigger arches
really need to.
I'm also not quite sure what you mean about using one common set of
definitions as a base? How would you tweak a generic SD_NODE_INIT
initializer without overriding it?
Here's a smaller patch (only compile tested) to implement this in a much
better way. What it does is:
1) Rip SD_*_INIT definitions out of linux/sched.h and move them into
linux/topology.h and have linux/sched.h include linux/topology.h
2) Move IA64's arch-specific SD_NODE_INIT definition from
asm/processor.h to asm/topology.h.
This way, all an architecture has to do to set up their own
arch-specific initializers is define them in asm/topology.h. It makes
it totally trivial for an arch to set this up without changing or
breaking anyone else's values.
[mcd@arrakis source]$ diffstat ~/linux/patches/sched_domains/per_arch-SD_INIT.patch
arch/ia64/kernel/domain.c | 1
include/asm-ia64/processor.h | 21 -----------
include/asm-ia64/topology.h | 20 +++++++++++
include/linux/sched.h | 74 +---------------------------------------
include/linux/topology.h | 78 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 100 insertions(+), 94 deletions(-)
-Matt
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c
--- linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c 2004-09-27 15:57:19.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c 2004-09-27 17:42:59.000000000 -0700
@@ -11,7 +11,6 @@
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
-#include <asm/processor.h>
#define SD_NODES_PER_DOMAIN 6
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h 2004-09-27 15:57:51.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h 2004-09-27 17:40:05.000000000 -0700
@@ -337,27 +337,6 @@ struct task_struct;
/* Prepare to copy thread state - unlazy all lazy status */
#define prepare_to_copy(tsk) do { } while (0)
-#ifdef CONFIG_NUMA
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 80, \
- .max_interval = 320, \
- .busy_factor = 320, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 10, \
- .nr_balance_failed = 0, \
-}
-#endif
-
/*
* This is the mechanism for creating a new kernel thread.
*
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h 2004-08-13 22:36:11.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h 2004-09-30 16:06:47.000000000 -0700
@@ -45,6 +45,26 @@
void build_cpu_to_node_map(void);
+/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 80, \
+ .max_interval = 320, \
+ .busy_factor = 320, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 10, \
+ .nr_balance_failed = 0, \
+}
+
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/linux/sched.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h
--- linux-2.6.9-rc2-mm4/include/linux/sched.h 2004-09-27 15:57:56.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h 2004-09-30 16:03:06.000000000 -0700
@@ -30,6 +30,7 @@
#include <linux/completion.h>
#include <linux/pid.h>
#include <linux/percpu.h>
+#include <linux/topology.h>
struct exec_domain;
@@ -486,78 +487,7 @@ extern cpumask_t cpu_isolated_map;
extern void init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu));
extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif
-
-#ifndef ARCH_HAS_SCHED_TUNE
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 8, \
- .imbalance_pct = 110, \
- .cache_hot_time = 0, \
- .cache_nice_tries = 0, \
- .per_cpu_gain = 25, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
- | SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-
-/* Common values for CPUs */
-#define SD_CPU_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_hot_time = (5*1000/2), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-
-#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-#endif /* ARCH_HAS_SCHED_TUNE */
+#endif /* ARCH_HAS_SCHED_DOMAIN */
#endif /* CONFIG_SMP */
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/linux/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/topology.h
--- linux-2.6.9-rc2-mm4/include/linux/topology.h 2004-09-16 15:02:47.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/topology.h 2004-09-30 16:27:43.000000000 -0700
@@ -61,4 +61,82 @@ static inline int __next_node_with_cpus(
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
+#ifdef CONFIG_SCHED_SMT
+/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
+ * so can't we drop this in favor of CONFIG_SCHED_SMT?
+ */
+#define ARCH_HAS_SCHED_WAKE_IDLE
+/* Common values for SMT siblings */
+#ifndef SD_SIBLING_INIT
+#define SD_SIBLING_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 1, \
+ .max_interval = 2, \
+ .busy_factor = 8, \
+ .imbalance_pct = 110, \
+ .cache_hot_time = 0, \
+ .cache_nice_tries = 0, \
+ .per_cpu_gain = 25, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_IDLE \
+ | SD_SHARE_CPUPOWER, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+#endif /* CONFIG_SCHED_SMT */
+
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (5*1000/2), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+
+#ifdef CONFIG_NUMA
+#ifndef SD_NODE_INIT
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+#endif /* CONFIG_NUMA */
+
#endif /* _LINUX_TOPOLOGY_H */
--Andrew Morton <[email protected]> wrote (on Thursday, September 30, 2004 12:23:12 -0700):
> Matthew Dobson <[email protected]> wrote:
>>
>> I would like to try to get this in before then, unless this will really
>> make things difficult for you.
>
> It's about three weeks late for 2.6.9. I already have a string of CPU
> scheduler patches awaiting the 2.6.10 stream and once we're at -rc2 we
> really should only be looking at bugfixes.
Yup, seems a bit late for that, but early 2.6.10 would be nice if possible?
> Grumble, mutter.. it looks like one of those "if it compiled, it works"
> things. Problem is, any time anyone touches that particular piece of the
> kernel, half the architectures stop compiling.
I tested it - worked for me ;-)
This is the first step to getting the arches to actually use the flexibility
we had, and stop Andi complaining the scheduler is tuned for one arch rather
than another ;-) These params definitely need to be per arch/subarch, and
probably some other ones too, but this seems like a good start.
M.
On Thu, 2004-09-30 at 23:15, Martin J. Bligh wrote:
> --Andrew Morton <[email protected]> wrote (on Thursday, September 30, 2004 12:23:12 -0700):
>
> > Matthew Dobson <[email protected]> wrote:
> >>
> >> I would like to try to get this in before then, unless this will really
> >> make things difficult for you.
> >
> > It's about three weeks late for 2.6.9. I already have a string of CPU
> > scheduler patches awaiting the 2.6.10 stream and once we're at -rc2 we
> > really should only be looking at bugfixes.
>
> Yup, seems a bit late for that, but early 2.6.10 would be nice if possible?
>
> > Grumble, mutter.. it looks like one of those "if it compiled, it works"
> > things. Problem is, any time anyone touches that particular piece of the
> > kernel, half the architectures stop compiling.
>
> I tested it - worked for me ;-)
>
> This is the first step to getting the arches to actually use the flexibility
> we had, and stop Andi complaining the scheduler is tuned for one arch rather
> than another ;-) These params definitely need to be per arch/subarch, and
> probably some other ones too, but this seems like a good start.
>
> M.
Martin, Andi, Andrew & anyone else still reading this thread,
Here's yet another version of a patch to implement per-arch SD_*_INITs.
This follows the same basic idea of my last patch, but
1) defines an arch-specific SD_NODE_INIT for the 4 NUMA arches (i386,
x86_64, IA64 & PPC64),
2) defines *default* SD_CPU_INIT & SD_SIBLING_INIT for *all* arches,
with the possibility of them being overridden by simply defining an
arch-specific version in include/asm/topology.h.
The motivation behind the third version of this patch is that Martin
feels that there should be no "default" NUMA initializer because NUMA
characteristics are *very* arch/platform specific, and hence a "default"
NUMA initializer can only lead to confusion. I agree with most of that,
but don't quite see as much harm in having a default as he does.
Nevertheless, to keep him quiet, I've run up this version of the patch.
Martin, please run this through your magic test suite and make sure I
didn't break anything trivial.
[mcd@arrakis source]$ diffstat ~/linux/patches/sched_domains/per_arch-SD_INIT.patch
arch/ia64/kernel/domain.c | 1
include/asm-i386/topology.h | 20 +++++++++++
include/asm-ia64/processor.h | 21 -----------
include/asm-ia64/topology.h | 20 +++++++++++
include/asm-ppc64/topology.h | 20 +++++++++++
include/asm-x86_64/topology.h | 22 ++++++++++++
include/linux/sched.h | 74 +-----------------------------------------
include/linux/topology.h | 72 ++++++++++++++++++++++++++++++++++++++++
8 files changed, 156 insertions(+), 94 deletions(-)
-Matt
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c
--- linux-2.6.9-rc2-mm4/arch/ia64/kernel/domain.c 2004-09-27 15:57:19.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/arch/ia64/kernel/domain.c 2004-09-27 17:42:59.000000000 -0700
@@ -11,7 +11,6 @@
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
-#include <asm/processor.h>
#define SD_NODES_PER_DOMAIN 6
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-i386/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-i386/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-i386/topology.h 2004-09-16 15:02:45.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-i386/topology.h 2004-10-01 15:06:30.000000000 -0700
@@ -72,6 +72,26 @@ static inline cpumask_t pcibus_to_cpumas
/* Cross-node load balancing interval. */
#define NODE_BALANCE_RATE 100
+/* sched_domains SD_NODE_INIT for NUMAQ machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+
#else /* !CONFIG_NUMA */
/*
* Other i386 platforms should define their own version of the
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/processor.h 2004-09-27 15:57:51.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/processor.h 2004-09-27 17:40:05.000000000 -0700
@@ -337,27 +337,6 @@ struct task_struct;
/* Prepare to copy thread state - unlazy all lazy status */
#define prepare_to_copy(tsk) do { } while (0)
-#ifdef CONFIG_NUMA
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 80, \
- .max_interval = 320, \
- .busy_factor = 320, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 10, \
- .nr_balance_failed = 0, \
-}
-#endif
-
/*
* This is the mechanism for creating a new kernel thread.
*
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-ia64/topology.h 2004-08-13 22:36:11.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ia64/topology.h 2004-09-30 16:06:47.000000000 -0700
@@ -45,6 +45,26 @@
void build_cpu_to_node_map(void);
+/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 80, \
+ .max_interval = 320, \
+ .busy_factor = 320, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 10, \
+ .nr_balance_failed = 0, \
+}
+
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-ppc64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ppc64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-ppc64/topology.h 2004-08-13 22:38:08.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-ppc64/topology.h 2004-10-01 15:07:24.000000000 -0700
@@ -40,6 +40,26 @@ static inline int node_to_first_cpu(int
/* Cross-node load balancing interval. */
#define NODE_BALANCE_RATE 10
+/* sched_domains SD_NODE_INIT for PPC64 machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+
#else /* !CONFIG_NUMA */
#include <asm-generic/topology.h>
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/asm-x86_64/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-x86_64/topology.h
--- linux-2.6.9-rc2-mm4/include/asm-x86_64/topology.h 2004-09-16 15:02:46.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/asm-x86_64/topology.h 2004-10-01 15:07:35.000000000 -0700
@@ -34,6 +34,28 @@ static inline cpumask_t __pcibus_to_cpum
#define NODE_BALANCE_RATE 30 /* CHECKME */
+#ifdef CONFIG_NUMA
+/* sched_domains SD_NODE_INIT for x86_64 machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+
#endif
#include <asm-generic/topology.h>
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/linux/sched.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h
--- linux-2.6.9-rc2-mm4/include/linux/sched.h 2004-09-27 15:57:56.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/sched.h 2004-09-30 16:03:06.000000000 -0700
@@ -30,6 +30,7 @@
#include <linux/completion.h>
#include <linux/pid.h>
#include <linux/percpu.h>
+#include <linux/topology.h>
struct exec_domain;
@@ -486,78 +487,7 @@ extern cpumask_t cpu_isolated_map;
extern void init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu));
extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif
-
-#ifndef ARCH_HAS_SCHED_TUNE
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 8, \
- .imbalance_pct = 110, \
- .cache_hot_time = 0, \
- .cache_nice_tries = 0, \
- .per_cpu_gain = 25, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
- | SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-
-/* Common values for CPUs */
-#define SD_CPU_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_hot_time = (5*1000/2), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-
-#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-#endif /* ARCH_HAS_SCHED_TUNE */
+#endif /* ARCH_HAS_SCHED_DOMAIN */
#endif /* CONFIG_SMP */
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.9-rc2-mm4/include/linux/topology.h linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/topology.h
--- linux-2.6.9-rc2-mm4/include/linux/topology.h 2004-09-16 15:02:47.000000000 -0700
+++ linux-2.6.9-rc2-mm4+per_arch-SD_INITs/include/linux/topology.h 2004-10-01 15:15:56.000000000 -0700
@@ -61,4 +61,76 @@ static inline int __next_node_with_cpus(
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
+/*
+ * Below are the 3 major initializers used in building sched_domains:
+ * SD_SIBLING_INIT, for SMT domains
+ * SD_CPU_INIT, for SMP domains
+ * SD_NODE_INIT, for NUMA domains
+ *
+ * Any architecture that cares to do any tuning to these values should do so
+ * by defining their own arch-specific initializer in include/asm/topology.h.
+ * A definition there will automagically override these default initializers
+ * and allow arch-specific performance tuning of sched_domains.
+ */
+#ifdef CONFIG_SCHED_SMT
+/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
+ * so can't we drop this in favor of CONFIG_SCHED_SMT?
+ */
+#define ARCH_HAS_SCHED_WAKE_IDLE
+/* Common values for SMT siblings */
+#ifndef SD_SIBLING_INIT
+#define SD_SIBLING_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 1, \
+ .max_interval = 2, \
+ .busy_factor = 8, \
+ .imbalance_pct = 110, \
+ .cache_hot_time = 0, \
+ .cache_nice_tries = 0, \
+ .per_cpu_gain = 25, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_IDLE \
+ | SD_SHARE_CPUPOWER, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+#endif /* CONFIG_SCHED_SMT */
+
+/* Common values for CPUs */
+#ifndef SD_CPU_INIT
+#define SD_CPU_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 1, \
+ .max_interval = 4, \
+ .busy_factor = 64, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (5*1000/2), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+#endif
+
+#ifdef CONFIG_NUMA
+#ifndef SD_NODE_INIT
+#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_NUMA */
+
#endif /* _LINUX_TOPOLOGY_H */
> Martin, Andi, Andrew & anyone else still reading this thread,
> Here's yet another version of a patch to implement per-arch SD_*_INITs.
> This follows the same basic idea of my last patch, but
> 1) defines an arch-specific SD_NODE_INIT for the 4 NUMA arches (i386,
> x86_64, IA64 & PPC64),
> 2) defines *default* SD_CPU_INIT & SD_SIBLING_INIT for *all* arches,
> with the possibility of them being overridden by simply defining an
> arch-specific version in include/asm/topology.h.
Looks good. tested. works ;-)
M.