There is an "interesting" quality of POWER6 cores, which each have 2
hardware threads: assuming one thread on the core is idle, the primary
thread is a little "faster" than the secondary thread. To illustrate:
for cpumask in 0x1 0x2 ; do
taskset $cpumask /usr/bin/time -f "%e elapsed, %U user, %S sys" \
/bin/sh -c "i=1000000 ; while (( i-- )) ; do : ; done"
done
17.05 elapsed, 16.83 user, 0.22 sys
17.54 elapsed, 17.32 user, 0.22 sys
(The first result is for a primary thread; the second result for a
secondary thread.)
So it would be nice to have the scheduler slightly prefer primary
threads on POWER6 machines. These patches, which allow the
architecture to override the scheduler's CPU "power" calculation, are
one possible approach, but I'm open to others. Please note: these
seemed to have the desired effect on 2.6.25-rc kernels (2-3%
improvement in a kernbench-like make -j <nr_cores>), but I'm not
seeing this improvement with 2.6.26-rc kernels for some reason I am
still trying to track down.
Nathan Lynch (3):
sched: support arch override of sched_group cpu power
add cpu_power to machdep_calls, override SD_SIBLING_INIT
adjust cpu power for secondary threads on POWER6
arch/powerpc/kernel/setup-common.c | 7 ++++++
arch/powerpc/platforms/pseries/setup.c | 37 ++++++++++++++++++++++++++++++++
include/asm-powerpc/cputable.h | 3 +-
include/asm-powerpc/machdep.h | 2 +
include/asm-powerpc/topology.h | 31 ++++++++++++++++++++++++++
include/linux/sched.h | 1 +
kernel/sched.c | 14 ++++++++++++
7 files changed, 94 insertions(+), 1 deletions(-)
Add a new sched domain flag, SD_ASYM_CPU_POWER, which signifies that
the architecture may override the cpu power for a cpu via a hook in
init_sched_groups_power(). Add a dummy definition of arch_cpu_power()
which conforms with the existing behavior.
Signed-off-by: Nathan Lynch <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched.c | 14 ++++++++++++++
2 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f84..cfbefca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ enum cpu_idle_type {
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
+#define SD_ASYM_CPU_POWER 4096 /* Domain members of unequal power */
#define BALANCE_FOR_MC_POWER \
(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751..3fba083 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6761,6 +6761,13 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
}
#endif
+#ifndef arch_cpu_power
+static inline unsigned int arch_cpu_power(int cpu, unsigned int default_power)
+{
+ return default_power;
+}
+#endif
+
/*
* Initialize sched groups cpu_power.
*
@@ -6789,6 +6796,13 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sd->groups->__cpu_power = 0;
+ if (!child && (sd->flags & SD_ASYM_CPU_POWER)) {
+ unsigned int power = arch_cpu_power(cpu, SCHED_LOAD_SCALE);
+
+ sg_inc_cpu_power(sd->groups, power);
+ return;
+ }
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
--
1.5.5
Add a cpu_power() call to machdep_calls, which will allow platforms to
override the scheduler's default cpu power calculation. If the
platform does not provide a cpu_power() method, the scheduler's
default value is used.
Copy the default SD_SIBLING_INIT to powerpc's topology.h and add the
SD_ASYM_CPU_POWER flag, which will cause ppc_md.cpu_power() to be
invoked (via arch_cpu_power()) during sched domain initialization.
Signed-off-by: Nathan Lynch <[email protected]>
---
arch/powerpc/kernel/setup-common.c | 7 +++++++
include/asm-powerpc/machdep.h | 2 ++
include/asm-powerpc/topology.h | 31 +++++++++++++++++++++++++++++++
3 files changed, 40 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index db540ea..609d09e 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -671,3 +671,10 @@ static int powerpc_debugfs_init(void)
}
arch_initcall(powerpc_debugfs_init);
#endif
+
+unsigned int arch_cpu_power(int cpu, unsigned int default_power)
+{
+ if (ppc_md.cpu_power)
+ return ppc_md.cpu_power(cpu, default_power);
+ return default_power;
+}
diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h
index 54ed64d..de6ff6b 100644
--- a/include/asm-powerpc/machdep.h
+++ b/include/asm-powerpc/machdep.h
@@ -260,6 +260,8 @@ struct machdep_calls {
void (*suspend_disable_irqs)(void);
void (*suspend_enable_irqs)(void);
#endif
+ /* Override scheduler's cpu power calculation */
+ unsigned int (*cpu_power)(int cpu, unsigned int default_power);
};
extern void power4_idle(void);
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 100c6fb..4335c15 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -72,6 +72,37 @@ static inline int pcibus_to_node(struct pci_bus *bus)
.nr_balance_failed = 0, \
}
+#define SD_SIBLING_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .child = NULL, \
+ .groups = NULL, \
+ .min_interval = 1, \
+ .max_interval = 2, \
+ .busy_factor = 64, \
+ .imbalance_pct = 110, \
+ .cache_nice_tries = 0, \
+ .busy_idx = 0, \
+ .idle_idx = 0, \
+ .newidle_idx = 0, \
+ .wake_idx = 0, \
+ .forkexec_idx = 0, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_FORK \
+ | SD_BALANCE_EXEC \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_IDLE \
+ | SD_SHARE_CPUPOWER \
+ | SD_ASYM_CPU_POWER, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
+
+#define arch_cpu_power(cpu, power) arch_cpu_power(cpu, power)
+extern unsigned int arch_cpu_power(int cpu, unsigned int default_power);
+
extern void __init dump_numa_cpu_topology(void);
extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
--
1.5.5
On POWER6 processors, cpu-bound programs generally perform better on
the primary thread when the secondary thread is idle than they do on
the secondary thread while the primary thread is idle. This
difference can be observed by timing a simple shell loop:
for cpumask in 0x1 0x2 ; do
taskset $cpumask /usr/bin/time -f "%e elapsed, %U user, %S sys" \
/bin/sh -c "i=1000000 ; while (( i-- )) ; do : ; done"
done
17.05 elapsed, 16.83 user, 0.22 sys
17.54 elapsed, 17.32 user, 0.22 sys
(The first result is for a primary thread; the second result for a
secondary thread.)
So we want the CPU scheduler to slightly favor primary threads on
POWER6.
Add a new cpu feature bit which indicates the need to override the
scheduler's cpu power calculation.
Implement ppc_md.cpu_power for the pseries platform, and scale
secondary threads' cpu power to 97% of the (default) primary threads'
cpu power. Allow this percentage to be overriden on the kernel
command line via "sec_thread_power_scale=".
Signed-off-by: Nathan Lynch <[email protected]>
---
arch/powerpc/platforms/pseries/setup.c | 37 ++++++++++++++++++++++++++++++++
include/asm-powerpc/cputable.h | 3 +-
2 files changed, 39 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f5d29f5..a1141c0 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -501,6 +501,42 @@ void pSeries_power_off(void)
for (;;);
}
+/* Percentage by which the cpu power of secondary threads is adjusted */
+static unsigned int sec_thread_power_scale = 97;
+static int __init setup_sec_thread_power_scale(char *str)
+{
+ int power;
+
+ if (get_option(&str, &power) && power > 0 && power <= 100)
+ sec_thread_power_scale = power;
+
+ return 1;
+}
+__setup("sec_thread_power_scale=", setup_sec_thread_power_scale);
+
+static unsigned int pseries_cpu_power(int cpu, unsigned int default_power)
+{
+ struct device_node *np;
+ unsigned int thread, power;
+
+ if (!cpu_has_feature(CPU_FTR_ASYM_POWER))
+ return default_power;
+
+ power = default_power;
+
+ np = of_get_cpu_node(cpu, &thread);
+ WARN_ON(!np);
+ if (!np)
+ goto out;
+
+ /* If this isn't a primary thread, scale the power */
+ if (thread != 0)
+ power = default_power * sec_thread_power_scale / 100;
+out:
+ of_node_put(np);
+ return power;
+}
+
#ifndef CONFIG_PCI
void pSeries_final_fixup(void) { }
#endif
@@ -525,4 +561,5 @@ define_machine(pseries) {
.progress = rtas_progress,
.system_reset_exception = pSeries_system_reset_exception,
.machine_check_exception = pSeries_machine_check_exception,
+ .cpu_power = pseries_cpu_power,
};
diff --git a/include/asm-powerpc/cputable.h b/include/asm-powerpc/cputable.h
index 1e79673..eb886a9 100644
--- a/include/asm-powerpc/cputable.h
+++ b/include/asm-powerpc/cputable.h
@@ -152,6 +152,7 @@ extern void do_feature_fixups(unsigned long value, void *fixup_start,
#define CPU_FTR_UNIFIED_ID_CACHE ASM_CONST(0x0000000001000000)
#define CPU_FTR_SPE ASM_CONST(0x0000000002000000)
#define CPU_FTR_NEED_PAIRED_STWCX ASM_CONST(0x0000000004000000)
+#define CPU_FTR_ASYM_POWER ASM_CONST(0x0000000008000000)
/*
* Add the 64-bit processor unique features in the top half of the word;
@@ -375,7 +376,7 @@ extern void do_feature_fixups(unsigned long value, void *fixup_start,
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
- CPU_FTR_DSCR)
+ CPU_FTR_DSCR | CPU_FTR_ASYM_POWER)
#define CPU_FTRS_CELL (CPU_FTR_USE_TB | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
--
1.5.5
Hi,
> +static unsigned int pseries_cpu_power(int cpu, unsigned int
> default_power)
> +{
> + struct device_node *np;
> + unsigned int thread, power;
> +
> + if (!cpu_has_feature(CPU_FTR_ASYM_POWER))
> + return default_power;
> +
> + power = default_power;
Why not just NULL out the cpu_power function pointers on platforms
that don't have the feature bit instead? (or the other way around:
NULL by default, and set only on platforms that have imbalanced threads.
It'd be even nicer to do so without using up a cpu feature bit, since
it'd then only be used in determining whether to set the ppc_md
function pointer or not, but I don't know if there's a convenient way
to probe for this at runtime (cpu device tree node properties?).
-Olof
On Jun 18, 2008, at 9:58 PM, Olof Johansson wrote:
> Why not just NULL out the cpu_power function pointers on platforms
> that don't have the feature bit instead? (or the other way around:
> NULL by default, and set only on platforms that have imbalanced
> threads.
Nevermind; bogus optimization. I just noticed that it's only used when
setting up the sched domains, and isn't adding path length later on. :)
-Olof
* Nathan Lynch <[email protected]> wrote:
> There is an "interesting" quality of POWER6 cores, which each have 2
> hardware threads: assuming one thread on the core is idle, the primary
> thread is a little "faster" than the secondary thread. To illustrate:
>
> for cpumask in 0x1 0x2 ; do
> taskset $cpumask /usr/bin/time -f "%e elapsed, %U user, %S sys" \
> /bin/sh -c "i=1000000 ; while (( i-- )) ; do : ; done"
> done
>
> 17.05 elapsed, 16.83 user, 0.22 sys
> 17.54 elapsed, 17.32 user, 0.22 sys
>
> (The first result is for a primary thread; the second result for a
> secondary thread.)
>
> So it would be nice to have the scheduler slightly prefer primary
> threads on POWER6 machines. These patches, which allow the
> architecture to override the scheduler's CPU "power" calculation, are
> one possible approach, but I'm open to others. Please note: these
> seemed to have the desired effect on 2.6.25-rc kernels (2-3%
> improvement in a kernbench-like make -j <nr_cores>), but I'm not
> seeing this improvement with 2.6.26-rc kernels for some reason I am
> still trying to track down.
ok, i guess that discrepancy has to be tracked down before we can think
about these patches - but the principle is OK.
One problem is that the whole cpu-power balancing code in sched.c is a
bit ... unclear and under-documented. So any change to this area should
begin at documenting the basics: what do the units mean exactly, how are
they used in balancing and what is the desired effect.
I'd not be surprised if there were a few buglets in this area, SMT is
not at the forefront of testing at the moment. There's nothing
spectacularly broken in it (i have a HT machine myself), but the
concepts have bitrotten a bit. Patches - even if they just add comments
- are welcome :-)
Ingo
Ingo Molnar wrote:
>
> * Nathan Lynch <[email protected]> wrote:
> > So it would be nice to have the scheduler slightly prefer primary
> > threads on POWER6 machines. These patches, which allow the
> > architecture to override the scheduler's CPU "power" calculation, are
> > one possible approach, but I'm open to others. Please note: these
> > seemed to have the desired effect on 2.6.25-rc kernels (2-3%
> > improvement in a kernbench-like make -j <nr_cores>), but I'm not
> > seeing this improvement with 2.6.26-rc kernels for some reason I am
> > still trying to track down.
>
> ok, i guess that discrepancy has to be tracked down before we can think
> about these patches - but the principle is OK.
Great. I'll keep trying to figure out what's going on there.
> One problem is that the whole cpu-power balancing code in sched.c is a
> bit ... unclear and under-documented. So any change to this area should
> begin at documenting the basics: what do the units mean exactly, how are
> they used in balancing and what is the desired effect.
>
> I'd not be surprised if there were a few buglets in this area, SMT is
> not at the forefront of testing at the moment. There's nothing
> spectacularly broken in it (i have a HT machine myself), but the
> concepts have bitrotten a bit. Patches - even if they just add comments
> - are welcome :-)
Okay, I'll have a look. Thanks Ingo.
Hi Nathan,
Nathan Lynch wrote:
> There is an "interesting" quality of POWER6 cores, which each have 2
> hardware threads: assuming one thread on the core is idle, the primary
> thread is a little "faster" than the secondary thread. To illustrate:
>
I found this feature interesting and decided to do some tests.
After some tests I found that the example you post really runs fast in
the first CPU, but a more "elaborated" application runs slower on the
first CPU.
Here is a small example:
# taskset 0x1 time -f "%e, %U, %S" ./a.out ; taskset 0x2 time -f "%e,
%U, %S" ./a.out
10.77, 10.72, 0.01
10.53, 10.48, 0.01
# taskset 0x2 time -f "%e, %U, %S" ./a.out ; taskset 0x1 time -f "%e,
%U, %S" ./a.out
10.55, 10.50, 0.01
10.77, 10.72, 0.01
# cat calc.c
#include <stdio.h>
int main(){
int j = 0;
float i = 42;
srand(123);
while (j++ < 100000000){
i = i*i + i;
i = i/2 + random(2);
}
printf("%d\n", i);
return 0;
}
# cat /proc/cpuinfo
processor : 0
cpu : POWER6 (architected), altivec supported
clock : 5000.001000MHz
revision : 3.2 (pvr 003e 0302)
processor : 1
cpu : POWER6 (architected), altivec supported
clock : 5000.001000MHz
revision : 3.2 (pvr 003e 0302)
...
Note that the IRQ are balanced among the 8 CPUs, and the machine is idle.
Do you know why I get this difference? Something wrong with the test?
Thanks
-
Breno Leitao
Linux Technology Center Brazil
Phone: +55-16-8115-3915 (T/L: 839-1293)
[email protected]
Breno Leitao wrote:
> Nathan Lynch wrote:
>> There is an "interesting" quality of POWER6 cores, which each have 2
>> hardware threads: assuming one thread on the core is idle, the primary
>> thread is a little "faster" than the secondary thread. To illustrate:
>>
> I found this feature interesting and decided to do some tests.
> After some tests I found that the example you post really runs fast in
> the first CPU, but a more "elaborated" application runs slower on the
> first CPU.
> Here is a small example:
>
> # taskset 0x1 time -f "%e, %U, %S" ./a.out ; taskset 0x2 time -f "%e,
> %U, %S" ./a.out
> 10.77, 10.72, 0.01
> 10.53, 10.48, 0.01
>
> # taskset 0x2 time -f "%e, %U, %S" ./a.out ; taskset 0x1 time -f "%e,
> %U, %S" ./a.out
> 10.55, 10.50, 0.01
> 10.77, 10.72, 0.01
I've been able to duplicate your results, thanks for the testcase.
Guess I'll need to understand what's going on before continuing with
this...