Subject: [PATCH 0/4] x86: urgent fixes for AMD Magny-Cours support

Hi,

so finally I've created a basic set of patches to fix Linux for the AMD
Magny-Cours CPU. Basic means no new sysfs attributes, no new cpumasks,
etc. Most of it is AMD-specific code; only patch 4 modifies a more
generic x86 function.

I kindly ask that these patches be applied for .31 -- even so late in
the game. They should not introduce regressions, but they allow .31 to
run on Magny-Cours without hiccups.

If it is really too late for .31, the series should be scheduled for .32
and I'll try to bring it into stable-31.


Thanks!

Andreas

--
Operating | Advanced Micro Devices GmbH
System    | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research  | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center    | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC)    | Registergericht München, HRB Nr. 43632


Subject: [PATCH 1/4] x86: Fix CPU llc_shared_map information for AMD Magny-Cours


Construct the entire NodeID and use it as cpu_llc_id. Thus internal
node siblings are stored in llc_shared_map.
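
For illustration, a minimal user-space sketch of the resulting
numbering; the socket id, core count, and internal-node ordering below
are assumptions for the example, not values read from hardware:

#include <stdio.h>

int main(void)
{
        unsigned int phys_proc_id = 1;          /* assumed: second socket */
        unsigned int cores_per_package = 12;    /* assumed x86_max_cores */
        unsigned int cpn = cores_per_package >> 1;  /* cores per internal node */

        for (unsigned int core = 0; core < cores_per_package; core++) {
                unsigned int n_id = (core < cpn) ? 0 : 1;   /* assumed node order */
                unsigned int node_id = (phys_proc_id << 1) + n_id;
                printf("cpu_core_id %2u -> NodeID %u, remapped core id %u\n",
                       core, node_id, core % cpn);
        }
        return 0;
}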

Signed-off-by: Andreas Herrmann <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 1 +
arch/x86/kernel/cpu/amd.c | 64 ++++++++++++++++++++++++++++++++++++-
2 files changed, 64 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4a28d22..847fee6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -95,6 +95,7 @@
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */

/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 63fddcd..7a00ba3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -251,6 +251,64 @@ static int __cpuinit nearby_node(int apicid)
#endif

/*
+ * Fixup core topology information for AMD multi-node processors.
+ * Assumption 1: Number of cores in each internal node is the same.
+ * Assumption 2: Mixed systems with both single-node and dual-node
+ * processors are not supported.
+ */
+#ifdef CONFIG_X86_HT
+static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_PCI
+ u32 t, cpn;
+ u8 n, n_id;
+ int cpu = smp_processor_id();
+
+ /* fixup topology information only once for a core */
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return;
+
+ /* check for multi-node processor on boot cpu */
+ t = read_pci_config(0, 24, 3, 0xe8);
+ if (!(t & (1 << 29)))
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+
+ /* cores per node: each internal node has half the number of cores */
+ cpn = c->x86_max_cores >> 1;
+
+ /* even-numbered NB_id of this dual-node processor */
+ n = c->phys_proc_id << 1;
+
+ /*
+ * determine internal node id and assign cores fifty-fifty to
+ * each node of the dual-node processor
+ */
+ t = read_pci_config(0, 24 + n, 3, 0xe8);
+ n = (t>>30) & 0x3;
+ if (n == 0) {
+ if (c->cpu_core_id < cpn)
+ n_id = 0;
+ else
+ n_id = 1;
+ } else {
+ if (c->cpu_core_id < cpn)
+ n_id = 1;
+ else
+ n_id = 0;
+ }
+
+ /* compute entire NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
+
+ /* fixup core id to be in range from 0 to cpn */
+ c->cpu_core_id = c->cpu_core_id % cpn;
+#endif
+}
+#endif
+
+/*
* On a AMD dual core setup the lower bits of the APIC id distingush the cores.
* Assumes number of cores is a power of two.
*/
@@ -267,6 +325,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ /* fixup topology information on multi-node processors */
+ if ((c->x86 == 0x10) && (c->x86_model == 9))
+ amd_fixup_dcm(c);
#endif
}

@@ -277,7 +338,8 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
int node;
unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;

- node = c->phys_proc_id;
+ node = per_cpu(cpu_llc_id, cpu);
+
if (apicid_to_node[apicid] != NUMA_NO_NODE)
node = apicid_to_node[apicid];
if (!node_online(node)) {
--
1.6.4


Subject: [PATCH 2/4] x86, cacheinfo: Fixup L3 cache information for AMD multi-node processors


L3 cache size, associativity and shared_cpu information need to be
adapted to show information for an internal node instead of the
entire physical package.
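
For illustration, a stand-alone sketch of the per-node halving done in
the hunk below; the 12 MB / 48-way package-wide input values are made
up for the example, not taken from a real Magny-Cours:

#include <stdio.h>

int main(void)
{
        unsigned int size_in_kb = 12 * 1024;  /* package-wide L3 size (assumed) */
        unsigned int assoc = 48;              /* decoded ways, package-wide (assumed) */
        int amd_dcm = 1;                      /* X86_FEATURE_AMD_DCM set */

        if (amd_dcm) {
                size_in_kb >>= 1;             /* report per internal node */
                assoc >>= 1;
        }

        printf("L3: %u KB, %u-way per internal node\n", size_in_kb, assoc);
        return 0;
}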

Signed-off-by: Andreas Herrmann <[email protected]>
---
arch/x86/kernel/cpu/intel_cacheinfo.c | 32 ++++++++++++++++++++++----------
1 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 789efe2..cb98a73 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 0:
if (!l1->val)
return;
- assoc = l1->assoc;
+ assoc = assocs[l1->assoc];
line_size = l1->line_size;
lines_per_tag = l1->lines_per_tag;
size_in_kb = l1->size_in_kb;
@@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 2:
if (!l2.val)
return;
- assoc = l2.assoc;
+ assoc = assocs[l2.assoc];
line_size = l2.line_size;
lines_per_tag = l2.lines_per_tag;
/* cpu_data has errata corrections for K7 applied */
@@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
case 3:
if (!l3.val)
return;
- assoc = l3.assoc;
+ assoc = assocs[l3.assoc];
line_size = l3.line_size;
lines_per_tag = l3.lines_per_tag;
size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
break;
default:
return;
@@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.is_self_initializing = 1;
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
- if (leaf == 3)
- eax->split.num_threads_sharing =
- current_cpu_data.x86_max_cores - 1;
- else
- eax->split.num_threads_sharing = 0;
+ eax->split.num_threads_sharing = 0;
eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;


- if (assoc == 0xf)
+ if (assoc == 0xffff)
eax->split.is_fully_associative = 1;
ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assocs[assoc] - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
ebx->split.physical_line_partition = lines_per_tag - 1;
ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
(ebx->split.ways_of_associativity + 1) - 1;
@@ -522,6 +522,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);

+ if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+ struct cpuinfo_x86 *d;
+ for_each_online_cpu(i) {
+ if (!per_cpu(cpuid4_info, i))
+ continue;
+ d = &cpu_data(i);
+ this_leaf = CPUID4_INFO_IDX(i, index);
+ cpumask_copy(to_cpumask(this_leaf->shared_cpu_map),
+ d->llc_shared_map);
+ }
+ return;
+ }
this_leaf = CPUID4_INFO_IDX(cpu, index);
num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;

--
1.6.4


Subject: [PATCH 3/4] x86, mcheck: Use correct cpumask for shared bank4


This fixes threshold_bank4 support on multi-node processors.

The correct mask to use is llc_shared_map, representing an internal
node on Magny-Cours.

We need to create two sets of symlinks for the sibling shared banks --
one set per internal node, with the symlinks of each set targeting the
first core on the same internal node.

Currently only one set is created, and all of its symlinks target the
first core of the entire socket.
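
For illustration, a stand-alone sketch of the resulting symlink targets,
assuming a dual-node package with cores 0-5 on internal node 0 and
cores 6-11 on internal node 1 (the topology is an assumption for the
example):

#include <stdio.h>

int main(void)
{
        for (int cpu = 0; cpu < 12; cpu++) {
                int old_target = 0;                  /* first core of the socket */
                int new_target = (cpu < 6) ? 0 : 6;  /* first core in llc_shared_map */

                if (cpu == old_target || cpu == new_target)
                        continue;                    /* these cores own a real bank */
                printf("cpu%2d: threshold_bank4 -> cpu%d (was cpu%d)\n",
                       cpu, new_target, old_target);
        }
        return 0;
}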

Signed-off-by: Andreas Herrmann <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_amd.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ddae216..1fecba4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -489,12 +489,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
int i, err = 0;
struct threshold_bank *b = NULL;
char name[32];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+

sprintf(name, "threshold_bank%i", bank);

#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(cpu_core_mask(cpu));
+ i = cpumask_first(c->llc_shared_map);

/* first core not up yet */
if (cpu_data(i).cpu_core_id)
@@ -514,7 +516,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;

- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
per_cpu(threshold_banks, cpu)[bank] = b;

goto out;
@@ -539,7 +541,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
#endif

per_cpu(threshold_banks, cpu)[bank] = b;
--
1.6.4


Subject: [PATCH 4/4] x86, sched: Workaround broken sched domain creation for AMD Magny-Cours


The current sched domain creation code can't handle multi-node processors.
When switching to power_savings scheduling, errors show up and the
system might hang later on (due to a broken sched domain hierarchy):

# echo 0 >> /sys/devices/system/cpu/sched_mc_power_savings
CPU0 attaching sched-domain:
domain 0: span 0-5 level MC
groups: 0 1 2 3 4 5
domain 1: span 0-23 level NODE
groups: 0-5 6-11 18-23 12-17
...
# echo 1 >> /sys/devices/system/cpu/sched_mc_power_savings
CPU0 attaching sched-domain:
domain 0: span 0-11 level MC
groups: 0 1 2 3 4 5 6 7 8 9 10 11
ERROR: parent span is not a superset of domain->span
domain 1: span 0-5 level CPU
ERROR: domain->groups does not contain CPU0
groups: 6-11 (__cpu_power = 12288)
ERROR: groups don't span domain->span
domain 2: span 0-23 level NODE
groups:
ERROR: domain->cpu_power not set

ERROR: groups don't span domain->span
...

Fixing all aspects of power-savings scheduling for Magny-Cours requires
larger changes in the sched domain creation code.

As a short-term, temporary workaround, avoid the problems by extending
"the worst possible hack" ;-( and always using llc_shared_map on AMD
Magny-Cours when the MC domain span is calculated.

With this I get:

# echo 1 >> /sys/devices/system/cpu/sched_mc_power_savings
CPU0 attaching sched-domain:
domain 0: span 0-5 level MC
groups: 0 1 2 3 4 5
domain 1: span 0-5 level CPU
groups: 0-5 (__cpu_power = 6144)
domain 2: span 0-23 level NODE
groups: 0-5 (__cpu_power = 6144) 6-11 (__cpu_power = 6144) 18-23 (__cpu_power = 6144) 12-17 (__cpu_power = 6144)
...

I.e. no errors during sched domain creation, no system hangs, and
mc_power_savings scheduling also works to a certain extent.
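
For illustration, a stand-alone sketch of the cpu_coregroup_mask()
decision after this patch; the helper and strings below are
illustrative only, not the kernel symbols:

#include <stdio.h>

static const char *coregroup(int power_savings, int amd_dcm)
{
        if (power_savings && !amd_dcm)
                return "cpu_core_mask (whole package)";
        return "llc_shared_map (internal node)";
}

int main(void)
{
        printf("power savings on,  DCM unset: %s\n", coregroup(1, 0));
        printf("power savings on,  DCM set:   %s\n", coregroup(1, 1));
        printf("power savings off, DCM set:   %s\n", coregroup(0, 1));
        return 0;
}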

Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Andreas Herrmann <[email protected]>
---
arch/x86/kernel/smpboot.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2fecda6..c36cc14 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -434,7 +434,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
* For perf, we return last level cache shared map.
* And for power savings, we return cpu_core_map
*/
- if (sched_mc_power_savings || sched_smt_power_savings)
+ if ((sched_mc_power_savings || sched_smt_power_savings) &&
+ !(cpu_has(c, X86_FEATURE_AMD_DCM)))
return cpu_core_mask(cpu);
else
return c->llc_shared_map;
--
1.6.4