2012-11-05 06:06:01

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 1/3, v5] AMD64 EDAC: Add muli-domain support

Fix the handling of memory controller detection to index the array
of detected Northbridges, allowing memory controllers over multiple
PCI domains in federated systems eg using Numascale's NumaConnect/
NumaChip.

v4: Generate linear Northbridge ID by indexing detected Northbridges
v5: Reorder functions to prevent extra function declaration; merge 4th
patch; simplify Fam15h code; add detail to warning

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 13 +++++++++++
drivers/edac/amd64_edac.c | 49 ++++++++++++++++++++++-------------------
drivers/edac/amd64_edac.h | 6 -----
3 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index b3341e9..9f5532a 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -81,6 +81,19 @@ static inline struct amd_northbridge *node_to_amd_nb(int node)
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}

+static inline u16 amd_get_node_id(struct pci_dev *pdev)
+{
+ int i;
+
+ for (i = 0; i != amd_nb_num(); i++)
+ if (pci_domain_nr(node_to_amd_nb(i)->misc->bus) == pci_domain_nr(pdev->bus) &&
+ PCI_SLOT(node_to_amd_nb(i)->misc->devfn) == PCI_SLOT(pdev->devfn))
+ return i;
+
+ WARN(1, "Unable to find AMD Northbridge identifier for %s\n", pci_name(pdev));
+ return 0;
+}
+
#else

#define amd_nb_num(x) 0
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index cc8e7c7..852f1cd 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -982,6 +982,24 @@ static u64 get_error_address(struct mce *m)
return addr;
}

+static struct pci_dev *pci_get_related_function(unsigned int vendor,
+ unsigned int device,
+ struct pci_dev *related)
+{
+ struct pci_dev *dev = NULL;
+
+ dev = pci_get_device(vendor, device, dev);
+ while (dev) {
+ if (pci_domain_nr(dev->bus) == pci_domain_nr(related->bus) &&
+ (dev->bus->number == related->bus->number) &&
+ (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
+ break;
+ dev = pci_get_device(vendor, device, dev);
+ }
+
+ return dev;
+}
+
static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -1001,11 +1019,13 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)

/* Factor in CC6 save area by reading dst node's limit reg */
if (c->x86 == 0x15) {
- struct pci_dev *f1 = NULL;
- u8 nid = dram_dst_node(pvt, range);
+ struct pci_dev *misc, *f1 = NULL;
+ struct amd64_family_type *fam_type;
+ u16 nid = dram_dst_node(pvt, range);
u32 llim;

- f1 = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x18 + nid, 1));
+ misc = node_to_amd_nb(nid)->misc;
+ f1 = pci_get_related_function(misc->vendor, PCI_DEVICE_ID_AMD_15H_NB_F1, misc);
if (WARN_ON(!f1))
return;

@@ -1712,23 +1732,6 @@ static struct amd64_family_type amd64_family_types[] = {
},
};

-static struct pci_dev *pci_get_related_function(unsigned int vendor,
- unsigned int device,
- struct pci_dev *related)
-{
- struct pci_dev *dev = NULL;
-
- dev = pci_get_device(vendor, device, dev);
- while (dev) {
- if ((dev->bus->number == related->bus->number) &&
- (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
- break;
- dev = pci_get_device(vendor, device, dev);
- }
-
- return dev;
-}
-
/*
* These are tables of eigenvectors (one per line) which can be used for the
* construction of the syndrome tables. The modified syndrome search algorithm
@@ -2546,7 +2549,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int err = 0, ret;
- u8 nid = get_node_id(F2);
+ u8 nid = amd_get_node_id(F2);

ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
@@ -2637,7 +2640,7 @@ err_ret:
static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
{
- u8 nid = get_node_id(pdev);
+ u8 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
@@ -2687,7 +2690,7 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
- u8 nid = get_node_id(pdev);
+ u8 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s = ecc_stngs[nid];

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 8d48047..90cae61 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -290,12 +290,6 @@
/* MSRs */
#define MSR_MCGCTL_NBE BIT(4)

-/* AMD sets the first MC device at device ID 0x18. */
-static inline u8 get_node_id(struct pci_dev *pdev)
-{
- return PCI_SLOT(pdev->devfn) - 0x18;
-}
-
enum amd_families {
K8_CPUS = 0,
F10_CPUS,
--
1.7.10.4


2012-11-05 06:06:10

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 2/3, v3] AMD64 EDAC: Support >255 memory controllers

As the AMD64 last-level-cache ID is 16-bits and federated systems
eg using Numascale's NumaConnect/NumaChip can have more than 255 memory
controllers, use 16-bits to store the ID.

v2: Avoid change to intlv_en variable
v3: Drop unneeded change to index

Signed-off-by: Daniel J Blueman <[email protected]>
---
drivers/edac/amd64_edac.c | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 852f1cd..5dfe452 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -942,7 +942,8 @@ static u64 get_error_address(struct mce *m)
struct amd64_pvt *pvt;
u64 cc6_base, tmp_addr;
u32 tmp;
- u8 mce_nid, intlv_en;
+ u16 mce_nid;
+ u8 intlv_en;

if ((addr & GENMASK(24, 47)) >> 24 != 0x00fdf7)
return addr;
@@ -2299,7 +2300,7 @@ out:
return ret;
}

-static int toggle_ecc_err_reporting(struct ecc_settings *s, u8 nid, bool on)
+static int toggle_ecc_err_reporting(struct ecc_settings *s, u16 nid, bool on)
{
cpumask_var_t cmask;
int cpu;
@@ -2337,7 +2338,7 @@ static int toggle_ecc_err_reporting(struct ecc_settings *s, u8 nid, bool on)
return 0;
}

-static bool enable_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static bool enable_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
bool ret = true;
@@ -2389,7 +2390,7 @@ static bool enable_ecc_error_reporting(struct ecc_settings *s, u8 nid,
return ret;
}

-static void restore_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
u32 value, mask = 0x3; /* UECC/CECC enable */
@@ -2428,7 +2429,7 @@ static const char *ecc_msg =
"'ecc_enable_override'.\n"
" (Note that use of the override may cause unknown side effects.)\n";

-static bool ecc_enabled(struct pci_dev *F3, u8 nid)
+static bool ecc_enabled(struct pci_dev *F3, u16 nid)
{
u32 value;
u8 ecc_en = 0;
@@ -2549,7 +2550,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int err = 0, ret;
- u8 nid = amd_get_node_id(F2);
+ u16 nid = amd_get_node_id(F2);

ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
@@ -2640,7 +2641,7 @@ err_ret:
static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
{
- u8 nid = amd_get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
@@ -2690,7 +2691,7 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
- u8 nid = amd_get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s = ecc_stngs[nid];

--
1.7.10.4

2012-11-05 06:06:12

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 3/3, v2] AMD64 EDAC: Cleanup type usage to be consistent

As the Northbridge IDs are at most 16-bits, use the same type
consistently and cleanup some indexes to use smaller types.

v2: Drop unneeded changes and changes Boris will cleanup later

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 2 +-
arch/x86/include/asm/processor.h | 2 +-
arch/x86/kernel/cpu/amd.c | 4 ++--
drivers/edac/amd64_edac.c | 14 +++++++-------
drivers/edac/amd64_edac.h | 6 +++---
5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 9f5532a..b0815a0 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -76,7 +76,7 @@ static inline bool amd_nb_has_feature(unsigned feature)
return ((amd_northbridges.flags & feature) == feature);
}

-static inline struct amd_northbridge *node_to_amd_nb(int node)
+static inline struct amd_northbridge *node_to_amd_nb(u16 node)
{
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ad1fc85..eb3ba58 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -934,7 +934,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);

-extern int amd_get_nb_id(int cpu);
+extern u16 amd_get_nb_id(int cpu);

struct aperfmperf {
u64 aperf, mperf;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f7e98a2..52cab1f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -364,9 +364,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}

-int amd_get_nb_id(int cpu)
+u16 amd_get_nb_id(int cpu)
{
- int id = 0;
+ u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5dfe452..a3e297a 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -239,7 +239,7 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci)
* DRAM base/limit associated with node_id
*/
static bool amd64_base_limit_match(struct amd64_pvt *pvt, u64 sys_addr,
- unsigned nid)
+ u8 nid)
{
u64 addr;

@@ -265,7 +265,7 @@ static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
u64 sys_addr)
{
struct amd64_pvt *pvt;
- unsigned node_id;
+ u8 node_id;
u32 intlv_en, bits;

/*
@@ -1349,7 +1349,7 @@ static u8 f1x_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
}

/* Convert the sys_addr to the normalized DCT address */
-static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
+static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range,
u64 sys_addr, bool hi_rng,
u32 dct_sel_base_addr)
{
@@ -1400,7 +1400,7 @@ static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
* checks if the csrow passed in is marked as SPARED, if so returns the new
* spare row
*/
-static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
+static int f10_process_possible_spare(struct amd64_pvt *pvt, u16 dct, int csrow)
{
int tmp_cs;

@@ -1425,7 +1425,7 @@ static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
* -EINVAL: NOT FOUND
* 0..csrow = Chip-Select Row
*/
-static int f1x_lookup_addr_in_dct(u64 in_addr, u32 nid, u8 dct)
+static int f1x_lookup_addr_in_dct(u64 in_addr, u16 nid, u8 dct)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
@@ -2257,7 +2257,7 @@ static int init_csrows(struct mem_ctl_info *mci)
}

/* get all cores on this DCT */
-static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
+static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, u16 nid)
{
int cpu;

@@ -2267,7 +2267,7 @@ static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
}

/* check MCG_CTL on all the cpus on this node */
-static bool amd64_nb_mce_bank_enabled_on_node(unsigned nid)
+static bool amd64_nb_mce_bank_enabled_on_node(u16 nid)
{
cpumask_var_t mask;
int cpu, nbe;
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 90cae61..a2ea6a4 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -332,7 +332,7 @@ struct amd64_pvt {
/* pci_device handles which we utilize */
struct pci_dev *F1, *F2, *F3;

- unsigned mc_node_id; /* MC index of this MC node */
+ u16 mc_node_id; /* MC index of this MC node */
int ext_model; /* extended model value of this node */
int channel_count;

@@ -368,7 +368,7 @@ struct amd64_pvt {
struct error_injection injection;
};

-static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i)
{
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;

@@ -378,7 +378,7 @@ static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
return (((u64)pvt->ranges[i].base.hi & 0x000000ff) << 40) | addr;
}

-static inline u64 get_dram_limit(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_limit(struct amd64_pvt *pvt, u8 i)
{
u64 lim = (((u64)pvt->ranges[i].lim.lo & 0xffff0000) << 8) | 0x00ffffff;

--
1.7.10.4

2012-11-12 13:24:40

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3, v5] AMD64 EDAC: Add muli-domain support

On Mon, Nov 05, 2012 at 02:05:24PM +0800, Daniel J Blueman wrote:
> Fix the handling of memory controller detection to index the array
> of detected Northbridges, allowing memory controllers over multiple
> PCI domains in federated systems eg using Numascale's NumaConnect/
> NumaChip.
>
> v4: Generate linear Northbridge ID by indexing detected Northbridges
> v5: Reorder functions to prevent extra function declaration; merge 4th
> patch; simplify Fam15h code; add detail to warning
>
> Signed-off-by: Daniel J Blueman <[email protected]>

Acked-by: Borislav Petkov <[email protected]>

Btw, I don't have access to a multi-socket single-board AMD system right
now so would you please test the patchset on such a system too, if you
haven't done so yet?

Thanks a lot.

--
Regards/Gruss,
Boris.

2012-11-16 08:46:26

by Daniel J Blueman

[permalink] [raw]
Subject: Re: [PATCH 1/3, v5] AMD64 EDAC: Add muli-domain support

On 12/11/2012 21:24, Borislav Petkov wrote:
> On Mon, Nov 05, 2012 at 02:05:24PM +0800, Daniel J Blueman wrote:
>> Fix the handling of memory controller detection to index the array
>> of detected Northbridges, allowing memory controllers over multiple
>> PCI domains in federated systems eg using Numascale's NumaConnect/
>> NumaChip.
>>
>> v4: Generate linear Northbridge ID by indexing detected Northbridges
>> v5: Reorder functions to prevent extra function declaration; merge 4th
>> patch; simplify Fam15h code; add detail to warning
>>
>> Signed-off-by: Daniel J Blueman <[email protected]>
>
> Acked-by: Borislav Petkov <[email protected]>
>
> Btw, I don't have access to a multi-socket single-board AMD system right
> now so would you please test the patchset on such a system too, if you
> haven't done so yet?
>
> Thanks a lot.

Yep, the expected memory controller indexes, population, column-strobe
rows, banks and sysfs paths are detected on my hex-northbridge fam10h
box with 3.7-rc5 with these patches:

EDAC MC: Ver: 3.0.0
AMD64 EDAC driver v3.4.0
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 0).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC0: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:18.2
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 1).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC1: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:19.2
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 2).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC2: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:1a.2
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 3).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC3: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:1b.2
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 4).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC4: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:1c.2
EDAC amd64: DRAM ECC enabled.
EDAC amd64: F10h detected (node 5).
EDAC MC: DCT0 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC MC: DCT1 chip selects:
EDAC amd64: MC: 0: 0MB 1: 0MB
EDAC amd64: MC: 2: 4096MB 3: 4096MB
EDAC amd64: MC: 4: 0MB 5: 0MB
EDAC amd64: MC: 6: 0MB 7: 0MB
EDAC amd64: using x8 syndromes.
EDAC amd64: MCT channel count: 2
EDAC amd64: CS2: Registered DDR3 RAM
EDAC amd64: CS3: Registered DDR3 RAM
EDAC MC5: Giving out device to 'amd64_edac' 'F10h': DEV 0000:00:1d.2
EDAC PCI0: Giving out device to module 'amd64_edac' controller 'EDAC PCI
controller': DEV '0000:00:18.2' (POLLED)

root@ibm-x3755-01:/sys/devices/system/edac# ls -d mc/mc*/{rank*,csrow*}
mc/mc0/csrow2 mc/mc1/csrow2 mc/mc2/csrow2 mc/mc3/csrow2
mc/mc4/csrow2 mc/mc5/csrow2
mc/mc0/csrow3 mc/mc1/csrow3 mc/mc2/csrow3 mc/mc3/csrow3
mc/mc4/csrow3 mc/mc5/csrow3
mc/mc0/rank10 mc/mc1/rank10 mc/mc2/rank10 mc/mc3/rank10
mc/mc4/rank10 mc/mc5/rank10
mc/mc0/rank11 mc/mc1/rank11 mc/mc2/rank11 mc/mc3/rank11
mc/mc4/rank11 mc/mc5/rank11
mc/mc0/rank2 mc/mc1/rank2 mc/mc2/rank2 mc/mc3/rank2 mc/mc4/rank2
mc/mc5/rank2
mc/mc0/rank3 mc/mc1/rank3 mc/mc2/rank3 mc/mc3/rank3 mc/mc4/rank3
mc/mc5/rank3
--
Daniel J Blueman
Principal Software Engineer, Numascale Asia

2012-11-17 14:50:12

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3, v5] AMD64 EDAC: Add muli-domain support

On Fri, Nov 16, 2012 at 04:46:20PM +0800, Daniel J Blueman wrote:
> Yep, the expected memory controller indexes, population, column-strobe
> rows, banks and sysfs paths are detected on my hex-northbridge fam10h
> box with 3.7-rc5 with these patches:

Thanks, it looks correct to me.

--
Regards/Gruss,
Boris.

2012-11-17 14:50:35

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 3/3, v2] AMD64 EDAC: Cleanup type usage to be consistent

On Mon, Nov 05, 2012 at 02:05:26PM +0800, Daniel J Blueman wrote:
> As the Northbridge IDs are at most 16-bits, use the same type
> consistently and cleanup some indexes to use smaller types.
>
> v2: Drop unneeded changes and changes Boris will cleanup later
>
> Signed-off-by: Daniel J Blueman <[email protected]>
> ---
> arch/x86/include/asm/amd_nb.h | 2 +-
> arch/x86/include/asm/processor.h | 2 +-
> arch/x86/kernel/cpu/amd.c | 4 ++--
> drivers/edac/amd64_edac.c | 14 +++++++-------
> drivers/edac/amd64_edac.h | 6 +++---
> 5 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
> index 9f5532a..b0815a0 100644
> --- a/arch/x86/include/asm/amd_nb.h
> +++ b/arch/x86/include/asm/amd_nb.h
> @@ -76,7 +76,7 @@ static inline bool amd_nb_has_feature(unsigned feature)
> return ((amd_northbridges.flags & feature) == feature);
> }
>
> -static inline struct amd_northbridge *node_to_amd_nb(int node)
> +static inline struct amd_northbridge *node_to_amd_nb(u16 node)
> {
> return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
> }
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index ad1fc85..eb3ba58 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -934,7 +934,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
> extern int get_tsc_mode(unsigned long adr);
> extern int set_tsc_mode(unsigned int val);
>
> -extern int amd_get_nb_id(int cpu);
> +extern u16 amd_get_nb_id(int cpu);

This is correct - this function actually returns cpu_llc_id which is
u16. However, other places in the kernel save the result in an int which
is not absolutely kosher but I guess this is ok since sizeof(int) >=
sizeof(u16) on x86.

Someone should probably go and fix the rest of the places where
amd_get_nb_id is being used when someone is bored :-).

> struct aperfmperf {
> u64 aperf, mperf;
> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> index f7e98a2..52cab1f 100644
> --- a/arch/x86/kernel/cpu/amd.c
> +++ b/arch/x86/kernel/cpu/amd.c
> @@ -364,9 +364,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
> #endif
> }
>
> -int amd_get_nb_id(int cpu)
> +u16 amd_get_nb_id(int cpu)
> {
> - int id = 0;
> + u16 id = 0;
> #ifdef CONFIG_SMP
> id = per_cpu(cpu_llc_id, cpu);
> #endif
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 5dfe452..a3e297a 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -239,7 +239,7 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci)
> * DRAM base/limit associated with node_id
> */
> static bool amd64_base_limit_match(struct amd64_pvt *pvt, u64 sys_addr,
> - unsigned nid)
> + u8 nid)
> {
> u64 addr;
>
> @@ -265,7 +265,7 @@ static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
> u64 sys_addr)
> {
> struct amd64_pvt *pvt;
> - unsigned node_id;
> + u8 node_id;
> u32 intlv_en, bits;
>
> /*
> @@ -1349,7 +1349,7 @@ static u8 f1x_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
> }
>
> /* Convert the sys_addr to the normalized DCT address */
> -static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
> +static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range,
> u64 sys_addr, bool hi_rng,
> u32 dct_sel_base_addr)
> {
> @@ -1400,7 +1400,7 @@ static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
> * checks if the csrow passed in is marked as SPARED, if so returns the new
> * spare row
> */
> -static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
> +static int f10_process_possible_spare(struct amd64_pvt *pvt, u16 dct, int csrow)
> {
> int tmp_cs;
>
> @@ -1425,7 +1425,7 @@ static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
> * -EINVAL: NOT FOUND
> * 0..csrow = Chip-Select Row
> */
> -static int f1x_lookup_addr_in_dct(u64 in_addr, u32 nid, u8 dct)
> +static int f1x_lookup_addr_in_dct(u64 in_addr, u16 nid, u8 dct)

This nid comes from dram_dst_node and it is

#define dram_dst_node(pvt, i) ((u8)(pvt->ranges[i].lim.lo & 0x7))

so nid is only three bits wide. It actually should be u8.

> {
> struct mem_ctl_info *mci;
> struct amd64_pvt *pvt;
> @@ -2257,7 +2257,7 @@ static int init_csrows(struct mem_ctl_info *mci)
> }
>
> /* get all cores on this DCT */
> -static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
> +static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, u16 nid)
> {
> int cpu;
>
> @@ -2267,7 +2267,7 @@ static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
> }
>
> /* check MCG_CTL on all the cpus on this node */
> -static bool amd64_nb_mce_bank_enabled_on_node(unsigned nid)
> +static bool amd64_nb_mce_bank_enabled_on_node(u16 nid)
> {
> cpumask_var_t mask;
> int cpu, nbe;
> diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
> index 90cae61..a2ea6a4 100644
> --- a/drivers/edac/amd64_edac.h
> +++ b/drivers/edac/amd64_edac.h
> @@ -332,7 +332,7 @@ struct amd64_pvt {
> /* pci_device handles which we utilize */
> struct pci_dev *F1, *F2, *F3;
>
> - unsigned mc_node_id; /* MC index of this MC node */
> + u16 mc_node_id; /* MC index of this MC node */
> int ext_model; /* extended model value of this node */
> int channel_count;
>
> @@ -368,7 +368,7 @@ struct amd64_pvt {
> struct error_injection injection;
> };
>
> -static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
> +static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i)
> {
> u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
>
> @@ -378,7 +378,7 @@ static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
> return (((u64)pvt->ranges[i].base.hi & 0x000000ff) << 40) | addr;
> }
>
> -static inline u64 get_dram_limit(struct amd64_pvt *pvt, unsigned i)
> +static inline u64 get_dram_limit(struct amd64_pvt *pvt, u8 i)
> {
> u64 lim = (((u64)pvt->ranges[i].lim.lo & 0xffff0000) << 8) | 0x00ffffff;
>
> --
> 1.7.10.4
>
>

--
Regards/Gruss,
Boris.