2012-11-27 06:34:58

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 1/4 v7] AMD64 EDAC: Add multi-domain support

Fix get_node_id to match northbridge IDs from the array of detected ones,
allowing multi-server support such as with Numascale's NumaConnect, renaming
to 'amd_get_node_id' for consistency.

v7: Refactor patches grouping changes

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 17 +++++++++++++++++
drivers/edac/amd64_edac.c | 6 +++---
drivers/edac/amd64_edac.h | 6 ------
3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index b3341e9..417eb24 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -81,6 +81,23 @@ static inline struct amd_northbridge *node_to_amd_nb(int node)
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}

+static inline u16 amd_get_node_id(struct pci_dev *pdev)
+{
+ struct pci_dev *misc;
+ int i;
+
+ for (i = 0; i != amd_nb_num(); i++) {
+ misc = node_to_amd_nb(i)->misc;
+
+ if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) &&
+ PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn))
+ return i;
+ }
+
+ WARN(1, "Unable to find AMD Northbridge identifier for %s\n", pci_name(pdev));
+ return 0;
+}
+
#else

#define amd_nb_num(x) 0
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index cc8e7c7..9ba70a5 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2546,7 +2546,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int err = 0, ret;
- u8 nid = get_node_id(F2);
+ u16 nid = amd_get_node_id(F2);

ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
@@ -2637,7 +2637,7 @@ err_ret:
static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
{
- u8 nid = get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
@@ -2687,7 +2687,7 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
- u8 nid = get_node_id(pdev);
+ u16 nid = amd_get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s = ecc_stngs[nid];

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 8c41396..cecd0c4 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -290,12 +290,6 @@
/* MSRs */
#define MSR_MCGCTL_NBE BIT(4)

-/* AMD sets the first MC device at device ID 0x18. */
-static inline u8 get_node_id(struct pci_dev *pdev)
-{
- return PCI_SLOT(pdev->devfn) - 0x18;
-}
-
enum amd_families {
K8_CPUS = 0,
F10_CPUS,
--
1.7.10.4


2012-11-27 06:35:09

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 2/4 v7] AMD64 EDAC: Consistently use u16 for northbridge IDs in amd_get_nb_id

Change amd_get_nb_id to return u16 to support >255 memory controllers,
and related consistency fixes.

v7: Refactor patches grouping changes

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/processor.h | 2 +-
arch/x86/kernel/cpu/amd.c | 4 ++--
drivers/edac/amd64_edac.c | 5 +++--
3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ad1fc85..eb3ba58 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -934,7 +934,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);

-extern int amd_get_nb_id(int cpu);
+extern u16 amd_get_nb_id(int cpu);

struct aperfmperf {
u64 aperf, mperf;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1b7d165..2e298e9 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -364,9 +364,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}

-int amd_get_nb_id(int cpu)
+u16 amd_get_nb_id(int cpu)
{
- int id = 0;
+ u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9ba70a5..60e93fa 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -942,7 +942,8 @@ static u64 get_error_address(struct mce *m)
struct amd64_pvt *pvt;
u64 cc6_base, tmp_addr;
u32 tmp;
- u8 mce_nid, intlv_en;
+ u16 mce_nid;
+ u8 intlv_en;

if ((addr & GENMASK(24, 47)) >> 24 != 0x00fdf7)
return addr;
@@ -2253,7 +2254,7 @@ static int init_csrows(struct mem_ctl_info *mci)
}

/* get all cores on this DCT */
-static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, unsigned nid)
+static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, u16 nid)
{
int cpu;

--
1.7.10.4

2012-11-27 06:35:19

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 3/4 v7] AMD64 EDAC: Fix PCI function lookup

Fix locating sibling memory controller PCI functions by using the
correct PCI domain.

v7: Refactor patches grouping changes

Signed-off-by: Daniel J Blueman <[email protected]>
---
drivers/edac/amd64_edac.c | 40 +++++++++++++++++++++-------------------
1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 60e93fa..62b7b17 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -983,6 +983,24 @@ static u64 get_error_address(struct mce *m)
return addr;
}

+static struct pci_dev *pci_get_related_function(unsigned int vendor,
+ unsigned int device,
+ struct pci_dev *related)
+{
+ struct pci_dev *dev = NULL;
+
+ dev = pci_get_device(vendor, device, dev);
+ while (dev) {
+ if (pci_domain_nr(dev->bus) == pci_domain_nr(related->bus) &&
+ (dev->bus->number == related->bus->number) &&
+ (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
+ break;
+ dev = pci_get_device(vendor, device, dev);
+ }
+
+ return dev;
+}
+
static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -1002,11 +1020,12 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)

/* Factor in CC6 save area by reading dst node's limit reg */
if (c->x86 == 0x15) {
- struct pci_dev *f1 = NULL;
+ struct pci_dev *misc, *f1 = NULL;
u8 nid = dram_dst_node(pvt, range);
u32 llim;

- f1 = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x18 + nid, 1));
+ misc = node_to_amd_nb(nid)->misc;
+ f1 = pci_get_related_function(misc->vendor, PCI_DEVICE_ID_AMD_15H_NB_F1, misc);
if (WARN_ON(!f1))
return;

@@ -1713,23 +1732,6 @@ static struct amd64_family_type amd64_family_types[] = {
},
};

-static struct pci_dev *pci_get_related_function(unsigned int vendor,
- unsigned int device,
- struct pci_dev *related)
-{
- struct pci_dev *dev = NULL;
-
- dev = pci_get_device(vendor, device, dev);
- while (dev) {
- if ((dev->bus->number == related->bus->number) &&
- (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
- break;
- dev = pci_get_device(vendor, device, dev);
- }
-
- return dev;
-}
-
/*
* These are tables of eigenvectors (one per line) which can be used for the
* construction of the syndrome tables. The modified syndrome search algorithm
--
1.7.10.4

2012-11-27 06:35:56

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH 4/4 v7] AMD64 EDAC: Fix type usage in NB IDs and memory ranges

Use appropriate types for northbridge IDs and memory ranges.

v7: Refactor patches grouping changes

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 2 +-
drivers/edac/amd64_edac.c | 20 ++++++++++----------
drivers/edac/amd64_edac.h | 6 +++---
3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 417eb24..d2e703b 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -76,7 +76,7 @@ static inline bool amd_nb_has_feature(unsigned feature)
return ((amd_northbridges.flags & feature) == feature);
}

-static inline struct amd_northbridge *node_to_amd_nb(int node)
+static inline struct amd_northbridge *node_to_amd_nb(u16 node)
{
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 62b7b17..b27412a 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -239,7 +239,7 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci)
* DRAM base/limit associated with node_id
*/
static bool amd64_base_limit_match(struct amd64_pvt *pvt, u64 sys_addr,
- unsigned nid)
+ u8 nid)
{
u64 addr;

@@ -265,7 +265,7 @@ static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
u64 sys_addr)
{
struct amd64_pvt *pvt;
- unsigned node_id;
+ u8 node_id;
u32 intlv_en, bits;

/*
@@ -1348,7 +1348,7 @@ static u8 f1x_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
}

/* Convert the sys_addr to the normalized DCT address */
-static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
+static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range,
u64 sys_addr, bool hi_rng,
u32 dct_sel_base_addr)
{
@@ -1399,7 +1399,7 @@ static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
* checks if the csrow passed in is marked as SPARED, if so returns the new
* spare row
*/
-static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
+static int f10_process_possible_spare(struct amd64_pvt *pvt, u16 dct, int csrow)
{
int tmp_cs;

@@ -1424,7 +1424,7 @@ static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
* -EINVAL: NOT FOUND
* 0..csrow = Chip-Select Row
*/
-static int f1x_lookup_addr_in_dct(u64 in_addr, u32 nid, u8 dct)
+static int f1x_lookup_addr_in_dct(u64 in_addr, u8 nid, u8 dct)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
@@ -2266,7 +2266,7 @@ static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, u16 nid)
}

/* check MCG_CTL on all the cpus on this node */
-static bool amd64_nb_mce_bank_enabled_on_node(unsigned nid)
+static bool amd64_nb_mce_bank_enabled_on_node(u16 nid)
{
cpumask_var_t mask;
int cpu, nbe;
@@ -2299,7 +2299,7 @@ out:
return ret;
}

-static int toggle_ecc_err_reporting(struct ecc_settings *s, u8 nid, bool on)
+static int toggle_ecc_err_reporting(struct ecc_settings *s, u16 nid, bool on)
{
cpumask_var_t cmask;
int cpu;
@@ -2337,7 +2337,7 @@ static int toggle_ecc_err_reporting(struct ecc_settings *s, u8 nid, bool on)
return 0;
}

-static bool enable_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static bool enable_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
bool ret = true;
@@ -2389,7 +2389,7 @@ static bool enable_ecc_error_reporting(struct ecc_settings *s, u8 nid,
return ret;
}

-static void restore_ecc_error_reporting(struct ecc_settings *s, u8 nid,
+static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid,
struct pci_dev *F3)
{
u32 value, mask = 0x3; /* UECC/CECC enable */
@@ -2428,7 +2428,7 @@ static const char *ecc_msg =
"'ecc_enable_override'.\n"
" (Note that use of the override may cause unknown side effects.)\n";

-static bool ecc_enabled(struct pci_dev *F3, u8 nid)
+static bool ecc_enabled(struct pci_dev *F3, u16 nid)
{
u32 value;
u8 ecc_en = 0;
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index cecd0c4..a558084 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -332,7 +332,7 @@ struct amd64_pvt {
/* pci_device handles which we utilize */
struct pci_dev *F1, *F2, *F3;

- unsigned mc_node_id; /* MC index of this MC node */
+ u16 mc_node_id; /* MC index of this MC node */
int ext_model; /* extended model value of this node */
int channel_count;

@@ -368,7 +368,7 @@ struct amd64_pvt {
struct error_injection injection;
};

-static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i)
{
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;

@@ -378,7 +378,7 @@ static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
return (((u64)pvt->ranges[i].base.hi & 0x000000ff) << 40) | addr;
}

-static inline u64 get_dram_limit(struct amd64_pvt *pvt, unsigned i)
+static inline u64 get_dram_limit(struct amd64_pvt *pvt, u8 i)
{
u64 lim = (((u64)pvt->ranges[i].lim.lo & 0xffff0000) << 8) | 0x00ffffff;

--
1.7.10.4

2012-11-29 13:36:56

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 3/4 v7] AMD64 EDAC: Fix PCI function lookup

On Tue, Nov 27, 2012 at 02:32:11PM +0800, Daniel J Blueman wrote:
> Fix locating sibling memory controller PCI functions by using the
> correct PCI domain.
>
> v7: Refactor patches grouping changes
>
> Signed-off-by: Daniel J Blueman <[email protected]>
> ---
> drivers/edac/amd64_edac.c | 40 +++++++++++++++++++++-------------------
> 1 file changed, 21 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 60e93fa..62b7b17 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -983,6 +983,24 @@ static u64 get_error_address(struct mce *m)
> return addr;
> }
>
> +static struct pci_dev *pci_get_related_function(unsigned int vendor,
> + unsigned int device,
> + struct pci_dev *related)
> +{
> + struct pci_dev *dev = NULL;
> +
> + dev = pci_get_device(vendor, device, dev);
> + while (dev) {
> + if (pci_domain_nr(dev->bus) == pci_domain_nr(related->bus) &&
> + (dev->bus->number == related->bus->number) &&
> + (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
> + break;
> + dev = pci_get_device(vendor, device, dev);
> + }

This loop looks strange and I'm wondering, wouldn't it be more
straightforward to simply do:

while (dev = pci_get_device(vendor, device, dev)) {
if (...)
break;
}

return dev;

I realize the original code does this pci_get_device twice but it is
crap IMO.

Thanks.

--
Regards/Gruss,
Boris.

2012-11-29 14:34:58

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 4/4 v7] AMD64 EDAC: Fix type usage in NB IDs and memory ranges

On Tue, Nov 27, 2012 at 02:32:12PM +0800, Daniel J Blueman wrote:
> Use appropriate types for northbridge IDs and memory ranges.
>
> v7: Refactor patches grouping changes
>
> Signed-off-by: Daniel J Blueman <[email protected]>
> ---
> arch/x86/include/asm/amd_nb.h | 2 +-
> drivers/edac/amd64_edac.c | 20 ++++++++++----------
> drivers/edac/amd64_edac.h | 6 +++---
> 3 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
> index 417eb24..d2e703b 100644
> --- a/arch/x86/include/asm/amd_nb.h
> +++ b/arch/x86/include/asm/amd_nb.h
> @@ -76,7 +76,7 @@ static inline bool amd_nb_has_feature(unsigned feature)
> return ((amd_northbridges.flags & feature) == feature);
> }
>
> -static inline struct amd_northbridge *node_to_amd_nb(int node)
> +static inline struct amd_northbridge *node_to_amd_nb(u16 node)
> {
> return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
> }
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 62b7b17..b27412a 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -239,7 +239,7 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci)
> * DRAM base/limit associated with node_id
> */
> static bool amd64_base_limit_match(struct amd64_pvt *pvt, u64 sys_addr,
> - unsigned nid)
> + u8 nid)
> {
> u64 addr;
>
> @@ -265,7 +265,7 @@ static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
> u64 sys_addr)
> {
> struct amd64_pvt *pvt;
> - unsigned node_id;
> + u8 node_id;
> u32 intlv_en, bits;
>
> /*
> @@ -1348,7 +1348,7 @@ static u8 f1x_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
> }
>
> /* Convert the sys_addr to the normalized DCT address */
> -static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
> +static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range,
> u64 sys_addr, bool hi_rng,
> u32 dct_sel_base_addr)
> {
> @@ -1399,7 +1399,7 @@ static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, unsigned range,
> * checks if the csrow passed in is marked as SPARED, if so returns the new
> * spare row
> */
> -static int f10_process_possible_spare(struct amd64_pvt *pvt, u8 dct, int csrow)
> +static int f10_process_possible_spare(struct amd64_pvt *pvt, u16 dct, int csrow)

This one can stay u8 since it comes from dram_dst_node() through
f1x_lookup_addr_in_dct() and it is u8 already there.

But, in general, the patches look much more straightforward and easy to
review, so please add those minor changes and I'll pick them up.

Also, I'm assuming you're testing them on both your Numascale systems
and on a normal AMD multisocket box, correct?

Thanks.

--
Regards/Gruss,
Boris.