2012-10-03 09:20:49

by Daniel J Blueman

[permalink] [raw]
Subject: [PATCH] RFC: Fix AMD Northbridge-ID contiguity assumptions

The AMD Northbridge initialisation code and EDAC assume the Northbridge IDs
are contiguous, which no longer holds on federated systems with multiple
HyperTransport fabrics with multiple PCI domains.

Address this assumption by searching the Northbridge ID array, rather than
directly indexing it, using the upper bits for the PCI domain.

Signed-off-by: Daniel J Blueman <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 17 +++++++++++++++--
arch/x86/kernel/amd_nb.c | 15 ++++++++-------
drivers/edac/amd64_edac.c | 18 +++++++++---------
drivers/edac/amd64_edac.h | 4 ++--
4 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index b3341e9..016448c 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -47,6 +47,7 @@ struct threshold_bank {
};

struct amd_northbridge {
+ u32 node;
struct pci_dev *misc;
struct pci_dev *link;
struct amd_l3_cache l3_cache;
@@ -76,15 +77,27 @@ static inline bool amd_nb_has_feature(unsigned feature)
return ((amd_northbridges.flags & feature) == feature);
}

-static inline struct amd_northbridge *node_to_amd_nb(int node)
+static inline int node_to_amd_index(u32 node)
{
- return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+ int i;
+
+ for (i = 0; i < amd_northbridges.num; i++)
+ if (amd_northbridges.nb[i].node == node)
+ return i;
+
+ return 0;
+}
+
+static inline struct amd_northbridge *node_to_amd_nb(u32 node)
+{
+ return &amd_northbridges.nb[node_to_amd_index(node)];
}

#else

#define amd_nb_num(x) 0
#define amd_nb_has_feature(x) false
+#define node_to_amd_index(x) 0
#define node_to_amd_nb(x) NULL

#endif
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index aadf335..011eca1 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -75,10 +75,9 @@ int amd_cache_northbridges(void)

link = misc = NULL;
for (i = 0; i != amd_nb_num(); i++) {
- node_to_amd_nb(i)->misc = misc =
- next_northbridge(misc, amd_nb_misc_ids);
- node_to_amd_nb(i)->link = link =
- next_northbridge(link, amd_nb_link_ids);
+ nb->misc = misc = next_northbridge(misc, amd_nb_misc_ids);
+ nb->link = link = next_northbridge(link, amd_nb_link_ids);
+ nb++;
}

/* some CPU families (e.g. family 0x11) do not support GART */
@@ -212,6 +211,7 @@ int amd_set_subcaches(int cpu, int mask)
static int amd_cache_gart(void)
{
u16 i;
+ struct amd_northbridge *nb = amd_northbridges.nb;

if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
@@ -222,9 +222,10 @@ static int amd_cache_gart(void)
return -ENOMEM;
}

- for (i = 0; i != amd_nb_num(); i++)
- pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
- &flush_words[i]);
+ for (i = 0; i != amd_nb_num(); i++) {
+ pci_read_config_dword(nb->misc, 0x9c, &flush_words[i]);
+ nb++;
+ }

return 0;
}
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5a297a2..9c35565 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2549,7 +2549,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int err = 0, ret;
- u8 nid = get_node_id(F2);
+ u32 nid = get_node_id(F2);

ret = -ENOMEM;
pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
@@ -2640,7 +2640,7 @@ err_ret:
static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,
const struct pci_device_id *mc_type)
{
- u8 nid = get_node_id(pdev);
+ u32 nid = get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
struct ecc_settings *s;
int ret = 0;
@@ -2656,7 +2656,7 @@ static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,
if (!s)
goto err_out;

- ecc_stngs[nid] = s;
+ ecc_stngs[node_to_amd_index(nid)] = s;

if (!ecc_enabled(F3, nid)) {
ret = -ENODEV;
@@ -2680,7 +2680,7 @@ static int __devinit amd64_probe_one_instance(struct pci_dev *pdev,

err_enable:
kfree(s);
- ecc_stngs[nid] = NULL;
+ ecc_stngs[node_to_amd_index(nid)] = NULL;

err_out:
return ret;
@@ -2690,9 +2690,9 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
{
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
- u8 nid = get_node_id(pdev);
+ u32 nid = get_node_id(pdev);
struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
- struct ecc_settings *s = ecc_stngs[nid];
+ struct ecc_settings *s = ecc_stngs[node_to_amd_index(nid)];

mci = find_mci_by_dev(&pdev->dev);
del_mc_sysfs_attrs(mci);
@@ -2711,12 +2711,12 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
amd_report_gart_errors(false);
amd_unregister_ecc_decoder(amd64_decode_bus_error);

- kfree(ecc_stngs[nid]);
- ecc_stngs[nid] = NULL;
+ kfree(ecc_stngs[node_to_amd_index(nid)]);
+ ecc_stngs[node_to_amd_index(nid)] = NULL;

/* Free the EDAC CORE resources */
mci->pvt_info = NULL;
- mcis[nid] = NULL;
+ mcis[node_to_amd_index(nid)] = NULL;

kfree(pvt);
edac_mc_free(mci);
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 8d48047..2a7189b 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -291,9 +291,9 @@
#define MSR_MCGCTL_NBE BIT(4)

/* AMD sets the first MC device at device ID 0x18. */
-static inline u8 get_node_id(struct pci_dev *pdev)
+static inline u32 get_node_id(struct pci_dev *pdev)
{
- return PCI_SLOT(pdev->devfn) - 0x18;
+ return (pci_domain_nr(pdev->bus) << 8) | (PCI_SLOT(pdev->devfn) - 0x18);
}

enum amd_families {
--
1.7.9.5


2012-10-03 15:16:37

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] RFC: Fix AMD Northbridge-ID contiguity assumptions

On 10/03/2012 02:20 AM, Daniel J Blueman wrote:
> The AMD Northbridge initialisation code and EDAC assume the Northbridge IDs
> are contiguous, which no longer holds on federated systems with multiple
> HyperTransport fabrics with multiple PCI domains.

Is that "on NumaScale systems"? If so, please say so rather than trying
to make it sound generic; if it is not, can you give some other examples?

-hpa

2012-10-03 15:51:16

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] RFC: Fix AMD Northbridge-ID contiguity assumptions

On 10/03/2012 08:30 AM, Daniel Blueman wrote:
> >
> > Is that "on NumaScale systems"? If so, please say so rather than trying
> > to make it sound generic; if it is not, can you give some other examples?
>
> It is for Numascale (NumaChip) systems for our purposes.
>
> Any other systems which interconnect Opterons via address space routing
> (needed for >8 HT nodes) will get this benefit. I can't put my hand to
> exactly what is out there, but can find out.
>

The reason I'm asking is because it is an important bit of the record of
the code to know if this is a specific need or a general need. This may
be obvious now, but 5-10 years from now someone will need to know why or
what.

The two paragraphs above are exactly what is needed, i.e. "NumaChip or
any other design which shares these specific design features: ..."

-hpa


--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2012-10-03 18:09:12

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] RFC: Fix AMD Northbridge-ID contiguity assumptions

On Wed, Oct 03, 2012 at 08:50:51AM -0700, H. Peter Anvin wrote:
> On 10/03/2012 08:30 AM, Daniel Blueman wrote:
> > >
> > > Is that "on NumaScale systems"? If so, please say so rather than trying
> > > to make it sound generic; if it is not, can you give some other examples?
> >
> > It is for Numascale (NumaChip) systems for our purposes.
> >
> > Any other systems which interconnect Opterons via address space routing
> > (needed for >8 HT nodes) will get this benefit. I can't put my hand to
> > exactly what is out there, but can find out.
> >
>
> The reason I'm asking is because it is an important bit of the
> record of the code to know if this is a specific need or a general
> need. This may be obvious now, but 5-10 years from now someone will
> need to know why or what.
>
> The two paragraphs above are exactly what is needed, i.e. "NumaChip
> or any other design which shares these specific design features:
> ..."

Absolutely!

And it would be best to put that explanation in the code somewhere
around node_to_amd_index() so that it is there at a first glance.

Btw, I'll review the patch tomorrow since it is a holiday today here.

I have only two nits for now:

* node is u32, do you really have such big systems with 2^32-1 nodes? Or
can the max node number fit into a, say, u16 or u8?

* node_to_amd_index() returns 0 when the lookup fails, but 0 is also a
valid index, i.e. the first element in the array of northbridges, so a
failure cannot be told apart from a match on the first northbridge. It
should probably return a negative value instead to signal a failure...

Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551