2009-11-30 17:28:50

by Randy Dunlap

[permalink] [raw]
Subject: 2.6.32-rc8: amd64_edac slub error

Hi,

Loading amd64_edac_mod on an amd64 system without the expected hardware support
causes memory usage error(s).

Is this already fixed/patched? Do you need more info?
Thanks.



# modprobe amd64_edac_mod
calling amd64_edac_init+0x0/0x849 [amd64_edac_mod] @ 22156
EDAC amd64_edac: Ver: 3.2.0 Nov 24 2009
EDAC amd64: ECC is enabled by BIOS.
=============================================================================
BUG kmalloc-16: Redzone overwritten
-----------------------------------------------------------------------------

INFO: 0xffff880139e25600-0xffff880139e25607. First byte 0x1f instead of 0xcc
INFO: Slab 0xffffea00044a9818 objects=102 used=38 fp=0xffff880139e255c8 flags=0x400000000000c3
INFO: Object 0xffff880139e255f0 @offset=1520 fp=0xffff880139e255c8

Bytes b4 0xffff880139e255e0: 18 56 e2 39 01 88 ff ff 5a 5a 5a 5a 5a 5a 5a 5a .V�9..����ZZZZZZZZ
Object 0xffff880139e255f0: 1f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
Redzone 0xffff880139e25600: 1f 00 00 00 00 00 00 00 ........
Padding 0xffff880139e25610: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
Pid: 22158, comm: work_for_cpu Not tainted 2.6.32-rc8 #2
Call Trace:
[<ffffffff81109383>] print_trailer+0x12a/0x133
[<ffffffff81109836>] check_bytes_and_report+0xa9/0xd0
[<ffffffffa008ba4c>] ? amd64_init_one_instance+0x220/0x33e [amd64_edac_mod]
[<ffffffff811098bb>] check_object+0x5e/0x1b6
[<ffffffff8110a889>] __slab_free+0x13a/0x25c
[<ffffffff8110ac2b>] kfree+0xd6/0xe8
[<ffffffffa008ba4c>] amd64_init_one_instance+0x220/0x33e [amd64_edac_mod]
[<ffffffff81072167>] ? do_work_for_cpu+0x0/0x2b
[<ffffffff812062e5>] local_pci_probe+0x17/0x1b
[<ffffffff8107217f>] do_work_for_cpu+0x18/0x2b
[<ffffffff81072167>] ? do_work_for_cpu+0x0/0x2b
[<ffffffff810769fb>] kthread+0x6e/0x76
[<ffffffff81012daa>] child_rip+0xa/0x20
[<ffffffff8107698d>] ? kthread+0x0/0x76
[<ffffffff81012da0>] ? child_rip+0x0/0x20
FIX kmalloc-16: Restoring 0xffff880139e25600-0xffff880139e25607=0xcc

EDAC amd64: NB MCE bank disabled, set MSR 0x0000017b[4] on node 0 to enable.
EDAC amd64: WARNING: ECC is disabled by BIOS. Module will NOT be loaded.
Either Enable ECC in the BIOS, or set 'ecc_enable_override'.
Also, use of the override can cause unknown side effects.
amd64_edac: probe of 0000:00:18.2 failed with error -22
EDAC amd64: ECC is enabled by BIOS.
=============================================================================
BUG kmalloc-16: Redzone overwritten
-----------------------------------------------------------------------------

INFO: 0xffff88027582e060-0xffff88027582e067. First byte 0x1f instead of 0xcc
INFO: Slab 0xffffea00089b4a10 objects=102 used=2 fp=0xffff88027582e028 flags=0xc00000000000c3
INFO: Object 0xffff88027582e050 @offset=80 fp=0xffff88027582e028

Bytes b4 0xffff88027582e040: 78 e0 82 75 02 88 ff ff 5a 5a 5a 5a 5a 5a 5a 5a x�.u..����ZZZZZZZZ
Object 0xffff88027582e050: 1f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
Redzone 0xffff88027582e060: 1f 00 00 00 00 00 00 00 [root@ca-ostest293 ~]# ........
Padding 0xffff88027582e070: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ
Pid: 22161, comm: work_for_cpu Not tainted 2.6.32-rc8 #2
Call Trace:
[<ffffffff81109383>] print_trailer+0x12a/0x133
[<ffffffff81109836>] check_bytes_and_report+0xa9/0xd0
[<ffffffffa008ba4c>] ? amd64_init_one_instance+0x220/0x33e [amd64_edac_mod]
[<ffffffff811098bb>] check_object+0x5e/0x1b6
[<ffffffff8110a889>] __slab_free+0x13a/0x25c
[<ffffffff8110ac2b>] kfree+0xd6/0xe8
[<ffffffffa008ba4c>] amd64_init_one_instance+0x220/0x33e [amd64_edac_mod]
[<ffffffff81072167>] ? do_work_for_cpu+0x0/0x2b
[<ffffffff812062e5>] local_pci_probe+0x17/0x1b
[<ffffffff8107217f>] do_work_for_cpu+0x18/0x2b
[<ffffffff81072167>] ? do_work_for_cpu+0x0/0x2b
[<ffffffff810769fb>] kthread+0x6e/0x76
[<ffffffff81012daa>] child_rip+0xa/0x20
[<ffffffff8107698d>] ? kthread+0x0/0x76
[<ffffffff81012da0>] ? child_rip+0x0/0x20
FIX kmalloc-16: Restoring 0xffff88027582e060-0xffff88027582e067=0xcc

EDAC amd64: NB MCE bank disabled, set MSR 0x0000017b[4] on node 1 to enable.
EDAC amd64: WARNING: ECC is disabled by BIOS. Module will NOT be loaded.
Either Enable ECC in the BIOS, or set 'ecc_enable_override'.
Also, use of the override can cause unknown side effects.
amd64_edac: probe of 0000:00:19.2 failed with error -22
initcall amd64_edac_init+0x0/0x849 [amd64_edac_mod] returned 0 after 1381423 usecs

--
~Randy


2009-11-30 20:35:48

by Borislav Petkov

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error

Hi Randy,

On Mon, Nov 30, 2009 at 09:28:19AM -0800, Randy Dunlap wrote:
> Loading amd64_edac_mod on an amd64 system without the expected hardware support
> causes memory usage error(s).

Well, this is new!

> Is this already fixed/patched? Do you need more info?

Nope :(.

I've tried to reproduce it here by selecting CONFIG_SLUB no success.
Please send me your config.

Also, it would be very helpful if you could enable CONFIG_EDAC_DEBUG and
run it again.

>From looking at the error trace, though, it looks like we're
not allocating enough memory for the struct msr things in
amd64_nb_mce_bank_enabled_on_node(). This is just a hunch though and you
could give the following debug patch a try:

---
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index a38831c..139bc14 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2739,8 +2739,10 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
int cpu;

for_each_online_cpu(cpu)
- if (amd_get_nb_id(cpu) == nid)
+ if (amd_get_nb_id(cpu) == nid) {
+ pr_err("%s: nid: %d, cpu: %d\n", __func__, nid, cpu);
cpumask_set_cpu(cpu, mask);
+ }
}

/* check MCG_CTL on all the cpus on this node */
@@ -2755,6 +2757,8 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

get_cpus_on_this_dct_cpumask(&mask, nid);

+ pr_err("%s: weight: %d\n", __func__, cpumask_weight(&mask));
+
msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
if (!msrs) {
amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",

--

PS. I'm travelling till the end of the week and won't have constant
access to mail but I'll do my best to fix this, sorry.

Thanks.

--
Regards/Gruss,
Boris.

2009-11-30 21:30:28

by Randy Dunlap

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error

On Mon, 30 Nov 2009 21:35:47 +0100 Borislav Petkov wrote:

> Hi Randy,
>
> On Mon, Nov 30, 2009 at 09:28:19AM -0800, Randy Dunlap wrote:
> > Loading amd64_edac_mod on an amd64 system without the expected hardware support
> > causes memory usage error(s).
>
> Well, this is new!
>
> > Is this already fixed/patched? Do you need more info?
>
> Nope :(.
>
> I've tried to reproduce it here by selecting CONFIG_SLUB no success.
> Please send me your config.

attached (after enabling EDAC_DEBUG).

> Also, it would be very helpful if you could enable CONFIG_EDAC_DEBUG and
> run it again.

doing that now.

> From looking at the error trace, though, it looks like we're
> not allocating enough memory for the struct msr things in
> amd64_nb_mce_bank_enabled_on_node(). This is just a hunch though and you
> could give the following debug patch a try:

and that.


> ---
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index a38831c..139bc14 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -2739,8 +2739,10 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
> int cpu;
>
> for_each_online_cpu(cpu)
> - if (amd_get_nb_id(cpu) == nid)
> + if (amd_get_nb_id(cpu) == nid) {
> + pr_err("%s: nid: %d, cpu: %d\n", __func__, nid, cpu);
> cpumask_set_cpu(cpu, mask);
> + }
> }
>
> /* check MCG_CTL on all the cpus on this node */
> @@ -2755,6 +2757,8 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
>
> get_cpus_on_this_dct_cpumask(&mask, nid);
>
> + pr_err("%s: weight: %d\n", __func__, cpumask_weight(&mask));
> +
> msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
> if (!msrs) {
> amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
>
> --
>
> PS. I'm travelling till the end of the week and won't have constant
> access to mail but I'll do my best to fix this, sorry.
>
> Thanks.
>
> --
> Regards/Gruss,
> Boris.


---
~Randy


Attachments:
config-amd64-edac (98.43 kB)

2009-11-30 22:48:42

by Randy Dunlap

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error

On Mon, 30 Nov 2009 21:35:47 +0100 Borislav Petkov wrote:

> Hi Randy,
>
> On Mon, Nov 30, 2009 at 09:28:19AM -0800, Randy Dunlap wrote:
> > Loading amd64_edac_mod on an amd64 system without the expected hardware support
> > causes memory usage error(s).
>
> Well, this is new!
>
> > Is this already fixed/patched? Do you need more info?
>
> Nope :(.
>
> I've tried to reproduce it here by selecting CONFIG_SLUB no success.
> Please send me your config.
>
> Also, it would be very helpful if you could enable CONFIG_EDAC_DEBUG and
> run it again.
>
> From looking at the error trace, though, it looks like we're
> not allocating enough memory for the struct msr things in
> amd64_nb_mce_bank_enabled_on_node(). This is just a hunch though and you
> could give the following debug patch a try:

Full boot log is attached. Thanks.


---
~Randy


Attachments:
edac.log (106.68 kB)
Subject: Re: 2.6.32-rc8: amd64_edac slub error

> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2367: DRAM MEM-CTL PCI Bus ID: 0000:00:18.2
> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2369: Misc device PCI Bus ID: 0000:00:18.3
> calling alsa_pcm_init+0x0/0x71 [snd_pcm] @ 1402
> initcall alsa_pcm_init+0x0/0x71 [snd_pcm] returned 0 after 17 usecs
> EDAC amd64: ECC is enabled by BIOS.
> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 0
> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 2
> amd64_nb_mce_bank_enabled_on_node: weight: 2
> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 0, MCG_CTL: 0x1f, NB MSR is enabled
> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 2, MCG_CTL: 0x0, NB MSR is disabled
> =============================================================================
> BUG kmalloc-16: Redzone overwritten
> -----------------------------------------------------------------------------

Hmm, I think I know what happens. This machine has non-contigious
core enumeration on a node (e.g. 0,2 on node 0 instead of 0,1) but
rdmsr_on_cpus assumes the former. Therefore we write outside of the
allocated msrs struct and thus the redzone overwrite. Here's a simple
fix that should take care of it. Please apply on top of the debugging
patch and catch the output again so that we could verify it.

I'll fix this properly when I get back and then maybe even backport it
depending on the intrusiveness of the changes.

Thanks.

---
drivers/edac/amd64_edac.c | 15 +++++++++++++--
1 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 139bc14..c013261 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2750,7 +2750,8 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
cpumask_t mask;
struct msr *msrs;
- int cpu, nbe, idx = 0;
+ int cpu, nbe, i, idx = 0;
+ int first_cpu, last_cpu = 0;
bool ret = false;

cpumask_clear(&mask);
@@ -2759,7 +2760,17 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

pr_err("%s: weight: %d\n", __func__, cpumask_weight(&mask));

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
+ /*
+ * calc. cores interval when non-contigious core enumeration
+ */
+ first_cpu = cpumask_first(&mask);
+
+ for (i = first_cpu; i < nr_cpu_ids; i++)
+ if (cpumask_test_cpu(i, &mask))
+ last_cpu = i;
+
+ msrs = kzalloc(sizeof(struct msr) * (last_cpu - first_cpu + 1),
+ GFP_KERNEL);
if (!msrs) {
amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
__func__);
--
1.6.4.3

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

2009-12-01 17:20:50

by Randy Dunlap

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error

Borislav Petkov wrote:
>> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2367: DRAM MEM-CTL PCI Bus ID: 0000:00:18.2
>> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2369: Misc device PCI Bus ID: 0000:00:18.3
>> calling alsa_pcm_init+0x0/0x71 [snd_pcm] @ 1402
>> initcall alsa_pcm_init+0x0/0x71 [snd_pcm] returned 0 after 17 usecs
>> EDAC amd64: ECC is enabled by BIOS.
>> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 0
>> get_cpus_on_this_dct_cpumask: nid: 0, cpu: 2
>> amd64_nb_mce_bank_enabled_on_node: weight: 2
>> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 0, MCG_CTL: 0x1f, NB MSR is enabled
>> EDAC DEBUG: in drivers/edac/amd64_edac.c, line at 2776: core: 2, MCG_CTL: 0x0, NB MSR is disabled
>> =============================================================================
>> BUG kmalloc-16: Redzone overwritten
>> -----------------------------------------------------------------------------
>
> Hmm, I think I know what happens. This machine has non-contigious
> core enumeration on a node (e.g. 0,2 on node 0 instead of 0,1) but
> rdmsr_on_cpus assumes the former. Therefore we write outside of the
> allocated msrs struct and thus the redzone overwrite. Here's a simple
> fix that should take care of it. Please apply on top of the debugging
> patch and catch the output again so that we could verify it.
>
> I'll fix this properly when I get back and then maybe even backport it
> depending on the intrusiveness of the changes.

Hi,
Here's the new log file (attached).

thanks.
--
~Randy


Attachments:
edac2.log (103.54 kB)
Subject: Re: 2.6.32-rc8: amd64_edac slub error

On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap wrote:
> Here's the new log file (attached).

Thanks for testing. Meanwhile, I noticed that the other places where
rdmsr_on_cpus() gets called with non-contigious cpumasks need fixing
too. Here's a version that takes care of that, I'd be nice if you could
give it a run too (patch against today's upstream). You could also
enforce the module loading by setting 'ecc_enable_override=1' to verify
the other rdmsr_on_cpus calls.

Thanks.

---
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index a38831c..da2428b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2618,6 +2618,9 @@ static int amd64_init_csrows(struct mem_ctl_info *mci)
return empty;
}

+static struct msr *alloc_msrs(const cpumask_t *mask);
+static void free_msrs(struct msr *msrs);
+
/*
* Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we"
* enable it.
@@ -2627,14 +2630,16 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
struct amd64_pvt *pvt = mci->pvt_info;
const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
int cpu, idx = 0, err = 0;
- struct msr msrs[cpumask_weight(cpumask)];
+ struct msr *msrs;
u32 value;
u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;

if (!ecc_enable_override)
return;

- memset(msrs, 0, sizeof(msrs));
+ msrs = alloc_msrs(cpumask);
+ if (!msrs)
+ return;

amd64_printk(KERN_WARNING,
"'ecc_enable_override' parameter is active, "
@@ -2697,20 +2702,24 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");

pvt->ctl_error_info.nbcfg = value;
+
+ free_msrs(msrs);
}

static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
{
const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
int cpu, idx = 0, err = 0;
- struct msr msrs[cpumask_weight(cpumask)];
+ struct msr *msrs;
u32 value;
u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;

if (!pvt->nbctl_mcgctl_saved)
return;

- memset(msrs, 0, sizeof(msrs));
+ msrs = alloc_msrs(cpumask);
+ if (!msrs)
+ return;

err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
if (err)
@@ -2731,6 +2740,8 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
}

wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+ free_msrs(msrs);
}

/* get all cores on this DCT */
@@ -2743,6 +2754,40 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
cpumask_set_cpu(cpu, mask);
}

+/*
+ * Allocate enough msr structs for the supplied cpumask. Also, take care of
+ * non-contigious bitmasks.
+ */
+static struct msr *alloc_msrs(const cpumask_t *mask)
+{
+ struct msr *msrs;
+ int i, first_cpu, last_cpu = 0;
+
+ if (cpumask_empty(mask)) {
+ amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n", __func__);
+ return NULL;
+ }
+
+ first_cpu = cpumask_first(mask);
+ for (i = first_cpu; i < nr_cpu_ids; i++)
+ if (cpumask_test_cpu(i, mask))
+ last_cpu = i;
+
+ msrs = kzalloc(sizeof(*msrs) * (last_cpu - first_cpu + 1), GFP_KERNEL);
+ if (!msrs) {
+ amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
+ __func__);
+ return NULL;
+ }
+
+ return msrs;
+}
+
+static void free_msrs(struct msr *msrs)
+{
+ kfree(msrs);
+}
+
/* check MCG_CTL on all the cpus on this node */
static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
@@ -2755,12 +2800,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

get_cpus_on_this_dct_cpumask(&mask, nid);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
- return false;
- }
+ msrs = alloc_msrs(&mask);
+ if (!msrs)
+ goto out_err;

rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs);

@@ -2779,7 +2821,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
ret = true;

out:
- kfree(msrs);
+ free_msrs(msrs);
+
+out_err:
return ret;
}


--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

2009-12-02 18:11:27

by Randy Dunlap

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error

On Wed, 2 Dec 2009 11:58:38 +0100 Borislav Petkov wrote:

> On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap wrote:
> > Here's the new log file (attached).
>
> Thanks for testing. Meanwhile, I noticed that the other places where
> rdmsr_on_cpus() gets called with non-contigious cpumasks need fixing
> too. Here's a version that takes care of that, I'd be nice if you could
> give it a run too (patch against today's upstream). You could also
> enforce the module loading by setting 'ecc_enable_override=1' to verify
> the other rdmsr_on_cpus calls.
>
> Thanks.

This patch also works for me. Thanks.

Acked-by: Randy Dunlap <[email protected]>


boot log attached.

> ---
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index a38831c..da2428b 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -2618,6 +2618,9 @@ static int amd64_init_csrows(struct mem_ctl_info *mci)
> return empty;
> }
>
> +static struct msr *alloc_msrs(const cpumask_t *mask);
> +static void free_msrs(struct msr *msrs);
> +
> /*
> * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we"
> * enable it.
> @@ -2627,14 +2630,16 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
> struct amd64_pvt *pvt = mci->pvt_info;
> const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
> int cpu, idx = 0, err = 0;
> - struct msr msrs[cpumask_weight(cpumask)];
> + struct msr *msrs;
> u32 value;
> u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
>
> if (!ecc_enable_override)
> return;
>
> - memset(msrs, 0, sizeof(msrs));
> + msrs = alloc_msrs(cpumask);
> + if (!msrs)
> + return;
>
> amd64_printk(KERN_WARNING,
> "'ecc_enable_override' parameter is active, "
> @@ -2697,20 +2702,24 @@ static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
> (value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
>
> pvt->ctl_error_info.nbcfg = value;
> +
> + free_msrs(msrs);
> }
>
> static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
> {
> const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
> int cpu, idx = 0, err = 0;
> - struct msr msrs[cpumask_weight(cpumask)];
> + struct msr *msrs;
> u32 value;
> u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
>
> if (!pvt->nbctl_mcgctl_saved)
> return;
>
> - memset(msrs, 0, sizeof(msrs));
> + msrs = alloc_msrs(cpumask);
> + if (!msrs)
> + return;
>
> err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
> if (err)
> @@ -2731,6 +2740,8 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
> }
>
> wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
> +
> + free_msrs(msrs);
> }
>
> /* get all cores on this DCT */
> @@ -2743,6 +2754,40 @@ static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
> cpumask_set_cpu(cpu, mask);
> }
>
> +/*
> + * Allocate enough msr structs for the supplied cpumask. Also, take care of
> + * non-contigious bitmasks.
> + */
> +static struct msr *alloc_msrs(const cpumask_t *mask)
> +{
> + struct msr *msrs;
> + int i, first_cpu, last_cpu = 0;
> +
> + if (cpumask_empty(mask)) {
> + amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n", __func__);
> + return NULL;
> + }
> +
> + first_cpu = cpumask_first(mask);
> + for (i = first_cpu; i < nr_cpu_ids; i++)
> + if (cpumask_test_cpu(i, mask))
> + last_cpu = i;
> +
> + msrs = kzalloc(sizeof(*msrs) * (last_cpu - first_cpu + 1), GFP_KERNEL);
> + if (!msrs) {
> + amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> + __func__);
> + return NULL;
> + }
> +
> + return msrs;
> +}
> +
> +static void free_msrs(struct msr *msrs)
> +{
> + kfree(msrs);
> +}
> +
> /* check MCG_CTL on all the cpus on this node */
> static bool amd64_nb_mce_bank_enabled_on_node(int nid)
> {
> @@ -2755,12 +2800,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
>
> get_cpus_on_this_dct_cpumask(&mask, nid);
>
> - msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
> - if (!msrs) {
> - amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> - __func__);
> - return false;
> - }
> + msrs = alloc_msrs(&mask);
> + if (!msrs)
> + goto out_err;
>
> rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs);
>
> @@ -2779,7 +2821,9 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
> ret = true;
>
> out:
> - kfree(msrs);
> + free_msrs(msrs);
> +
> +out_err:
> return ret;
> }
>
>
> --
> Regards/Gruss,
> Boris.
>
> Operating | Advanced Micro Devices GmbH
> System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
> Research | Gesch?ftsf?hrer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
> Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
> (OSRC) | Registergericht M?nchen, HRB Nr. 43632
>


---
~Randy


Attachments:
edac-ml.log (105.84 kB)

2009-12-02 22:12:01

by Doug Thompson

[permalink] [raw]
Subject: Re: 2.6.32-rc8: amd64_edac slub error



--- On Wed, 12/2/09, Randy Dunlap <[email protected]> wrote:

> From: Randy Dunlap <[email protected]>
> Subject: Re: 2.6.32-rc8: amd64_edac slub error
> To: "Borislav Petkov" <[email protected]>
> Cc: "Borislav Petkov" <[email protected]>, "LKML" <[email protected]>, "Doug Thompson" <[email protected]>
> Date: Wednesday, December 2, 2009, 11:11 AM
> On Wed, 2 Dec 2009 11:58:38 +0100
> Borislav Petkov wrote:
>
> > On Tue, Dec 01, 2009 at 09:19:31AM -0800, Randy Dunlap
> wrote:
> > > Here's the new log file (attached).
> >
> > Thanks for testing. Meanwhile, I noticed that the
> other places where
> > rdmsr_on_cpus() gets called with non-contigious
> cpumasks need fixing
> > too. Here's a version that takes care of that, I'd be
> nice if you could
> > give it a run too (patch against today's upstream).
> You could also
> > enforce the module loading by setting
> 'ecc_enable_override=1' to verify
> > the other rdmsr_on_cpus calls.
> >
> > Thanks.
>
> This patch also works for me.? Thanks.
>
> Acked-by: Randy Dunlap <[email protected]>

Acked-by: Doug Thompson <[email protected]>

>
>
> boot log attached.
>
> > ---
> > diff --git a/drivers/edac/amd64_edac.c
> b/drivers/edac/amd64_edac.c
> > index a38831c..da2428b 100644
> > --- a/drivers/edac/amd64_edac.c
> > +++ b/drivers/edac/amd64_edac.c
> > @@ -2618,6 +2618,9 @@ static int
> amd64_init_csrows(struct mem_ctl_info *mci)
> >? ??? return empty;
> >? }
> >?
> > +static struct msr *alloc_msrs(const cpumask_t
> *mask);
> > +static void free_msrs(struct msr *msrs);
> > +
> >? /*
> >???* Only if 'ecc_enable_override' is
> set AND BIOS had ECC disabled, do "we"
> >???* enable it.
> > @@ -2627,14 +2630,16 @@ static void
> amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
> >? ??? struct amd64_pvt *pvt =
> mci->pvt_info;
> >? ??? const cpumask_t *cpumask =
> cpumask_of_node(pvt->mc_node_id);
> >? ??? int cpu, idx = 0, err = 0;
> > -??? struct msr
> msrs[cpumask_weight(cpumask)];
> > +??? struct msr *msrs;
> >? ??? u32 value;
> >? ??? u32 mask = K8_NBCTL_CECCEn |
> K8_NBCTL_UECCEn;
> >?
> >? ??? if (!ecc_enable_override)
> >? ??? ??? return;
> >?
> > -??? memset(msrs, 0, sizeof(msrs));
> > +??? msrs = alloc_msrs(cpumask);
> > +??? if (!msrs)
> > +??? ??? return;
> >?
> >? ??? amd64_printk(KERN_WARNING,
> >? ??? ???
> "'ecc_enable_override' parameter is active, "
> > @@ -2697,20 +2702,24 @@ static void
> amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
> >? ??? ??? (value
> & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
> >?
> >? ??? pvt->ctl_error_info.nbcfg
> = value;
> > +
> > +??? free_msrs(msrs);
> >? }
> >?
> >? static void
> amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
> >? {
> >? ??? const cpumask_t *cpumask =
> cpumask_of_node(pvt->mc_node_id);
> >? ??? int cpu, idx = 0, err = 0;
> > -??? struct msr
> msrs[cpumask_weight(cpumask)];
> > +??? struct msr *msrs;
> >? ??? u32 value;
> >? ??? u32 mask = K8_NBCTL_CECCEn |
> K8_NBCTL_UECCEn;
> >?
> >? ??? if
> (!pvt->nbctl_mcgctl_saved)
> >? ??? ??? return;
> >?
> > -??? memset(msrs, 0, sizeof(msrs));
> > +??? msrs = alloc_msrs(cpumask);
> > +??? if (!msrs)
> > +??? ??? return;
> >?
> >? ??? err =
> pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL,
> &value);
> >? ??? if (err)
> > @@ -2731,6 +2740,8 @@ static void
> amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
> >? ??? }
> >?
> >? ??? wrmsr_on_cpus(cpumask,
> K8_MSR_MCGCTL, msrs);
> > +
> > +??? free_msrs(msrs);
> >? }
> >?
> >? /* get all cores on this DCT */
> > @@ -2743,6 +2754,40 @@ static void
> get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
> >? ??? ???
> ??? cpumask_set_cpu(cpu, mask);
> >? }
> >?
> > +/*
> > + * Allocate enough msr structs for the supplied
> cpumask. Also, take care of
> > + * non-contigious bitmasks.
> > + */
> > +static struct msr *alloc_msrs(const cpumask_t *mask)
> > +{
> > +??? struct msr *msrs;
> > +??? int i, first_cpu, last_cpu = 0;
> > +
> > +??? if (cpumask_empty(mask)) {
> > +??? ???
> amd64_printk(KERN_WARNING, "%s: Empty cpumask!\n",
> __func__);
> > +??? ??? return NULL;
> > +??? }
> > +
> > +??? first_cpu = cpumask_first(mask);
> > +??? for (i = first_cpu; i <
> nr_cpu_ids; i++)
> > +??? ??? if
> (cpumask_test_cpu(i, mask))
> > +??? ???
> ??? last_cpu = i;
> > +
> > +??? msrs = kzalloc(sizeof(*msrs) *
> (last_cpu - first_cpu + 1), GFP_KERNEL);
> > +??? if (!msrs) {
> > +??? ???
> amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> > +??? ???
> ??? ? ? ? __func__);
> > +???
> ?????return NULL;
> > +??? }
> > +
> > +??? return msrs;
> > +}
> > +
> > +static void free_msrs(struct msr *msrs)
> > +{
> > +?????kfree(msrs);
> > +}
> > +
> >? /* check MCG_CTL on all the cpus on this node
> */
> >? static bool
> amd64_nb_mce_bank_enabled_on_node(int nid)
> >? {
> > @@ -2755,12 +2800,9 @@ static bool
> amd64_nb_mce_bank_enabled_on_node(int nid)
> >?
> >? ???
> get_cpus_on_this_dct_cpumask(&mask, nid);
> >?
> > -??? msrs = kzalloc(sizeof(struct msr)
> * cpumask_weight(&mask), GFP_KERNEL);
> > -??? if (!msrs) {
> > -??? ???
> amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> > -??? ???
> ??? ? ? ? __func__);
> > -???
> ?????return false;
> > -??? }
> > +??? msrs = alloc_msrs(&mask);
> > +??? if (!msrs)
> > +??? ??? goto out_err;
> >?
> >? ??? rdmsr_on_cpus(&mask,
> MSR_IA32_MCG_CTL, msrs);
> >?
> > @@ -2779,7 +2821,9 @@ static bool
> amd64_nb_mce_bank_enabled_on_node(int nid)
> >? ??? ret = true;
> >?
> >? out:
> > -??? kfree(msrs);
> > +??? free_msrs(msrs);
> > +
> > +out_err:
> >? ??? return ret;
> >? }
> >?
> >
> > --
> > Regards/Gruss,
> > Boris.
> >
> > Operating | Advanced Micro Devices GmbH
> >???System? |
> Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen,
> Germany
> >? Research | Gesch?ftsf?hrer: Andrew Bowd,
> Thomas M. McCoy, Giuliano Meroni
> >???Center? | Sitz: Dornach,
> Gemeinde Aschheim, Landkreis M?nchen
> >???(OSRC)? | Registergericht
> M?nchen, HRB Nr. 43632
> >
>
>
> ---
> ~Randy
>