Subject: [PATCH] x86, msr: add support for non-contiguous cpumasks

Hi Peter,

can you please take a look at the following patch which fixes a problem
with non-contiguous cpumasks submitted to rd/wrmsr_on_cpus() (see URL in
the commit message below) and let me know whether this approach looks
ok. If yes, I could submit it with my patchqueue along with the rest of
the amd64_edac updates, as it'll be very convenient if we could get it in
now so that it could be backported to .32.

Thanks.

---
The current rd/wrmsr_on_cpus helpers assume that the supplied
cpumasks are contiguous. However, there are machines out there
like some K8 multinode Opterons which have a non-contiguous core
enumeration on each node (e.g. cores 0,2 on node 0 instead of 0,1), see
http://www.gossamer-threads.com/lists/linux/kernel/1160268.

This patch fixes out-of-bounds writes (see URL above) by making sure we
stay inbounds by using an msr->cpu field which tells us which CPU's MSRs
to access.

Additionally, this patch adds msrs_{alloc,free} helpers for preparing
the array of MSRs to access.

Cc: H. Peter Anvin <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Aristeu Rozanski <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Doug Thompson <[email protected]>
Not-yet-signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/msr.h | 11 ++++++
arch/x86/lib/msr.c | 81 +++++++++++++++++++++++++++++++++++++------
drivers/edac/amd64_edac.c | 17 ++++-----
3 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 9a00219..4f528f8 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -18,6 +18,7 @@
#include <asm/cpumask.h>

struct msr {
+ int cpu;
union {
struct {
u32 l;
@@ -247,6 +248,8 @@ do { \
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+struct msr *msrs_alloc(const struct cpumask *mask);
+void msrs_free(struct msr *msrs);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
@@ -264,6 +267,14 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
wrmsr(msr_no, l, h);
return 0;
}
+struct msr *msrs_alloc(const struct cpumask *mask)
+{
+ return kzalloc(sizeof(struct msr), GFP_KERNEL);
+}
+void msrs_free(struct msr *msrs)
+{
+ kfree(msrs);
+}
static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
struct msr *msrs)
{
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 41628b1..e2213de 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -7,20 +7,40 @@ struct msr_info {
u32 msr_no;
struct msr reg;
struct msr *msrs;
- int off;
+ unsigned msrs_len;
int err;
};

+static inline struct msr *scroll_to_cpu(struct msr_info *rv, int this_cpu,
+ const char *caller)
+{
+ int i = 0;
+
+ if (!rv->msrs)
+ return &rv->reg;
+
+ while (i < rv->msrs_len &&
+ rv->msrs[i].cpu != this_cpu)
+ i++;
+
+ if (rv->msrs[i].cpu != this_cpu) {
+ pr_err("%s: called on a wrong cpu (%d vs %d)?\n",
+ caller, rv->msrs[i].cpu, this_cpu);
+ return NULL;
+ }
+
+ return &rv->msrs[i];
+}
+
static void __rdmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
struct msr *reg;
int this_cpu = raw_smp_processor_id();

- if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
- else
- reg = &rv->reg;
+ reg = scroll_to_cpu(rv, this_cpu, __func__);
+ if (!reg)
+ return;

rdmsr(rv->msr_no, reg->l, reg->h);
}
@@ -31,10 +51,9 @@ static void __wrmsr_on_cpu(void *info)
struct msr *reg;
int this_cpu = raw_smp_processor_id();

- if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
- else
- reg = &rv->reg;
+ reg = scroll_to_cpu(rv, this_cpu, __func__);
+ if (!reg)
+ return;

wrmsr(rv->msr_no, reg->l, reg->h);
}
@@ -80,9 +99,9 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,

memset(&rv, 0, sizeof(rv));

- rv.off = cpumask_first(mask);
- rv.msrs = msrs;
- rv.msr_no = msr_no;
+ rv.msr_no = msr_no;
+ rv.msrs = msrs;
+ rv.msrs_len = cpumask_weight(mask);

this_cpu = get_cpu();

@@ -120,6 +139,44 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(wrmsr_on_cpus);

+/*
+ * Allocate enough msr structs for the supplied cpumask. Also, take care of
+ * non-contiguous bitmasks.
+ */
+struct msr *msrs_alloc(const struct cpumask *mask)
+{
+ struct msr *msrs;
+ int i, cpu = -1;
+ unsigned msrs_len;
+
+ if (cpumask_empty(mask)) {
+ pr_warning("%s: empty cpumask!\n", __func__);
+ return NULL;
+ }
+
+ msrs_len = cpumask_weight(mask);
+
+ msrs = kzalloc(sizeof(struct msr) * msrs_len, GFP_KERNEL);
+ if (!msrs) {
+ pr_warning("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
+
+ for (i = 0; i < msrs_len; i++) {
+ cpu = cpumask_next(cpu, mask);
+ msrs[i].cpu = cpu;
+ }
+
+ return msrs;
+}
+EXPORT_SYMBOL(msrs_alloc);
+
+void msrs_free(struct msr *msrs)
+{
+ kfree(msrs);
+}
+EXPORT_SYMBOL(msrs_free);
+
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 533f5ff..784c968 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2507,10 +2507,8 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

get_cpus_on_this_dct_cpumask(mask, nid);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(mask), GFP_KERNEL);
+ msrs = msrs_alloc(mask);
if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
free_cpumask_var(mask);
return false;
}
@@ -2532,7 +2530,7 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
ret = true;

out:
- kfree(msrs);
+ msrs_free(msrs);
free_cpumask_var(mask);
return ret;
}
@@ -2551,17 +2549,16 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)

get_cpus_on_this_dct_cpumask(cmask, pvt->mc_node_id);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(cmask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
+ msrs = msrs_alloc(cmask);
+ if (!msrs)
return -ENOMEM;
- }

rdmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

for_each_cpu(cpu, cmask) {

+ WARN_ON(cpu != msrs[idx].cpu);
+
if (on) {
if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
pvt->flags.ecc_report = 1;
@@ -2578,7 +2575,7 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)
}
wrmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

- kfree(msrs);
+ msrs_free(msrs);
free_cpumask_var(cmask);

return 0;
--
1.6.5.3


--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC) | Registergericht München, HRB Nr. 43632


2009-12-10 07:35:20

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86, msr: add support for non-contiguous cpumasks

Hi Peter,

any nacks/acks on the fix below? Because we really need some version of
it to go in now.

Thanks.

On Mon, Dec 07, 2009 at 01:21:33PM +0100, Borislav Petkov wrote:
> Hi Peter,
>
> can you please take a look at the following patch which fixes a problem
> with non-contiguous cpumasks submitted to rd/wrmsr_on_cpus() (see URL in
> the commit message below) and let me know whether this approach looks
> ok. If yes, I could submit it with my patchqueue along with the rest of
> amd64_edac updates for it'll be very convenient if we could get it in
> now so that it could be backported to .32.
>
> Thanks.
>
> ---
> The current rd/wrmsr_on_cpus helpers assume that the supplied
> cpumasks are contiguous. However, there are machines out there
> like some K8 multinode Opterons which have a non-contiguous core
> enumeration on each node (e.g. cores 0,2 on node 0 instead of 0,1), see
> http://www.gossamer-threads.com/lists/linux/kernel/1160268.
>
> This patch fixes out-of-bounds writes (see URL above) by making sure we
> stay inbounds by using an msr->cpu field which tells us which CPU's MSRs
> to access.
>
> Additionally, this patch adds msrs_{alloc,free} helpers for preparing
> the array of MSRs to access.
>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Mauro Carvalho Chehab <[email protected]>
> Cc: Aristeu Rozanski <[email protected]>
> Cc: Randy Dunlap <[email protected]>
> Cc: Doug Thompson <[email protected]>
> Not-yet-signed-off-by: Borislav Petkov <[email protected]>
> ---
> arch/x86/include/asm/msr.h | 11 ++++++
> arch/x86/lib/msr.c | 81 +++++++++++++++++++++++++++++++++++++------
> drivers/edac/amd64_edac.c | 17 ++++-----
> 3 files changed, 87 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index 9a00219..4f528f8 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -18,6 +18,7 @@
> #include <asm/cpumask.h>
>
> struct msr {
> + int cpu;
> union {
> struct {
> u32 l;
> @@ -247,6 +248,8 @@ do { \
> #ifdef CONFIG_SMP
> int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
> int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
> +struct msr *msrs_alloc(const struct cpumask *mask);
> +void msrs_free(struct msr *msrs);
> void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
> void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
> int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
> @@ -264,6 +267,14 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
> wrmsr(msr_no, l, h);
> return 0;
> }
> +struct msr *msrs_alloc(const struct cpumask *mask)
> +{
> + return kzalloc(sizeof(struct msr), GFP_KERNEL);
> +}
> +void msrs_free(struct msr *msrs)
> +{
> + kfree(msrs);
> +}
> static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
> struct msr *msrs)
> {
> diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
> index 41628b1..e2213de 100644
> --- a/arch/x86/lib/msr.c
> +++ b/arch/x86/lib/msr.c
> @@ -7,20 +7,40 @@ struct msr_info {
> u32 msr_no;
> struct msr reg;
> struct msr *msrs;
> - int off;
> + unsigned msrs_len;
> int err;
> };
>
> +static inline struct msr *scroll_to_cpu(struct msr_info *rv, int this_cpu,
> + const char *caller)
> +{
> + int i = 0;
> +
> + if (!rv->msrs)
> + return &rv->reg;
> +
> + while (i < rv->msrs_len &&
> + rv->msrs[i].cpu != this_cpu)
> + i++;
> +
> + if (rv->msrs[i].cpu != this_cpu) {
> + pr_err("%s: called on a wrong cpu (%d vs %d)?\n",
> + caller, rv->msrs[i].cpu, this_cpu);
> + return NULL;
> + }
> +
> + return &rv->msrs[i];
> +}
> +
> static void __rdmsr_on_cpu(void *info)
> {
> struct msr_info *rv = info;
> struct msr *reg;
> int this_cpu = raw_smp_processor_id();
>
> - if (rv->msrs)
> - reg = &rv->msrs[this_cpu - rv->off];
> - else
> - reg = &rv->reg;
> + reg = scroll_to_cpu(rv, this_cpu, __func__);
> + if (!reg)
> + return;
>
> rdmsr(rv->msr_no, reg->l, reg->h);
> }
> @@ -31,10 +51,9 @@ static void __wrmsr_on_cpu(void *info)
> struct msr *reg;
> int this_cpu = raw_smp_processor_id();
>
> - if (rv->msrs)
> - reg = &rv->msrs[this_cpu - rv->off];
> - else
> - reg = &rv->reg;
> + reg = scroll_to_cpu(rv, this_cpu, __func__);
> + if (!reg)
> + return;
>
> wrmsr(rv->msr_no, reg->l, reg->h);
> }
> @@ -80,9 +99,9 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
>
> memset(&rv, 0, sizeof(rv));
>
> - rv.off = cpumask_first(mask);
> - rv.msrs = msrs;
> - rv.msr_no = msr_no;
> + rv.msr_no = msr_no;
> + rv.msrs = msrs;
> + rv.msrs_len = cpumask_weight(mask);
>
> this_cpu = get_cpu();
>
> @@ -120,6 +139,44 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
> }
> EXPORT_SYMBOL(wrmsr_on_cpus);
>
> +/*
> + * Allocate enough msr structs for the supplied cpumask. Also, take care of
> + * non-contiguous bitmasks.
> + */
> +struct msr *msrs_alloc(const struct cpumask *mask)
> +{
> + struct msr *msrs;
> + int i, cpu = -1;
> + unsigned msrs_len;
> +
> + if (cpumask_empty(mask)) {
> + pr_warning("%s: empty cpumask!\n", __func__);
> + return NULL;
> + }
> +
> + msrs_len = cpumask_weight(mask);
> +
> + msrs = kzalloc(sizeof(struct msr) * msrs_len, GFP_KERNEL);
> + if (!msrs) {
> + pr_warning("%s: error allocating msrs\n", __func__);
> + return NULL;
> + }
> +
> + for (i = 0; i < msrs_len; i++) {
> + cpu = cpumask_next(cpu, mask);
> + msrs[i].cpu = cpu;
> + }
> +
> + return msrs;
> +}
> +EXPORT_SYMBOL(msrs_alloc);
> +
> +void msrs_free(struct msr *msrs)
> +{
> + kfree(msrs);
> +}
> +EXPORT_SYMBOL(msrs_free);
> +
> /* These "safe" variants are slower and should be used when the target MSR
> may not actually exist. */
> static void __rdmsr_safe_on_cpu(void *info)
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 533f5ff..784c968 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -2507,10 +2507,8 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
>
> get_cpus_on_this_dct_cpumask(mask, nid);
>
> - msrs = kzalloc(sizeof(struct msr) * cpumask_weight(mask), GFP_KERNEL);
> + msrs = msrs_alloc(mask);
> if (!msrs) {
> - amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> - __func__);
> free_cpumask_var(mask);
> return false;
> }
> @@ -2532,7 +2530,7 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)
> ret = true;
>
> out:
> - kfree(msrs);
> + msrs_free(msrs);
> free_cpumask_var(mask);
> return ret;
> }
> @@ -2551,17 +2549,16 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)
>
> get_cpus_on_this_dct_cpumask(cmask, pvt->mc_node_id);
>
> - msrs = kzalloc(sizeof(struct msr) * cpumask_weight(cmask), GFP_KERNEL);
> - if (!msrs) {
> - amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
> - __func__);
> + msrs = msrs_alloc(cmask);
> + if (!msrs)
> return -ENOMEM;
> - }
>
> rdmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);
>
> for_each_cpu(cpu, cmask) {
>
> + WARN_ON(cpu != msrs[idx].cpu);
> +
> if (on) {
> if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
> pvt->flags.ecc_report = 1;
> @@ -2578,7 +2575,7 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)
> }
> wrmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);
>
> - kfree(msrs);
> + msrs_free(msrs);
> free_cpumask_var(cmask);
>
> return 0;
> --
> 1.6.5.3
>
>
> --
> Regards/Gruss,
> Boris.
>
> Operating | Advanced Micro Devices GmbH
> System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
> Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
> Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
> (OSRC) | Registergericht München, HRB Nr. 43632
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Regards/Gruss,
Boris.

2009-12-10 07:39:48

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86, msr: add support for non-contiguous cpumasks

On 12/09/2009 11:35 PM, Borislav Petkov wrote:
> Hi Peter,
>
> any nacks/acks on the fix below? Because we really need some version of
> it to go in now.
>
> Thanks.
>

Looks OK at a glance, but reviewing it is on tomorrow's list... too late
now, I'm sorry.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2009-12-11 01:50:09

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86, msr: add support for non-contiguous cpumasks

On 12/07/2009 04:21 AM, Borislav Petkov wrote:
>
> struct msr {
> + int cpu;
> union {
> struct {
> u32 l;

I really don't like this patch, for multiple reasons. One of them is
the above: this has no business being part of struct msr, which reflects
an MSR value (and ideally should replace at least the use of two
pointers in other places over time). Having a CPU field and not an MSR
number field in particular doesn't make any sense.

The second is a linear(!!) search executed on every CPU... at the same
time, over the same data structure.

The ideal probably would be to use a percpu variable. Now, this would
either have to be a dynamic percpu allocation (which would have to be
the responsibility of the caller, and reused, lest this would be a
*very* expensive proposition), or we would have to make these functions
run under a mutex. However, as long as the expected callers of this are
things that get set up once and then pretty much stick around, a percpu
variable might just work.

The slightly less radical version would be to simply require an array
that spans the extremes of the CPU numbers in the mask. Even on a
4096-CPU system, we're only talking about 32K worth of memory here.
This is basically the original solution, just accounting for the fact
that the bitmap may be sparse when allocating it.

The third option would be to at least require that the struct msr
contents are at least serial in the order of the bitmask, not by adding
another field.

-hpa

Subject: Re: [PATCH] x86, msr: add support for non-contiguous cpumasks

On Thu, Dec 10, 2009 at 05:49:02PM -0800, H. Peter Anvin wrote:
> On 12/07/2009 04:21 AM, Borislav Petkov wrote:
> >
> > struct msr {
> > + int cpu;
> > union {
> > struct {
> > u32 l;
>
> I really don't like this patch, for multiple reasons. One of them is
> the above: this has no business being part of struct msr, which reflects
> an MSR value (and ideally should replace at least the use of two
> pointers in other places over time). Having a CPU field and not an MSR
> number field particular doesn't make any sense.

Why, MSRs are per-CPU. My reasoning here is to reflect the value of an
MSR on a particular CPU...

[..]

> The ideal probably would be to use a percpu variable. Now, this would
> either have to be a dynamic percpu allocation (which would have to be
> the responsibility of the caller, and reused, lest this would be a
> *very* expensive proposition), or we would have to make these functions
> run under a mutex. However, as long as the expected callers of this are
> things that get set up once and then pretty much stick around, a percpu
> variable might just work.

I think this would be the cleanest way. Also, best it'll be to allocate
those dynamically only when they're really needed (e.g. on driver
loading) and later reuse them.

[..]

> The third option would be to at least require that the struct msr
> contents are at least serial in the order of the bitmask, not by adding
> another field.

I had that version already done but it seemed half-baked and clumsy for
the MSRs array to traverse. I'll give the percpu variables a shot and
get back to you when I have something ready.

Thanks for reviewing.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC) | Registergericht München, HRB Nr. 43632

Subject: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

Ok, here's a second (and much leaner, removing a bunch of code) version.
Tested on K8 and F10h machines here.

--
From: Borislav Petkov <[email protected]>
Date: Fri Dec 11 18:08:51 CET 2009
Date: Sun, 6 Dec 2009 13:43:43 +0100
Subject: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

The current rd/wrmsr_on_cpus helpers assume that the supplied
cpumasks are contiguous. However, there are machines out there
like some K8 multinode Opterons which have a non-contiguous core
enumeration on each node (e.g. cores 0,2 on node 0 instead of 0,1), see
http://www.gossamer-threads.com/lists/linux/kernel/1160268.

This patch fixes out-of-bounds writes (see URL above) by adding per-CPU
msr structs which are used on the respective cores.

Additionally, two helpers, msrs_{alloc,free}, are provided for use by
the callers of the MSR accessors.

Cc: H. Peter Anvin <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Aristeu Rozanski <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Doug Thompson <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/msr.h | 3 ++
arch/x86/lib/msr.c | 26 +++++++++++++++++++++---
drivers/edac/amd64_edac.c | 46 ++++++++++++++++---------------------------
3 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 5bef931..2d228fc 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -244,6 +244,9 @@ do { \

#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)

+struct msr *msrs_alloc(void);
+void msrs_free(struct msr *msrs);
+
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 41628b1..8728341 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -7,7 +7,6 @@ struct msr_info {
u32 msr_no;
struct msr reg;
struct msr *msrs;
- int off;
int err;
};

@@ -18,7 +17,7 @@ static void __rdmsr_on_cpu(void *info)
int this_cpu = raw_smp_processor_id();

if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;

@@ -32,7 +31,7 @@ static void __wrmsr_on_cpu(void *info)
int this_cpu = raw_smp_processor_id();

if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;

@@ -80,7 +79,6 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,

memset(&rv, 0, sizeof(rv));

- rv.off = cpumask_first(mask);
rv.msrs = msrs;
rv.msr_no = msr_no;

@@ -120,6 +118,26 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(wrmsr_on_cpus);

+struct msr *msrs_alloc(void)
+{
+ struct msr *msrs = NULL;
+
+ msrs = alloc_percpu(struct msr);
+ if (!msrs) {
+ pr_warning("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
+
+ return msrs;
+}
+EXPORT_SYMBOL(msrs_alloc);
+
+void msrs_free(struct msr *msrs)
+{
+ free_percpu(msrs);
+}
+EXPORT_SYMBOL(msrs_free);
+
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5fdd6da..df5b684 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -13,6 +13,8 @@ module_param(report_gart_errors, int, 0644);
static int ecc_enable_override;
module_param(ecc_enable_override, int, 0644);

+static struct msr *msrs;
+
/* Lookup table for all possible MC control instances */
struct amd64_pvt;
static struct mem_ctl_info *mci_lookup[EDAC_MAX_NUMNODES];
@@ -2495,8 +2497,7 @@ static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, int nid)
static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
cpumask_var_t mask;
- struct msr *msrs;
- int cpu, nbe, idx = 0;
+ int cpu, nbe;
bool ret = false;

if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -2507,32 +2508,22 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

get_cpus_on_this_dct_cpumask(mask, nid);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(mask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
- free_cpumask_var(mask);
- return false;
- }
-
rdmsr_on_cpus(mask, MSR_IA32_MCG_CTL, msrs);

for_each_cpu(cpu, mask) {
- nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE;
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ nbe = reg->l & K8_MSR_MCGCTL_NBE;

debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
- cpu, msrs[idx].q,
+ cpu, reg->q,
(nbe ? "enabled" : "disabled"));

if (!nbe)
goto out;
-
- idx++;
}
ret = true;

out:
- kfree(msrs);
free_cpumask_var(mask);
return ret;
}
@@ -2540,8 +2531,7 @@ out:
static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)
{
cpumask_var_t cmask;
- struct msr *msrs = NULL;
- int cpu, idx = 0;
+ int cpu;

if (!zalloc_cpumask_var(&cmask, GFP_KERNEL)) {
amd64_printk(KERN_WARNING, "%s: error allocating mask\n",
@@ -2551,34 +2541,27 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)

get_cpus_on_this_dct_cpumask(cmask, pvt->mc_node_id);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(cmask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
- return -ENOMEM;
- }
-
rdmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

for_each_cpu(cpu, cmask) {

+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+
if (on) {
- if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
+ if (reg->l & K8_MSR_MCGCTL_NBE)
pvt->flags.ecc_report = 1;

- msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+ reg->l |= K8_MSR_MCGCTL_NBE;
} else {
/*
* Turn off ECC reporting only when it was off before
*/
if (!pvt->flags.ecc_report)
- msrs[idx].l &= ~K8_MSR_MCGCTL_NBE;
+ reg->l &= ~K8_MSR_MCGCTL_NBE;
}
- idx++;
}
wrmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

- kfree(msrs);
free_cpumask_var(cmask);

return 0;
@@ -3036,6 +3019,8 @@ static int __init amd64_edac_init(void)
if (cache_k8_northbridges() < 0)
return err;

+ msrs = msrs_alloc();
+
err = pci_register_driver(&amd64_pci_driver);
if (err)
return err;
@@ -3071,6 +3056,9 @@ static void __exit amd64_edac_exit(void)
edac_pci_release_generic_ctl(amd64_ctl_pci);

pci_unregister_driver(&amd64_pci_driver);
+
+ msrs_free(msrs);
+ msrs = NULL;
}

module_init(amd64_edac_init);
--
1.6.5.4


--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC) | Registergericht München, HRB Nr. 43632

2009-12-11 18:03:01

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

On 12/11/2009 09:14 AM, Borislav Petkov wrote:
> Ok, here's a second (and much leaner, removing a bunch of code) version.
> Tested on K8 and F10h machines here.

Much better, thanks!

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

Subject: Re: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

On Fri, Dec 11, 2009 at 10:01:38AM -0800, H. Peter Anvin wrote:
> On 12/11/2009 09:14 AM, Borislav Petkov wrote:
> > Ok, here's a second (and much leaner, removing a bunch of code) version.
> > Tested on K8 and F10h machines here.
>
> Much better, thanks!

Ok, cool. Should I send it to Linus or would you like to do it?

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

2009-12-11 18:46:42

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

On 12/11/2009 10:36 AM, Borislav Petkov wrote:
> On Fri, Dec 11, 2009 at 10:01:38AM -0800, H. Peter Anvin wrote:
>> On 12/11/2009 09:14 AM, Borislav Petkov wrote:
>>> Ok, here's a second (and much leaner, removing a bunch of code) version.
>>> Tested on K8 and F10h machines here.
>>
>> Much better, thanks!
>
> Ok, cool. Should I send it to Linus or would you like to do it?
>

I am just in the process of putting it into -tip, but if you prefer to
send it directly you can have my

Acked-by: H. Peter Anvin <[email protected]>

Whichever way is easier for you is fine with me.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2009-12-11 18:59:59

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

On 12/11/2009 10:36 AM, Borislav Petkov wrote:
> On Fri, Dec 11, 2009 at 10:01:38AM -0800, H. Peter Anvin wrote:
>> On 12/11/2009 09:14 AM, Borislav Petkov wrote:
>>> Ok, here's a second (and much leaner, removing a bunch of code) version.
>>> Tested on K8 and F10h machines here.
>>
>> Much better, thanks!
>
> Ok, cool. Should I send it to Linus or would you like to do it?
>

I'm just going to put this in the queue for the next drop to Linus.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

Subject: Re: [PATCH v2] x86, msr: add support for non-contiguous cpumasks

On Fri, Dec 11, 2009 at 10:45:37AM -0800, H. Peter Anvin wrote:
> On 12/11/2009 10:36 AM, Borislav Petkov wrote:
> > On Fri, Dec 11, 2009 at 10:01:38AM -0800, H. Peter Anvin wrote:
> >> On 12/11/2009 09:14 AM, Borislav Petkov wrote:
> >>> Ok, here's a second (and much leaner, removing a bunch of code) version.
> >>> Tested on K8 and F10h machines here.
> >>
> >> Much better, thanks!
> >
> > Ok, cool. Should I send it to Linus or would you like to do it?
> >
>
> I am just in the process of putting it into -tip, but if you prefer to
> send it directly you can have my
>
> Acked-by: H. Peter Anvin <[email protected]>
>
> Whichever way is easier for you is fine with me.

It doesn't matter to me either. It touches x86-related code, so it makes
sense to go through you; besides, you've already started it :).

My only concern is that it should go in now since it got caught a couple
of days before 2.6.32 got released. A proper fix wouldn't have made it
in at that late in the release cycle (-rc8), therefore, I'll be at least
able to backport it now as soon as it hits mainline so that at least
2.6.32.2 has the proper fix.

Thanks.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. München, Germany
Research | Geschäftsführer: Andrew Bowd, Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis München
(OSRC) | Registergericht München, HRB Nr. 43632

Subject: [tip:x86/urgent] x86, msr: Add support for non-contiguous cpumasks

Commit-ID: 505422517d3f126bb939439e9d15dece94e11d2c
Gitweb: http://git.kernel.org/tip/505422517d3f126bb939439e9d15dece94e11d2c
Author: Borislav Petkov <[email protected]>
AuthorDate: Fri, 11 Dec 2009 18:14:40 +0100
Committer: H. Peter Anvin <[email protected]>
CommitDate: Fri, 11 Dec 2009 10:59:21 -0800

x86, msr: Add support for non-contiguous cpumasks

The current rd/wrmsr_on_cpus helpers assume that the supplied
cpumasks are contiguous. However, there are machines out there
like some K8 multinode Opterons which have a non-contiguous core
enumeration on each node (e.g. cores 0,2 on node 0 instead of 0,1), see
http://www.gossamer-threads.com/lists/linux/kernel/1160268.

This patch fixes out-of-bounds writes (see URL above) by adding per-CPU
msr structs which are used on the respective cores.

Additionally, two helpers, msrs_{alloc,free}, are provided for use by
the callers of the MSR accessors.

Cc: H. Peter Anvin <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Aristeu Rozanski <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Doug Thompson <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
LKML-Reference: <20091211171440.GD31998@aftab>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/include/asm/msr.h | 3 ++
arch/x86/lib/msr.c | 26 +++++++++++++++++++++---
drivers/edac/amd64_edac.c | 46 ++++++++++++++++---------------------------
3 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 5bef931..2d228fc 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -244,6 +244,9 @@ do { \

#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)

+struct msr *msrs_alloc(void);
+void msrs_free(struct msr *msrs);
+
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 41628b1..8728341 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -7,7 +7,6 @@ struct msr_info {
u32 msr_no;
struct msr reg;
struct msr *msrs;
- int off;
int err;
};

@@ -18,7 +17,7 @@ static void __rdmsr_on_cpu(void *info)
int this_cpu = raw_smp_processor_id();

if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;

@@ -32,7 +31,7 @@ static void __wrmsr_on_cpu(void *info)
int this_cpu = raw_smp_processor_id();

if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
else
reg = &rv->reg;

@@ -80,7 +79,6 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,

memset(&rv, 0, sizeof(rv));

- rv.off = cpumask_first(mask);
rv.msrs = msrs;
rv.msr_no = msr_no;

@@ -120,6 +118,26 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(wrmsr_on_cpus);

+struct msr *msrs_alloc(void)
+{
+ struct msr *msrs = NULL;
+
+ msrs = alloc_percpu(struct msr);
+ if (!msrs) {
+ pr_warning("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
+
+ return msrs;
+}
+EXPORT_SYMBOL(msrs_alloc);
+
+void msrs_free(struct msr *msrs)
+{
+ free_percpu(msrs);
+}
+EXPORT_SYMBOL(msrs_free);
+
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5fdd6da..df5b684 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -13,6 +13,8 @@ module_param(report_gart_errors, int, 0644);
static int ecc_enable_override;
module_param(ecc_enable_override, int, 0644);

+static struct msr *msrs;
+
/* Lookup table for all possible MC control instances */
struct amd64_pvt;
static struct mem_ctl_info *mci_lookup[EDAC_MAX_NUMNODES];
@@ -2495,8 +2497,7 @@ static void get_cpus_on_this_dct_cpumask(struct cpumask *mask, int nid)
static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
cpumask_var_t mask;
- struct msr *msrs;
- int cpu, nbe, idx = 0;
+ int cpu, nbe;
bool ret = false;

if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -2507,32 +2508,22 @@ static bool amd64_nb_mce_bank_enabled_on_node(int nid)

get_cpus_on_this_dct_cpumask(mask, nid);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(mask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
- free_cpumask_var(mask);
- return false;
- }
-
rdmsr_on_cpus(mask, MSR_IA32_MCG_CTL, msrs);

for_each_cpu(cpu, mask) {
- nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE;
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ nbe = reg->l & K8_MSR_MCGCTL_NBE;

debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
- cpu, msrs[idx].q,
+ cpu, reg->q,
(nbe ? "enabled" : "disabled"));

if (!nbe)
goto out;
-
- idx++;
}
ret = true;

out:
- kfree(msrs);
free_cpumask_var(mask);
return ret;
}
@@ -2540,8 +2531,7 @@ out:
static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)
{
cpumask_var_t cmask;
- struct msr *msrs = NULL;
- int cpu, idx = 0;
+ int cpu;

if (!zalloc_cpumask_var(&cmask, GFP_KERNEL)) {
amd64_printk(KERN_WARNING, "%s: error allocating mask\n",
@@ -2551,34 +2541,27 @@ static int amd64_toggle_ecc_err_reporting(struct amd64_pvt *pvt, bool on)

get_cpus_on_this_dct_cpumask(cmask, pvt->mc_node_id);

- msrs = kzalloc(sizeof(struct msr) * cpumask_weight(cmask), GFP_KERNEL);
- if (!msrs) {
- amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
- __func__);
- return -ENOMEM;
- }
-
rdmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

for_each_cpu(cpu, cmask) {

+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+
if (on) {
- if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
+ if (reg->l & K8_MSR_MCGCTL_NBE)
pvt->flags.ecc_report = 1;

- msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+ reg->l |= K8_MSR_MCGCTL_NBE;
} else {
/*
* Turn off ECC reporting only when it was off before
*/
if (!pvt->flags.ecc_report)
- msrs[idx].l &= ~K8_MSR_MCGCTL_NBE;
+ reg->l &= ~K8_MSR_MCGCTL_NBE;
}
- idx++;
}
wrmsr_on_cpus(cmask, MSR_IA32_MCG_CTL, msrs);

- kfree(msrs);
free_cpumask_var(cmask);

return 0;
@@ -3036,6 +3019,8 @@ static int __init amd64_edac_init(void)
if (cache_k8_northbridges() < 0)
return err;

+ msrs = msrs_alloc();
+
err = pci_register_driver(&amd64_pci_driver);
if (err)
return err;
@@ -3071,6 +3056,9 @@ static void __exit amd64_edac_exit(void)
edac_pci_release_generic_ctl(amd64_ctl_pci);

pci_unregister_driver(&amd64_pci_driver);
+
+ msrs_free(msrs);
+ msrs = NULL;
}

module_init(amd64_edac_init);