2009-04-22 09:11:38

by Huang, Ying

[permalink] [raw]
Subject: Re-implement MCE log ring buffer as per-CPU ring buffer

Re-implement MCE log ring buffer as per-CPU ring buffer for better
scalability. Basic design is as follow:

- One ring buffer for each CPU

+ MCEs are added to corresponding local per-CPU buffer, instead of
one big global buffer. Contention/unfairness between CPUs is
eleminated.

+ MCE records are read out and removed from per-CPU buffers by mutex
protected global reader function. Because there are no many
readers in system to contend in most cases.

- Per-CPU ring buffer data structure

+ An array is used to hold MCE records. integer "head" indicates
next writing position and integer "tail" indicates next reading
position.

+ To distinguish buffer empty and full, head and tail wrap to 0 at
MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
position is head % MCE_LOG_LEN, and real next reading position is
tail % MCE_LOG_LEN. If buffer is empty, head == tail, if buffer is
full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head != tail.

- Lock-less for writer side

+ MCE log writer may come from NMI, so the writer side must be
lock-less. For per-CPU buffer of one CPU, writers may come from
process, IRQ or NMI context, so "head" is increased with
cmpxchg_local() to allocate buffer space.

+ Reader side is protected with a mutex to guarantee only one reader
is active in the whole system.


Performance test show that the throughput of per-CPU mcelog buffer can
reach 430k records/s compared with 5.3k records/s for original
implementation on a 2-core 2.1GHz Core2 machine.


Signed-off-by: Huang Ying <[email protected]>
CC: Andi Kleen <[email protected]>

---
arch/x86/include/asm/mce.h | 13 +
arch/x86/kernel/cpu/mcheck/mce_64.c | 280 +++++++++++++++++++++++-------------
2 files changed, 192 insertions(+), 101 deletions(-)

--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -50,15 +50,22 @@ struct mce {
* is set.
*/

-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+#define MCE_LOG_LIMIT (MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+ return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
+
+struct mce_log_cpu;

struct mce_log {
char signature[12]; /* "MACHINECHECK" */
unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
unsigned flags;
unsigned pad0;
- struct mce entry[MCE_LOG_LEN];
+ struct mce_log_cpu **log_cpus;
};

#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -38,6 +38,9 @@

#define MISC_MCELOG_MINOR 227

+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_CYC 100000
+
atomic_t mce_entry;

static int mce_dont_init;
@@ -86,38 +89,44 @@ static struct mce_log mcelog = {
MCE_LOG_LEN,
};

+struct mce_log_cpu {
+ int head;
+ int tail;
+ unsigned long flags;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+static DEFINE_PER_CPU(struct mce_log_cpu, mcelog_cpus);
+
void mce_log(struct mce *mce)
{
- unsigned next, entry;
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mcelog_cpus);
+ int head, ihead, tail, next;
+
atomic_inc(&mce_events);
mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /* When the buffer fills up discard new entries. Assume
- that the earlier errors are the more interesting. */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip. */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
+ /* mce->finished must be set to 0 before written to ring
+ * buffer */
+ smp_wmb();
+ do {
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+ ihead = mce_log_index(head);
+ /* When the buffer fills up discard new entries. Assume
+ that the earlier errors are the more interesting. */
+ if (ihead == mce_log_index(tail) && head != tail) {
+ set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
+ return;
}
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
+ next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+ } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
+ memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
+ /* ".finished" of MCE record in ring buffer must be set after
+ * copy */
+ smp_wmb();
+ mcelog_cpu->entry[ihead].finished = 1;
+ /* bit 0 of notify_user should be set after finished be set */
+ smp_wmb();
set_bit(0, &notify_user);
}

@@ -147,21 +156,37 @@ static void print_mce(struct mce *m)
"and contact your hardware vendor\n");
}

-static void mce_panic(char *msg, struct mce *backup, unsigned long start)
+static int mce_print_cpu(int cpu, struct mce *backup, unsigned long start)
{
int i;
+ struct mce_log_cpu *mcelog_cpu;

- oops_begin();
+ mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
for (i = 0; i < MCE_LOG_LEN; i++) {
- unsigned long tsc = mcelog.entry[i].tsc;
+ unsigned long tsc = mcelog_cpu->entry[i].tsc;

if (time_before(tsc, start))
continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
+ print_mce(&mcelog_cpu->entry[i]);
+ if (backup && mcelog_cpu->entry[i].tsc == backup->tsc)
backup = NULL;
}
- if (backup)
+ return backup == NULL;
+}
+
+static void mce_panic(char *msg, struct mce *backup, u64 start)
+{
+ int cpu, cpu_self;
+
+ oops_begin();
+ cpu_self = smp_processor_id();
+ for_each_online_cpu(cpu) {
+ if (cpu == cpu_self)
+ continue;
+ if (mce_print_cpu(cpu, backup, start))
+ backup = NULL;
+ }
+ if (!mce_print_cpu(cpu_self, backup, start))
print_mce(backup);
panic(msg);
}
@@ -574,6 +599,24 @@ static int mce_cap_init(void)
return 0;
}

+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static void mce_log_init(void)
+{
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ if (mcelog.log_cpus)
+ return;
+ mcelog.log_cpus = kmalloc(num_possible_cpus() * sizeof(mcelog_cpu),
+ GFP_KERNEL);
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
+ mcelog.log_cpus[cpu] = mcelog_cpu;
+ }
+}
+
static void mce_init(void *dummy)
{
u64 cap;
@@ -656,6 +699,7 @@ void __cpuinit mcheck_init(struct cpuinf
mce_dont_init = 1;
return;
}
+ mce_log_init();
mce_cpu_quirks(c);

mce_init(NULL);
@@ -704,90 +748,116 @@ static int mce_release(struct inode *ino
return 0;
}

-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
+ char __user *inubuf, size_t usize)
{
- unsigned long *cpu_tsc;
- static DEFINE_MUTEX(mce_read_mutex);
- unsigned prev, next;
- char __user *buf = ubuf;
- int i, err;
+ char __user *ubuf = inubuf;
+ int head, tail, pos, i, err = 0;

- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;

- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return -EINVAL;
- }
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
+ if (head == tail)
+ return 0;

- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
+ for (pos = tail; pos != head && usize >= sizeof(struct mce);
+ pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+ i = mce_log_index(pos);
+ if (!mcelog_cpu->entry[i].finished) {
+ u64 start, now;
+ rdtscll(start);
+ while (!mcelog_cpu->entry[i].finished) {
+ rdtscll(now);
+ if (now - start > WRITER_TIMEOUT_CYC) {
+ memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
+ head = mcelog_cpu->head;
goto timeout;
}
cpu_relax();
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
}
+ /* finished field should be checked before
+ * copy_to_user() */
+ smp_rmb();
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+ sizeof(struct mce));
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ timeout:
+ ;
+ }
+ /* mcelog_cpu->tail must be updated after corresponding MCE
+ * records are copied out */
+ smp_wmb();
+ mcelog_cpu->tail = pos;

- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
+ return err ? -EFAULT : ubuf - inubuf;
+}

- synchronize_sched();
+static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
+{
+ int head, tail;

- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+
+ return head == tail;
+}
+
+static int mce_empty(void)
+{
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
+ if (!mce_empty_cpu(mcelog_cpu))
+ return 0;
}
+ return 1;
+}
+
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *ubuf = inubuf;
+ struct mce_log_cpu *mcelog_cpu;
+ int cpu, new_mce, err = 0;
+ static DEFINE_MUTEX(mce_read_mutex);
+ size_t usize_limit;
+
+ /* Too large user buffer size may cause system not response */
+ usize_limit = num_possible_cpus() * MCE_LOG_LEN * sizeof(struct mce);
+ if (usize > usize_limit)
+ usize = usize_limit;
+ mutex_lock(&mce_read_mutex);
+ do {
+ new_mce = 0;
+ for_each_possible_cpu(cpu) {
+ if (usize < sizeof(struct mce))
+ goto out;
+ mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
+ err = mce_read_cpu(mcelog_cpu, ubuf,
+ sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ new_mce = 1;
+ err = 0;
+ } else if (err < 0)
+ goto out;
+ }
+ } while (new_mce || !mce_empty());
+out:
mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return err ? -EFAULT : buf - ubuf;
+ return err ? : ubuf - inubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (!mce_empty())
return POLLIN | POLLRDNORM;
return 0;
}
@@ -982,12 +1052,26 @@ static ssize_t set_trigger(struct sys_de
return len;
}

+static ssize_t show_log_flags(struct sys_device *s,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ int cpu = s->id;
+ struct mce_log_cpu *mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
+ unsigned flags;
+ do {
+ flags = mcelog_cpu->flags;
+ } while (cmpxchg((unsigned *)&mcelog_cpu->flags, flags, 0) != flags);
+ return sprintf(buf, "0x%x\n", flags);
+}
+
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
+static SYSDEV_ATTR(log_flags, 0644, show_log_flags, NULL);
static struct sysdev_attribute *mce_attributes[] = {
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
- NULL
+ &attr_log_flags, NULL
};

static cpumask_var_t mce_device_initialized;


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-22 09:23:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


* Huang Ying <[email protected]> wrote:

> Re-implement MCE log ring buffer as per-CPU ring buffer for better
> scalability. Basic design is as follow:

Before changing anything substantial in the MCE code it would be
necessary to clean up and then unify the 32-bit and 64-bit side of
the MCE code first. (Which essentially means extending the
64-bit-only code to 32-bit)

I've put a few cleanup patches into tip:x86/mce2 (warning: there are
a few broken ones at the tail - so consider it WIP), to help make
this happen:

dd98699: x86, mce: print number of MCE banks
ef319a8: x86, mce: unify
be0f336: x86, mce: unify, prepare for 32-bit
90c99b7: x86, mce: prepare unification
1e1d4f8: x86, mce: prepare mce.h and mce_64.c for unification
fd6b13f: x86, mce: clean up mce_64.c
f7f8e03: x86, mce: clean up mce_intel.c
b19c8ea: x86, mce: unify the Intel thermal interrupt code
6eade5b: mce: unify Intel thermal init, prepare
d393769: x86, mce: clean up mce.h
7bb2efa: x86, mce: clean up winchip.c
3b94f2b: x86, mce: clean up p5.c
13a219b: x86, mce: clean up mce_32.c
e69e307: x86, mce: clean up non-fatal.c
3b9a5bc: x86, mce: clean up k7.c
79e1d29: x86, mce: clean up p6.c
c5fac5b: x86, mce: clean up therm_throt.c
788a16a: x86, mce: clean up p4.c
07d7726: x86, mce: clean up mce_amd_64.c
a6a6958: x86, mce: clean up the sysfs variables namespace
895f799: x86, mce: clean up the mce_64.c code

these are the more questionable ones in need of restructuring:

dd98699: x86, mce: print number of MCE banks
ef319a8: x86, mce: unify
be0f336: x86, mce: unify, prepare for 32-bit
90c99b7: x86, mce: prepare unification
1e1d4f8: x86, mce: prepare mce.h and mce_64.c for unification
fd6b13f: x86, mce: clean up mce_64.c
f7f8e03: x86, mce: clean up mce_intel.c
b19c8ea: x86, mce: unify the Intel thermal interrupt code
6eade5b: mce: unify Intel thermal init, prepare

... but you get the idea.

Ingo

Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On 22.04.09 11:22:59, Ingo Molnar wrote:
>
> * Huang Ying <[email protected]> wrote:
>
> > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > scalability. Basic design is as follow:
>
> Before changing anything substantial in the MCE code it would be
> necessary to clean up and then unify the 32-bit and 64-bit side of
> the MCE code first. (Which essentially means extending the
> 64-bit-only code to 32-bit)

You may also want to consider to use the in-kernel ring_buffer api
(include/linux/ring_buffer.h).

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]

2009-04-22 10:19:49

by Ingo Molnar

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


* Robert Richter <[email protected]> wrote:

> On 22.04.09 11:22:59, Ingo Molnar wrote:
> >
> > * Huang Ying <[email protected]> wrote:
> >
> > > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > > scalability. Basic design is as follow:
> >
> > Before changing anything substantial in the MCE code it would be
> > necessary to clean up and then unify the 32-bit and 64-bit side
> > of the MCE code first. (Which essentially means extending the
> > 64-bit-only code to 32-bit)
>
> You may also want to consider to use the in-kernel ring_buffer api
> (include/linux/ring_buffer.h).

Yeah. I'd have suggested that once cleanups and unification is done
(which is still a long way out :-).

Ingo

2009-04-22 11:12:29

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Huang Ying wrote:

Basic concept is good and it fixes some long standing problems.

But some details need to be improved I think:

> +/*
> + * Initialize MCE per-CPU log buffer
> + */
> +static void mce_log_init(void)
> +{
> + int cpu;
> + struct mce_log_cpu *mcelog_cpu;
> +
> + if (mcelog.log_cpus)
> + return;
> + mcelog.log_cpus = kmalloc(num_possible_cpus() * sizeof(mcelog_cpu),
> + GFP_KERNEL);
> + for_each_possible_cpu(cpu) {
> + mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
> + mcelog.log_cpus[cpu] = mcelog_cpu;
> + }

I don't understand why the separate allocation. Can't you just put
the whole log buffer directly into the per cpu data?

This would also make the initialization cleaner because you would need to only
initialize state for the currently initialized CPU.


> + while (!mcelog_cpu->entry[i].finished) {
> + rdtscll(now);
> + if (now - start > WRITER_TIMEOUT_CYC) {
> + memset(mcelog_cpu->entry + i, 0,
> sizeof(struct mce));
> + head = mcelog_cpu->head;
> goto timeout;

This timeout should be reported somehow, perhaps with a printk.
Also it's problematic to hardcode cycles here, i think it would
be better to use a similar scheme as the Monarch timeout
with a ndelay() and a spinunit. That is guaranteed to stay
the same even as CPU frequencies change.

> +static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> + loff_t *off)
> +{
> + char __user *ubuf = inubuf;
> + struct mce_log_cpu *mcelog_cpu;
> + int cpu, new_mce, err = 0;
> + static DEFINE_MUTEX(mce_read_mutex);
> + size_t usize_limit;
> +
> + /* Too large user buffer size may cause system not response */

Did you understand why that happens? It's worrying.

> +static ssize_t show_log_flags(struct sys_device *s,
> + struct sysdev_attribute *attr,
> + char *buf)
> +{
> + int cpu = s->id;
> + struct mce_log_cpu *mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
> + unsigned flags;
> + do {
> + flags = mcelog_cpu->flags;
> + } while (cmpxchg((unsigned *)&mcelog_cpu->flags, flags, 0) != flags);
> + return sprintf(buf, "0x%x\n", flags);
> +}
> +
> static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
> static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
> ACCESSOR(check_interval,check_interval,mce_restart())
> +static SYSDEV_ATTR(log_flags, 0644, show_log_flags, NULL);

Do we really need that interface? It seems rather obscure.
mcelog should get these flags anyways. Better to drop it.

-Andi

2009-04-22 11:20:55

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Ingo Molnar wrote:
> * Huang Ying <[email protected]> wrote:
>
>> Re-implement MCE log ring buffer as per-CPU ring buffer for better
>> scalability. Basic design is as follow:
>
> Before changing anything substantial in the MCE code it would be
> necessary to clean up and then unify the 32-bit and 64-bit side of
> the MCE code first. (Which essentially means extending the
> 64-bit-only code to 32-bit)

I don't disagree. In fact my original patchkit had this first in 6-7 patches,
but I dropped the 32bit unification because you complained about
the interface. It's great that you changed your mind on that.
Anyways if we can go forward that now fine by me.

The big part is to make sure that the hardware workarounds in the
32bit code get applied to 64bit. These are essentially the quirks
of PPro and K7.

Then there's some straight forward work needed to make the 64bit
code 32bit clean (mostly a few unsigned long -> u64)

And then there are the old non MCA handlers for p5 and winchip.
These are best kept separate, there isn't really much code commonality
here because they use completely different registers.
>
> I've put a few cleanup patches into tip:x86/mce2 (warning: there are
> a few broken ones at the tail - so consider it WIP), to help make
> this happen:
>
> dd98699: x86, mce: print number of MCE banks
> ef319a8: x86, mce: unify
> be0f336: x86, mce: unify, prepare for 32-bit
> 90c99b7: x86, mce: prepare unification
> 1e1d4f8: x86, mce: prepare mce.h and mce_64.c for unification
> fd6b13f: x86, mce: clean up mce_64.c
> f7f8e03: x86, mce: clean up mce_intel.c
> b19c8ea: x86, mce: unify the Intel thermal interrupt code
> 6eade5b: mce: unify Intel thermal init, prepare

I don't think there's much to unify here, the 64bit code
can be just used as is on 32bit too (only needs the 64bit
mce_log infrastructure) and the 32bit code be dropped.

Would it be possible to drop these two changesets?

> d393769: x86, mce: clean up mce.h
> 7bb2efa: x86, mce: clean up winchip.c
> 3b94f2b: x86, mce: clean up p5.c
> 13a219b: x86, mce: clean up mce_32.c
> e69e307: x86, mce: clean up non-fatal.c
> 3b9a5bc: x86, mce: clean up k7.c
> 79e1d29: x86, mce: clean up p6.c
> c5fac5b: x86, mce: clean up therm_throt.c
> 788a16a: x86, mce: clean up p4.c
> 07d7726: x86, mce: clean up mce_amd_64.c
> a6a6958: x86, mce: clean up the sysfs variables namespace
> 895f799: x86, mce: clean up the mce_64.c code
>
> these are the more questionable ones in need of restructurin

And then restart here? Add 32bit workarounds, add hook
to initialize p5/winchip, make 64bit code 32bit clean.
That can come straight from my patchkit. I can do that
if there is interest.

-Andi

2009-04-22 11:31:00

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


> You may also want to consider to use the in-kernel ring_buffer api
> (include/linux/ring_buffer.h).

Its semantics don't map very well to the machine check requirements.

I did an analysis of that in http://thread.gmane.org/gmane.linux.kernel/772928/focus=773452
AFAIK the reasons stated there are all still valid.

-Andi

2009-04-22 11:35:36

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Ingo Molnar wrote:
> * Robert Richter <[email protected]> wrote:
>
>> On 22.04.09 11:22:59, Ingo Molnar wrote:
>>> * Huang Ying <[email protected]> wrote:
>>>
>>>> Re-implement MCE log ring buffer as per-CPU ring buffer for better
>>>> scalability. Basic design is as follow:
>>> Before changing anything substantial in the MCE code it would be
>>> necessary to clean up and then unify the 32-bit and 64-bit side
>>> of the MCE code first. (Which essentially means extending the
>>> 64-bit-only code to 32-bit)
>> You may also want to consider to use the in-kernel ring_buffer api
>> (include/linux/ring_buffer.h).
>
> Yeah. I'd have suggested that once cleanups and unification is done
> (which is still a long way out :-).

Hi Ingo,

If you can tell us clearly how to do this I can work on it. e.g.
if the scheme laid out in my earlier mail is acceptable and
I can port these patches to mce2 I mentioned and that can be hopefully
all done quickly.

However I don't agree with you stalling important hardware support
and bug fix code for a long time just because you have something
half baked in tree and I hope you won't pursue such a policy.

-Andi

2009-04-22 15:55:59

by H. Peter Anvin

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Andi Kleen wrote:
>
> And then restart here? Add 32bit workarounds, add hook
> to initialize p5/winchip, make 64bit code 32bit clean.
> That can come straight from my patchkit. I can do that
> if there is interest.
>

Could you come up with a tree we can look at?

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2009-04-22 15:58:41

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

H. Peter Anvin wrote:
> Andi Kleen wrote:
>> And then restart here? Add 32bit workarounds, add hook
>> to initialize p5/winchip, make 64bit code 32bit clean.
>> That can come straight from my patchkit. I can do that
>> if there is interest.
>>
>
> Could you come up with a tree we can look at?

Will do. I'll probably need a day or so.

-Andi

2009-04-24 05:51:22

by Huang, Ying

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Wed, 2009-04-22 at 19:11 +0800, Andi Kleen wrote:
> Huang Ying wrote:
>
> Basic concept is good and it fixes some long standing problems.
>
> But some details need to be improved I think:
>
> > +/*
> > + * Initialize MCE per-CPU log buffer
> > + */
> > +static void mce_log_init(void)
> > +{
> > + int cpu;
> > + struct mce_log_cpu *mcelog_cpu;
> > +
> > + if (mcelog.log_cpus)
> > + return;
> > + mcelog.log_cpus = kmalloc(num_possible_cpus() * sizeof(mcelog_cpu),
> > + GFP_KERNEL);
> > + for_each_possible_cpu(cpu) {
> > + mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
> > + mcelog.log_cpus[cpu] = mcelog_cpu;
> > + }
>
> I don't understand why the separate allocation. Can't you just put
> the whole log buffer directly into the per cpu data?
>
> This would also make the initialization cleaner because you would need to only
> initialize state for the currently initialized CPU.

The original intention to use mcelog.log_cpus is to make it easier to
find per-cpu buffers from memory image produced by kdump. But is seems
that something as follow is just sufficient.

mcelog.log_cpus = alloc_percpu(struct mce_log_cpu);

Is this what you suggested?

>
> > + while (!mcelog_cpu->entry[i].finished) {
> > + rdtscll(now);
> > + if (now - start > WRITER_TIMEOUT_CYC) {
> > + memset(mcelog_cpu->entry + i, 0,
> > sizeof(struct mce));
> > + head = mcelog_cpu->head;
> > goto timeout;
>
> This timeout should be reported somehow, perhaps with a printk.
> Also it's problematic to hardcode cycles here, i think it would
> be better to use a similar scheme as the Monarch timeout
> with a ndelay() and a spinunit. That is guaranteed to stay
> the same even as CPU frequencies change.

Yes. I will change this.

> > +static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> > + loff_t *off)
> > +{
> > + char __user *ubuf = inubuf;
> > + struct mce_log_cpu *mcelog_cpu;
> > + int cpu, new_mce, err = 0;
> > + static DEFINE_MUTEX(mce_read_mutex);
> > + size_t usize_limit;
> > +
> > + /* Too large user buffer size may cause system not response */
>
> Did you understand why that happens? It's worrying.

The loop in mce_read() does not release the CPU in non-preempt kernel.
If the user buffer size is too large, it may cause the CPU to be
soft-lock-upped or not response. One way to solve the issue is to
constrain the size of user buffer, another way may be insert a
schedule() in the loop.

> > +static ssize_t show_log_flags(struct sys_device *s,
> > + struct sysdev_attribute *attr,
> > + char *buf)
> > +{
> > + int cpu = s->id;
> > + struct mce_log_cpu *mcelog_cpu = &per_cpu(mcelog_cpus, cpu);
> > + unsigned flags;
> > + do {
> > + flags = mcelog_cpu->flags;
> > + } while (cmpxchg((unsigned *)&mcelog_cpu->flags, flags, 0) != flags);
> > + return sprintf(buf, "0x%x\n", flags);
> > +}
> > +
> > static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
> > static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
> > ACCESSOR(check_interval,check_interval,mce_restart())
> > +static SYSDEV_ATTR(log_flags, 0644, show_log_flags, NULL);
>
> Do we really need that interface? It seems rather obscure.
> mcelog should get these flags anyways. Better to drop it.

For overflow flag, it seems not very useful. I will drop this.

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-24 06:07:05

by Huang, Ying

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Wed, 2009-04-22 at 18:16 +0800, Robert Richter wrote:
> On 22.04.09 11:22:59, Ingo Molnar wrote:
> >
> > * Huang Ying <[email protected]> wrote:
> >
> > > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > > scalability. Basic design is as follow:
> >
> > Before changing anything substantial in the MCE code it would be
> > necessary to clean up and then unify the 32-bit and 64-bit side of
> > the MCE code first. (Which essentially means extending the
> > 64-bit-only code to 32-bit)
>
> You may also want to consider to use the in-kernel ring_buffer api
> (include/linux/ring_buffer.h).

It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
used in NMI context and interrupt context. When will ring_buffer to be
NMI-safe?

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On 24.04.09 14:06:50, Huang Ying wrote:
> On Wed, 2009-04-22 at 18:16 +0800, Robert Richter wrote:
> > On 22.04.09 11:22:59, Ingo Molnar wrote:
> > >
> > > * Huang Ying <[email protected]> wrote:
> > >
> > > > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > > > scalability. Basic design is as follow:
> > >
> > > Before changing anything substantial in the MCE code it would be
> > > necessary to clean up and then unify the 32-bit and 64-bit side of
> > > the MCE code first. (Which essentially means extending the
> > > 64-bit-only code to 32-bit)
> >
> > You may also want to consider to use the in-kernel ring_buffer api
> > (include/linux/ring_buffer.h).
>
> It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> used in NMI context and interrupt context. When will ring_buffer to be
> NMI-safe?

You can use it in nmi context with separate read and write
buffers. See this patch description:
6dad828b76c7224a22ddc9ce7aa495d994f03b31

Not sure if somebody will make the ring_buffer non-locking.

-Robert

--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]

2009-04-24 13:36:24

by Steven Rostedt

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


On Fri, 24 Apr 2009, Robert Richter wrote:

> On 24.04.09 14:06:50, Huang Ying wrote:
> > On Wed, 2009-04-22 at 18:16 +0800, Robert Richter wrote:
> > > On 22.04.09 11:22:59, Ingo Molnar wrote:
> > > >
> > > > * Huang Ying <[email protected]> wrote:
> > > >
> > > > > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > > > > scalability. Basic design is as follow:
> > > >
> > > > Before changing anything substantial in the MCE code it would be
> > > > necessary to clean up and then unify the 32-bit and 64-bit side of
> > > > the MCE code first. (Which essentially means extending the
> > > > 64-bit-only code to 32-bit)
> > >
> > > You may also want to consider to use the in-kernel ring_buffer api
> > > (include/linux/ring_buffer.h).
> >
> > It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> > used in NMI context and interrupt context. When will ring_buffer to be
> > NMI-safe?
>
> You can use it in nmi context with separate read and write
> buffers. See this patch description:
> 6dad828b76c7224a22ddc9ce7aa495d994f03b31
>
> Not sure if somebody will make the ring_buffer non-locking.

It already is ;-)

I've put in for a patent application on the algorithm so I must wait till
it is processed before I can release the code.

-- Steve

2009-04-27 00:58:24

by Huang, Ying

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Fri, 2009-04-24 at 18:09 +0800, Robert Richter wrote:
> On 24.04.09 14:06:50, Huang Ying wrote:
> > On Wed, 2009-04-22 at 18:16 +0800, Robert Richter wrote:
> > > On 22.04.09 11:22:59, Ingo Molnar wrote:
> > > >
> > > > * Huang Ying <[email protected]> wrote:
> > > >
> > > > > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > > > > scalability. Basic design is as follow:
> > > >
> > > > Before changing anything substantial in the MCE code it would be
> > > > necessary to clean up and then unify the 32-bit and 64-bit side of
> > > > the MCE code first. (Which essentially means extending the
> > > > 64-bit-only code to 32-bit)
> > >
> > > You may also want to consider to use the in-kernel ring_buffer api
> > > (include/linux/ring_buffer.h).
> >
> > It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> > used in NMI context and interrupt context. When will ring_buffer to be
> > NMI-safe?
>
> You can use it in nmi context with separate read and write
> buffers. See this patch description:
> 6dad828b76c7224a22ddc9ce7aa495d994f03b31

You use it as oprofile CPU buffer. There is only one reader and one
writer for oprofile CPU buffer, so separating read and write buffers is
sufficient for it to be used there. But there is multiple writer for
MCELOG buffer (from NMI, IRQ and timer), so I think your method doesn't
work here.

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-27 07:50:18

by Huang, Ying

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Fri, 2009-04-24 at 21:36 +0800, Steven Rostedt wrote:
> On Fri, 24 Apr 2009, Robert Richter wrote:
> > >
> > > It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> > > used in NMI context and interrupt context. When will ring_buffer to be
> > > NMI-safe?
> >
> > You can use it in nmi context with separate read and write
> > buffers. See this patch description:
> > 6dad828b76c7224a22ddc9ce7aa495d994f03b31
> >
> > Not sure if somebody will make the ring_buffer non-locking.
>
> It already is ;-)
>
> I've put in for a patent application on the algorithm so I must wait till
> it is processed before I can release the code.

When will it be merged by mainline kernel? Do you have a plan?

We do have some scalability issues of current mcelog implementation, and
hopes that can be solved as soon as possible, perhaps for 2.6.31?

Best Regards,
Huang Ying


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2009-04-27 07:53:34

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Huang Ying wrote:
> On Fri, 2009-04-24 at 21:36 +0800, Steven Rostedt wrote:
>> On Fri, 24 Apr 2009, Robert Richter wrote:
>>>> It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
>>>> used in NMI context and interrupt context. When will ring_buffer to be
>>>> NMI-safe?
>>> You can use it in nmi context with separate read and write
>>> buffers. See this patch description:
>>> 6dad828b76c7224a22ddc9ce7aa495d994f03b31
>>>
>>> Not sure if somebody will make the ring_buffer non-locking.
>> It already is ;-)
>>
>> I've put in for a patent application on the algorithm so I must wait till
>> it is processed before I can release the code.
>
> When will it be merged by mainline kernel? Do you have a plan?

Patent applications tend to be measured in years. Also I'm not sure
we really want patented algorithms in the kernel anyways if it can
be avoided. So the patent probably makes it impractical to use this
thing at all.

-Andi

2009-04-27 14:28:04

by Steven Rostedt

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


On Mon, 27 Apr 2009, Huang Ying wrote:

> On Fri, 2009-04-24 at 21:36 +0800, Steven Rostedt wrote:
> > On Fri, 24 Apr 2009, Robert Richter wrote:
> > > >
> > > > It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> > > > used in NMI context and interrupt context. When will ring_buffer to be
> > > > NMI-safe?
> > >
> > > You can use it in nmi context with separate read and write
> > > buffers. See this patch description:
> > > 6dad828b76c7224a22ddc9ce7aa495d994f03b31
> > >
> > > Not sure if somebody will make the ring_buffer non-locking.
> >
> > It already is ;-)
> >
> > I've put in for a patent application on the algorithm so I must wait till
> > it is processed before I can release the code.
>
> When will it be merged by mainline kernel? Do you have a plan?
>
> We do have some scalability issues of current mcelog implementation, and
> hopes that can be solved as soon as possible, perhaps for 2.6.31?

Yes, I plan on being able to post it before the 31 merge window opens.
I'll ping the lawyer to expedite the process.

-- Steve

2009-04-27 14:37:13

by Andi Kleen

[permalink] [raw]
Subject: Patenting kernel patches was Re: Re-implement MCE log ring buffer as per-CPU ring buffer


>>> I've put in for a patent application on the algorithm so I must wait till
>>> it is processed before I can release the code.
>> When will it be merged by mainline kernel? Do you have a plan?
>>
>> We do have some scalability issues of current mcelog implementation, and
>> hopes that can be solved as soon as possible, perhaps for 2.6.31?
>
> Yes, I plan on being able to post it before the 31 merge window opens.
> I'll ping the lawyer to expedite the process.

Sorry, but this is quite ridiculous. Are you serious?

Why would we want such a patented algorithm in the kernel? Normally the standard
policy is to avoid patented algorithms (unless there's really no alternative like
with RCU which is clearly not the case here) and I'm not aware of this policy haven't
changed.

And also holding up perfectly good uncontaminated patches for something
patented seems especially wrong.

I think we should move forward with a standard non patented ring
buffer Ying was working on for this and avoid the patent mess as
far as possible.

-Andi (who wonders if he isn't in bizarro land now)

2009-04-27 14:39:43

by Steven Rostedt

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


On Mon, 27 Apr 2009, Andi Kleen wrote:

> Huang Ying wrote:
> > On Fri, 2009-04-24 at 21:36 +0800, Steven Rostedt wrote:
> > > On Fri, 24 Apr 2009, Robert Richter wrote:
> > > > > It seems that ring_buffer is not NMI-safe, while mcelog buffer will be
> > > > > used in NMI context and interrupt context. When will ring_buffer to be
> > > > > NMI-safe?
> > > > You can use it in nmi context with separate read and write
> > > > buffers. See this patch description:
> > > > 6dad828b76c7224a22ddc9ce7aa495d994f03b31
> > > >
> > > > Not sure if somebody will make the ring_buffer non-locking.
> > > It already is ;-)
> > >
> > > I've put in for a patent application on the algorithm so I must wait till
> > > it is processed before I can release the code.
> >
> > When will it be merged by mainline kernel? Do you have a plan?
>
> Patent applications tend to be measured in years. Also I'm not sure
> we really want patented algorithms in the kernel anyways if it can
> be avoided. So the patent probably makes it impractical to use this
> thing at all.

Once it is filed then I can post it, I do not need to wait till it is
done.

And what the hell do you mean about not using patents?? We have several
patents in the kernel. Have you ever heard of the Open Invention Network?

http://www.openinventionnetwork.com/

This patent will go under the Open Source patent pool to help
protect against patent attacks against Linux, et. al.

Yes, the Open Source community can file their own patents. The more
patents owned by the Open Source community, the better.

-- Steve

2009-04-27 14:51:41

by Davide Libenzi

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Mon, 27 Apr 2009, Steven Rostedt wrote:

> Once it is filed then I can post it, I do not need to wait till it is
> done.
>
> And what the hell do you mean about not using patents?? We have several
> patents in the kernel. Have you ever heard of the Open Invention Network?
>
> http://www.openinventionnetwork.com/
>
> This patent will go under the Open Source patent pool to help
> protect against patent attacks against Linux, et. al.
>
> Yes, the Open Source community can file their own patents. The more
> patents owned by the Open Source community, the better.

IANAL, but what is the point of it? If that's for OSS IP protection
against Evil Corporate claims, isn't the only fact that you posted the
code and algorirthm, constitutes Prior Art, that would automatically
protect your contribution from future claims?


- Davide

2009-04-27 14:52:29

by Steven Rostedt

[permalink] [raw]
Subject: Re: Patenting kernel patches was Re: Re-implement MCE log ring buffer as per-CPU ring buffer


On Mon, 27 Apr 2009, Andi Kleen wrote:
> > Yes, I plan on being able to post it before the 31 merge window opens. I'll
> > ping the lawyer to expedite the process.
>
> Sorry, but this is quite ridiculous. Are you serious?

Yes.

>
> Why would we want such a patented algorithm in the kernel? Normally the
> standard
> policy is to avoid patented algorithms (unless there's really no alternative
> like
> with RCU which is clearly not the case here) and I'm not aware of this policy
> haven't
> changed.
>
> And also holding up perfectly good uncontaminated patches for something
> patented seems especially wrong.
>
> I think we should move forward with a standard non patented ring
> buffer Ying was working on for this and avoid the patent mess as
> far as possible.

In the world where patents can destroy Open Source/Free Software, one of
the protections that Free Software can do is to file their own patents.
Thus fight fire with fire.

>
> -Andi (who wonders if he isn't in bizarro land now)

Welcome to bizarro land!

-- Steve

2009-04-27 14:57:57

by Steven Rostedt

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer


On Mon, 27 Apr 2009, Davide Libenzi wrote:

> On Mon, 27 Apr 2009, Steven Rostedt wrote:
>
> > Once it is filed then I can post it, I do not need to wait till it is
> > done.
> >
> > And what the hell do you mean about not using patents?? We have several
> > patents in the kernel. Have you ever heard of the Open Invention Network?
> >
> > http://www.openinventionnetwork.com/
> >
> > This patent will go under the Open Source patent pool to help
> > protect against patent attacks against Linux, et. al.
> >
> > Yes, the Open Source community can file their own patents. The more
> > patents owned by the Open Source community, the better.
>
> IANAL, but what is the point of it? If that's for OSS IP protection
> against Evil Corporate claims, isn't the only fact that you posted the
> code and algorirthm, constitutes Prior Art, that would automatically
> protect your contribution from future claims?

Not always. There could be patent applications that have been out before
you posted. But what you want is not to be on the defensive side only. If
a company comes and attacks you over a patent they own, you pull out all
your patents and look to see if the company that is attacking you is using
anything you own.

As I said, the more patents that the Open Source community owns the
better. The Open Invention Network is posed to counter attack any company
that tries to attack a free software group. It is not just to counter with
prior art, but also threaten the company that is attacking.

-- Steve

2009-04-27 15:33:32

by Alan

[permalink] [raw]
Subject: Re: Patenting kernel patches was Re: Re-implement MCE log ring buffer as per-CPU ring buffer

> And also holding up perfectly good uncontaminated patches for something
> patented seems especially wrong.
>
> I think we should move forward with a standard non patented ring
> buffer Ying was working on for this and avoid the patent mess as
> far as possible.

Agreed. If the future patches to be presented are useful and the
licensing usable etc then they can still be merged in the future (or far
future depending how much look folks have speeding up lawyers ;))

Alan

2009-04-27 16:15:07

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Mon, 27 Apr 2009 07:44:13 PDT, Davide Libenzi said:

> IANAL, but what is the point of it? If that's for OSS IP protection
> against Evil Corporate claims, isn't the only fact that you posted the
> code and algorirthm, constitutes Prior Art, that would automatically
> protect your contribution from future claims?

The general idea is that if some Big-Linux-Hating-Company comes to you and says
"You're infringing our patent #XXZZZY in this module", you want to be able pull
out a sheet of paper and say "Yes, and you've shipped 45,348,917 copies of
FooBar Release 7, that infringes on *our* patent #YYXXZ. You probably owe
us more money than we owe you. But we'll let you walk out the door and
never mention it again."

If you're old enough to remember the Cold War, it was called Mutually
Assured Destruction back then. ;)


Attachments:
(No filename) (226.00 B)

2009-04-27 17:06:19

by Theodore Ts'o

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

On Mon, Apr 27, 2009 at 10:39:29AM -0400, Steven Rostedt wrote:
> > > > it is processed before I can release the code.
> > >
> > > When will it be merged by mainline kernel? Do you have a plan?
> >
> > Patent applications tend to be measured in years. Also I'm not sure
> > we really want patented algorithms in the kernel anyways if it can
> > be avoided. So the patent probably makes it impractical to use this
> > thing at all.
>
> Once it is filed then I can post it, I do not need to wait till it is
> done.

Simply preparing a patent application for filing can take a
non-trivial amount of time. I'd suggest that you should ask your
patent attorney whether he is planning on filing internationally, or
just in the US.

If he (or she) is only planning only in the US, unlike the rest of the
world, the US is not a first-to-file, so as long as you have proof of
when you invented it --- for example, getting a description of your
invention notarized --- you might be able to get clearance from your
legal department to submit the patch while the patent application is
getting prepared.

Reference:
http://en.wikipedia.org/wiki/First_to_file_and_first_to_invent

If your attorney is not willing to do this, all I can suggest is that
you ask him to ask his opposite numbers at other open source companies
how they handle this situation at a future Linux Foundation Legal
Summit. I can assure you there are far more rational ways of handling
this than forcing people to wait until the patent application is
actually filed.

> And what the hell do you mean about not using patents?? We have several
> patents in the kernel. Have you ever heard of the Open Invention Network?
>
> http://www.openinventionnetwork.com/
>
> This patent will go under the Open Source patent pool to help
> protect against patent attacks against Linux, et. al.
>
> Yes, the Open Source community can file their own patents. The more
> patents owned by the Open Source community, the better.

No argument here. It's just that there are better ways of doing
things than waiting until after the patent application is fully
drafted, all of the engineering drawings are done, the language of
patent claims are fine-tuned, etc., all of which can take a long, LONG
time, depending on how busy your patent attorney and his (or her)
staff might happen to be.

- Ted

2009-04-27 20:07:22

by H. Peter Anvin

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer

Theodore Tso wrote:
> Simply preparing a patent application for filing can take a
> non-trivial amount of time. I'd suggest that you should ask your
> patent attorney whether he is planning on filing internationally, or
> just in the US.
>
> If he (or she) is only planning only in the US, unlike the rest of the
> world, the US is not a first-to-file, so as long as you have proof of
> when you invented it --- for example, getting a description of your
> invention notarized --- you might be able to get clearance from your
> legal department to submit the patch while the patent application is
> getting prepared.

Keep in mind, though, the one-year disclosure-to-file statutory bar.

-hpa

2009-04-29 19:15:26

by Andi Kleen

[permalink] [raw]
Subject: Re: Re-implement MCE log ring buffer as per-CPU ring buffer II

H. Peter Anvin wrote:
> Andi Kleen wrote:
>> And then restart here? Add 32bit workarounds, add hook
>> to initialize p5/winchip, make 64bit code 32bit clean.
>> That can come straight from my patchkit. I can do that
>> if there is interest.
>>
>
> Could you come up with a tree we can look at?

Sorry for taking longer, had to fight some infrastructure issues.

I merged the two trees in
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git mce-32bit-merge

This is my old 32bit mce unification tree forward ported to mce2 and some fixes
for the code in mce2. This is currently with Kconfig for old/new, deprecation
of old for .32 (could be done directly too, but doing it with this additional
step seems safer to me because it will allow easier debugging)

This currently includes the two error injection patches as last (used
for testing), those can be dropped/added later of course too.

If this is ok I can respin the rest of the mce patches on top of that
tree.

-Andi

2009-04-30 07:39:11

by Hidetoshi Seto

[permalink] [raw]
Subject: 32bit mce unification (Re: Re-implement MCE log ring buffer as per-CPU ring buffer II)

Hi Andi,

Andi Kleen wrote:
> H. Peter Anvin wrote:
>> Andi Kleen wrote:
>>> And then restart here? Add 32bit workarounds, add hook
>>> to initialize p5/winchip, make 64bit code 32bit clean.
>>> That can come straight from my patchkit. I can do that
>>> if there is interest.
>>>
>>
>> Could you come up with a tree we can look at?
>
> Sorry for taking longer, had to fight some infrastructure issues.
>
> I merged the two trees in
> git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git
> mce-32bit-merge
>
> This is my old 32bit mce unification tree forward ported to mce2 and
> some fixes
> for the code in mce2. This is currently with Kconfig for old/new,
> deprecation
> of old for .32 (could be done directly too, but doing it with this
> additional
> step seems safer to me because it will allow easier debugging)
>
> This currently includes the two error injection patches as last (used
> for testing), those can be dropped/added later of course too.
>
> If this is ok I can respin the rest of the mce patches on top of that
> tree.

Thank you for providing updates.
They are looks good, but I have some comments:


We can get an instant numbered list from your tree by following:

$ git format-patch origin/master..origin/mce-32bit-merge
0001-x86-mce-Add-mce_threshold-option-for-intel-cmci.patch
0002-x86-mce-Cleanup-param-parser.patch
0003-x86-mce-Add-mce-nopoll-option-to-disable-timer-pol.patch
0004-x86-mce-clean-up-the-mce_64.c-code.patch
0005-x86-mce-clean-up-the-sysfs-variables-namespace.patch
0006-x86-mce-clean-up-mce_amd_64.c.patch
0007-x86-mce-clean-up-p4.c.patch
0008-x86-mce-clean-up-therm_throt.c.patch
0009-x86-mce-clean-up-p6.c.patch
0010-x86-mce-clean-up-k7.c.patch
0011-x86-mce-clean-up-non-fatal.c.patch
0012-x86-mce-clean-up-mce_32.c.patch
0013-x86-mce-clean-up-p5.c.patch
0014-x86-mce-clean-up-winchip.c.patch
0015-x86-mce-clean-up-mce.h.patch
0016-mce-unify-Intel-thermal-init-prepare.patch
0017-x86-mce-unify-the-Intel-thermal-interrupt-code.patch
0018-x86-mce-clean-up-mce_intel.c.patch
0019-x86-mce-clean-up-mce_64.c.patch
0020-x86-mce-prepare-mce.h-and-mce_64.c-for-unification.patch
0021-x86-mce-prepare-unification.patch
0022-x86-mce-unify-prepare-for-32-bit.patch
0023-x86-mce-unify.patch
0024-x86-mce-print-number-of-MCE-banks.patch
0025-x86-MCE-Make-mce_amd_64.c-compile-again.patch
0026-Revert-x86-mce-Add-mce-nopoll-option-to-disable-t.patch
0027-x86-MCE-Move-ifdef-X86_64-to-the-beginning-of-the.patch
0028-x86-MCE-Initial-steps-to-make-64bit-mce-code-32bit.patch
0029-x86-MCE-Implement-the-PPro-bank-0-quirk-in-the-64b.patch
0030-x86-MCE-Port-K7-bank-0-quirk-to-64bit-mce-code.patch
0031-x86-MCE-Use-a-call-vector-to-call-the-64bit-mce-ha.patch
0032-x86-MCE-Rename-64bit-mce_dont_init-to-mce_disabled.patch
0033-x86-MCE-Move-mce_disabled-option-into-common-64bit.patch
0034-x86-MCE-Remove-machine-check-handler-idle-notify-o.patch
0035-x86-MCE-Remove-oops_begin-use-in-64bit-machine-c.patch
0036-x86-MCE-Remove-unused-stop-restart_mce-on-32bit.patch
0037-x86-MCE-Use-64bit-machine-check-code-on-32bit.patch
0038-x86-MCE-Deprecate-old-32bit-machine-check-code.patch
0039-x86-MCE-Enable-MCE_INTEL-for-32bit-new-MCE.patch
0040-x86-MCE-Enable-MCE_AMD-for-32bit-NEW_MCE.patch
0041-x86-MCE-Document-new-32bit-mcelog-requirement-in-D.patch
0042-x86-MCE-Add-MSR-read-wrappers-for-easier-error-inj.patch
0043-x86-MCE-Add-basic-error-injection-infrastructure.patch


Let's start comment/review on this list:

0001-x86-mce-Add-mce_threshold-option-for-intel-cmci.patch
0002-x86-mce-Cleanup-param-parser.patch
0003-x86-mce-Add-mce-nopoll-option-to-disable-timer-pol.patch

These 3 above are merged in tip/x86/mce2, but not pushed into
mainline yet.
You(Andi) and I agreed that 0001 and 0003 should be reverted and
re-implemented before pushing them into .31, and I have patches
to do it.
One of revert patches is placed in 0026. It's fine but I'd like to
replace it with that I own, which have more detailed description.

0004-x86-mce-clean-up-the-mce_64.c-code.patch
0005-x86-mce-clean-up-the-sysfs-variables-namespace.patch
0006-x86-mce-clean-up-mce_amd_64.c.patch
0007-x86-mce-clean-up-p4.c.patch
0008-x86-mce-clean-up-therm_throt.c.patch
0009-x86-mce-clean-up-p6.c.patch
0010-x86-mce-clean-up-k7.c.patch
0011-x86-mce-clean-up-non-fatal.c.patch
0012-x86-mce-clean-up-mce_32.c.patch
0013-x86-mce-clean-up-p5.c.patch
0014-x86-mce-clean-up-winchip.c.patch
0015-x86-mce-clean-up-mce.h.patch
0016-mce-unify-Intel-thermal-init-prepare.patch
0017-x86-mce-unify-the-Intel-thermal-interrupt-code.patch
0018-x86-mce-clean-up-mce_intel.c.patch
0019-x86-mce-clean-up-mce_64.c.patch
0020-x86-mce-prepare-mce.h-and-mce_64.c-for-unification.patch
0021-x86-mce-prepare-unification.patch
0022-x86-mce-unify-prepare-for-32-bit.patch
0023-x86-mce-unify.patch
0024-x86-mce-print-number-of-MCE-banks.patch

These 21 patches above are now top of tip/x86/mce2.
Some are broken and some following patches from Andi are
addressed to fix it.

0025-x86-MCE-Make-mce_amd_64.c-compile-again.patch

The description of this patch is "It's device_mce, not mce_dev."
This is for a bug of 0005, which was intended to rename device_mce
to mce_dev, but the rename was not finished.
I suppose "mce_" prefix is better for name space, so I think
rework on 0005 or incremental fix to finish renaming will be more
better solution.

0026-Revert-x86-mce-Add-mce-nopoll-option-to-disable-t.patch

This reverts 0003. I'd like to replace this with that I own.

0027-x86-MCE-Move-ifdef-X86_64-to-the-beginning-of-the.patch

This is for a bug of 0022, that places ifdefs wrongly.


0028-x86-MCE-Initial-steps-to-make-64bit-mce-code-32bit.patch
0029-x86-MCE-Implement-the-PPro-bank-0-quirk-in-the-64b.patch
0030-x86-MCE-Port-K7-bank-0-quirk-to-64bit-mce-code.patch
0031-x86-MCE-Use-a-call-vector-to-call-the-64bit-mce-ha.patch
0032-x86-MCE-Rename-64bit-mce_dont_init-to-mce_disabled.patch
0033-x86-MCE-Move-mce_disabled-option-into-common-64bit.patch
0034-x86-MCE-Remove-machine-check-handler-idle-notify-o.patch
0035-x86-MCE-Remove-oops_begin-use-in-64bit-machine-c.patch
0036-x86-MCE-Remove-unused-stop-restart_mce-on-32bit.patch
0037-x86-MCE-Use-64bit-machine-check-code-on-32bit.patch
0038-x86-MCE-Deprecate-old-32bit-machine-check-code.patch
0039-x86-MCE-Enable-MCE_INTEL-for-32bit-new-MCE.patch
0040-x86-MCE-Enable-MCE_AMD-for-32bit-NEW_MCE.patch
0041-x86-MCE-Document-new-32bit-mcelog-requirement-in-D.patch
0042-x86-MCE-Add-MSR-read-wrappers-for-easier-error-inj.patch
0043-x86-MCE-Add-basic-error-injection-infrastructure.patch

These 16 patches above is 32bit mce unification by Andi.
0028~0036 are preparation and cleanup,
0037~0041 are main part of unification, and
0042~0043 are for error injection.
My comments are as followings:


0029,0037,0038,0041:
have space before tab in indent / trailing whitespace.


0038:
The change in the first line is not required, and we need to schedule
feature removal not so soon.
Since the patch description is "Schedule for removal in 2.6.32",
the "When:" should be .32 instead of .31

>@@ -1,4 +1,4 @@
>-The following is a list of files and features that are going to be
>+he following is a list of files and features that are going to be
> removed in the kernel source tree. Every entry should contain what
> exactly is going away, why it is happening, and who is going to be doing
> the work. When the feature is removed from the kernel, it should also
>@@ -428,3 +428,13 @@
> After a reasonable transition period, we will remove the legacy
> fakephp interface.
> Who: Alex Chiang <[email protected]>
>+
>+----------------------------
>+
>+What: CONFIG_X86_OLD_MCE
>+When: 2.6.31
>+Why: Remove the old legacy 32bit machine check code. This has been
>+ superseded by the newer machine check code from the 64bit port,
>+ but the old version has been kept around for easier testing. Note this
>+ doesn't impact the old P5 and WinChip machine check handlers.
>+Who: Andi Kleen <[email protected]>


0039:
The comment is wrongly changed, and it would be better to place vectors
in sorted order.

>@@ -88,12 +88,13 @@
> #define THERMAL_APIC_VECTOR 0xfa
>
> #ifdef CONFIG_X86_32
>-/* 0xf8 - 0xf9 : free */
>+/* 0xf9 : free */
> #else
>-# define THRESHOLD_APIC_VECTOR 0xf9
> # define UV_BAU_MESSAGE 0xf8
> #endif
>
>+#define THRESHOLD_APIC_VECTOR 0xf9
>+
> /* f0-f7 used for spreading out TLB flushes: */
> #define INVALIDATE_TLB_VECTOR_END 0xf7
> #define INVALIDATE_TLB_VECTOR_START 0xf0


0041:
Is this version correct?
The latest (git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git) is:
#define MCELOG_VERSION "0.8pre2"

>@@ -48,6 +48,7 @@ o procps 3.2.0 # ps --version
> o oprofile 0.9 # oprofiled --version
> o udev 081 # udevinfo -V
> o grub 0.93 # grub --version
>+o mcelog 0.6
>
> Kernel compilation
> ==================


0043:
From checkpatch:
WARNING: usage of NR_CPUS is often wrong - consider using cpu_possible(), num_possible_cpus(), for_each_possible_cpu(), etc
#174: FILE: arch/x86/kernel/cpu/mcheck/mce-inject.c:96:
+ if (m.cpu >= NR_CPUS || !cpu_online(m.cpu))

Plus, with CONFIG_X86_MCE_INJECT=m, I got:
ERROR: "add_timer_on" [arch/x86/kernel/cpu/mcheck/mce-inject.ko] undefined!
IIRC there was a patch named "Export add_timer_on for modules" for
error injection. (Or how about using bool instead of tristate if there
are no strong reason to make it kernel module?)

>@@ -823,6 +823,14 @@ config X86_MCE_THRESHOLD
> bool
> default y
>
>+config X86_MCE_INJECT
>+ depends on X86_NEW_MCE
>+ tristate "Machine check injector support"
>+ help
>+ Provide support for injecting machine checks for testing purposes.
>+ If you don't know what a machine check is and you don't do kernel
>+ QA it is safe to say n.
>+
> config X86_MCE_NONFATAL
> tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
> depends on X86_OLD_MCE


Thanks,
H.Seto