Re-implement MCE log ring buffer as per-CPU ring buffer for better
scalability. Basic design is as follow:
- One ring buffer for each CPU
+ MCEs are added to corresponding local per-CPU buffer, instead of
one big global buffer. Contention/unfairness between CPUs is
eliminated.
+ MCE records are read out and removed from per-CPU buffers by a mutex
protected global reader function, because in most cases there are not
many readers in the system to contend.
- Per-CPU ring buffer data structure
+ An array is used to hold MCE records. integer "head" indicates
next writing position and integer "tail" indicates next reading
position.
+ To distinguish buffer empty and full, head and tail wrap to 0 at
MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
position is head % MCE_LOG_LEN, and real next reading position is
tail % MCE_LOG_LEN. If buffer is empty, head == tail, if buffer is
full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head != tail.
- Lock-less for writer side
+ MCE log writer may come from NMI, so the writer side must be
lock-less. For per-CPU buffer of one CPU, writers may come from
process, IRQ or NMI context, so "head" is increased with
cmpxchg_local() to allocate buffer space.
+ Reader side is protected with a mutex to guarantee only one reader
is active in the whole system.
Performance tests show that the throughput of per-CPU mcelog buffer can
reach 430k records/s compared with 5.3k records/s for original
implementation on a 2-core 2.1GHz Core2 machine.
ChangeLog:
v4:
- Rebased on x86-tip.git x86/mce3 branch
- Fix a synchronization issue about mce.finished
- Fix comment style
v3:
- Use DEFINE_PER_CPU to allocate per_cpu mcelog buffer.
- Use cond_resched() to prevent a possible system unresponsiveness issue
for large user buffers.
v2:
- Use alloc_percpu() to allocate per_cpu mcelog buffer.
- Use ndelay to implement writer timeout.
Signed-off-by: Huang Ying <[email protected]>
CC: Andi Kleen <[email protected]>
---
arch/x86/include/asm/mce.h | 17 +-
arch/x86/kernel/cpu/mcheck/mce.c | 277 ++++++++++++++++++++++++---------------
2 files changed, 183 insertions(+), 111 deletions(-)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -52,20 +52,27 @@ struct mce {
* is set.
*/
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+#define MCE_LOG_LIMIT (MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+ return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
+
+struct mce_log_cpu;
struct mce_log {
- char signature[12]; /* "MACHINECHECK" */
+ char signature[12]; /* "MACHINECHEC2" */
unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
unsigned flags;
unsigned pad0;
- struct mce entry[MCE_LOG_LEN];
+ struct mce_log_cpu *mcelog_cpus;
};
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
-#define MCE_LOG_SIGNATURE "MACHINECHECK"
+#define MCE_LOG_SIGNATURE "MACHINECHEC2"
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -55,6 +55,9 @@ int mce_disabled;
#define MISC_MCELOG_MINOR 227
+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_NS NSEC_PER_MSEC
+
atomic_t mce_entry;
/*
@@ -110,42 +113,50 @@ static struct mce_log mcelog = {
MCE_LOG_LEN,
};
+struct mce_log_cpu {
+ int head;
+ int tail;
+ unsigned long flags;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
void mce_log(struct mce *mce)
{
- unsigned next, entry;
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
+ int head, ihead, tail, next;
mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
+ /*
+ * mce->finished must be set to 0 before written to ring
+ * buffer
+ */
+ smp_wmb();
+ do {
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+ ihead = mce_log_index(head);
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting.
+ */
+ if (ihead == mce_log_index(tail) && head != tail) {
+ set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
+ return;
}
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
+ next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+ } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
+ memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
+ /*
+ * ".finished" of MCE record in ring buffer must be set after
+ * copy
+ */
+ smp_wmb();
+ mcelog_cpu->entry[ihead].finished = 1;
+ /* bit 0 of notify_user should be set after finished be set */
+ smp_wmb();
set_bit(0, ¬ify_user);
}
@@ -175,22 +186,38 @@ static void print_mce(struct mce *m)
"and contact your hardware vendor\n");
}
-static void mce_panic(char *msg, struct mce *backup, u64 start)
+static int mce_print_cpu(int cpu, struct mce *backup, u64 start)
{
int i;
+ struct mce_log_cpu *mcelog_cpu;
- bust_spinlocks(1);
- console_verbose();
+ mcelog_cpu = &__get_cpu_var(mce_log_cpus);
for (i = 0; i < MCE_LOG_LEN; i++) {
- u64 tsc = mcelog.entry[i].tsc;
+ u64 tsc = mcelog_cpu->entry[i].tsc;
if ((s64)(tsc - start) < 0)
continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
+ print_mce(&mcelog_cpu->entry[i]);
+ if (backup && mcelog_cpu->entry[i].tsc == backup->tsc)
backup = NULL;
}
- if (backup)
+ return backup == NULL;
+}
+
+static void mce_panic(char *msg, struct mce *backup, u64 start)
+{
+ int cpu, cpu_self;
+
+ bust_spinlocks(1);
+ console_verbose();
+ cpu_self = smp_processor_id();
+ for_each_online_cpu(cpu) {
+ if (cpu == cpu_self)
+ continue;
+ if (mce_print_cpu(cpu, backup, start))
+ backup = NULL;
+ }
+ if (!mce_print_cpu(cpu_self, backup, start))
print_mce(backup);
panic(msg);
}
@@ -642,6 +669,16 @@ static int mce_cap_init(void)
return 0;
}
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+ if (mcelog.mcelog_cpus)
+ return;
+ mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
+}
+
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -771,6 +808,7 @@ void __cpuinit mcheck_init(struct cpuinf
mce_disabled = 1;
return;
}
+ mce_log_init();
mce_cpu_quirks(c);
machine_check_vector = do_machine_check;
@@ -819,94 +857,121 @@ static int mce_release(struct inode *ino
return 0;
}
-static void collect_tscs(void *data)
+static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
+ char __user *inubuf, size_t usize)
{
- unsigned long *cpu_tsc = (unsigned long *)data;
+ char __user *ubuf = inubuf;
+ int head, tail, pos, i, err = 0;
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static DEFINE_MUTEX(mce_read_mutex);
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
-
- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return -EINVAL;
- }
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
+ if (head == tail)
+ return 0;
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
+ for (pos = tail; pos != head && usize >= sizeof(struct mce);
+ pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+ i = mce_log_index(pos);
+ if (!mcelog_cpu->entry[i].finished) {
+ int timeout = WRITER_TIMEOUT_NS;
+ while (!mcelog_cpu->entry[i].finished) {
+ if (timeout-- <= 0) {
+ memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
+ head = mcelog_cpu->head;
+ printk(KERN_WARNING "mcelog: timeout "
+ "waiting for writer to finish!\n");
goto timeout;
}
- cpu_relax();
+ ndelay(1);
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
}
+ /*
+ * finished field should be checked before
+ * copy_to_user()
+ */
+ smp_rmb();
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+ sizeof(struct mce));
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ mcelog_cpu->entry[i].finished = 0;
+timeout:
+ ;
+ }
+ /*
+ * mcelog_cpu->tail must be updated after ".finished" of
+ * corresponding MCE records are clear.
+ */
+ smp_wmb();
+ mcelog_cpu->tail = pos;
+
+ return err ? -EFAULT : ubuf - inubuf;
+}
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
+static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
+{
+ int head, tail;
- synchronize_sched();
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
+ return head == tail;
+}
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
+static int mce_empty(void)
+{
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ if (!mce_empty_cpu(mcelog_cpu))
+ return 0;
}
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
+ return 1;
+}
- return err ? -EFAULT : buf - ubuf;
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *ubuf = inubuf;
+ struct mce_log_cpu *mcelog_cpu;
+ int cpu, new_mce, err = 0;
+ static DEFINE_MUTEX(mce_read_mutex);
+
+ mutex_lock(&mce_read_mutex);
+ do {
+ new_mce = 0;
+ for_each_possible_cpu(cpu) {
+ if (usize < sizeof(struct mce))
+ goto out;
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ err = mce_read_cpu(mcelog_cpu, ubuf,
+ sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ new_mce = 1;
+ err = 0;
+ } else if (err < 0)
+ goto out;
+ }
+ if (need_resched()) {
+ mutex_unlock(&mce_read_mutex);
+ cond_resched();
+ mutex_lock(&mce_read_mutex);
+ }
+ } while (new_mce || !mce_empty());
+out:
+ mutex_unlock(&mce_read_mutex);
+ return err ? : ubuf - inubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (!mce_empty())
return POLLIN | POLLRDNORM;
return 0;
}
Huang Ying wrote:
> Re-implement MCE log ring buffer as per-CPU ring buffer for better
> scalability. Basic design is as follow:
Looks good. Thanks.
Acked-by: Andi Kleen <[email protected]>
>
> Performance test show that the throughput of per-CPU mcelog buffer can
> reach 430k records/s compared with 5.3k records/s for original
> implementation on a 2-core 2.1GHz Core2 machine.
I should add that the main reason for this patch is less the performance
(normally there shouldn't be that many errors, so it doesn't matter too much),
but that it scales to a larger number of CPUs. The old mce log ring had a fixed
sized buffer that was hard to extend and didn't scale on larger systems,
which could cause overflows and other issues.
Also this fixes a long standing known livelock in the old code where
a very high continuous error rate could livelock the reader.
-Andi
Huang Ying wrote:
> Re-implement MCE log ring buffer as per-CPU ring buffer for better
> scalability. Basic design is as follow:
That would be great. This should be in .31, I think.
Some minor points:
> struct mce_log {
> - char signature[12]; /* "MACHINECHECK" */
> + char signature[12]; /* "MACHINECHEC2" */
> unsigned len; /* = MCE_LOG_LEN */
> - unsigned next;
> unsigned flags;
> unsigned pad0;
> - struct mce entry[MCE_LOG_LEN];
> + struct mce_log_cpu *mcelog_cpus;
> };
What is this *mcelog_cpus to be used for?
It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
if I have read the following mce_log_init() correctly.
> @@ -642,6 +669,16 @@ static int mce_cap_init(void)
> return 0;
> }
>
> +/*
> + * Initialize MCE per-CPU log buffer
> + */
> +static __cpuinit void mce_log_init(void)
> +{
> + if (mcelog.mcelog_cpus)
> + return;
> + mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
> +}
> +
> static void mce_init(void)
> {
> mce_banks_t all_banks;
Next,
> +static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
> +{
> + int head, tail;
> + head = mcelog_cpu->head;
> + tail = mcelog_cpu->tail;
> + return head == tail;
> +}
Are there any race condition?
Why we cannot have "return (mcelog_cpu->head == mcelog_cpu->tail);"?
Last,
> +static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> + loff_t *off)
> +{
> + char __user *ubuf = inubuf;
> + struct mce_log_cpu *mcelog_cpu;
> + int cpu, new_mce, err = 0;
> + static DEFINE_MUTEX(mce_read_mutex);
> +
(snip)
> + return err ? : ubuf - inubuf;
> }
It would be better to put "static DEFINE_MUTEX(mce_read_mutex)" to outside of
the function.
And it looks work, but I'd prefer:
return err ? err : ubuf - inubuf;
Overall it looks good.
Unfortunately there were updates on tip/mce3 so I could not apply this
patch on the top of the branch. I'd appreciate it if you could rebase
this patch again.
Thanks,
H.Seto
On Fri, 2009-06-05 at 15:01 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > Re-implement MCE log ring buffer as per-CPU ring buffer for better
> > scalability. Basic design is as follow:
>
> That would be great. This should be in .31, I think.
>
> Some minor points:
>
> > struct mce_log {
> > - char signature[12]; /* "MACHINECHECK" */
> > + char signature[12]; /* "MACHINECHEC2" */
> > unsigned len; /* = MCE_LOG_LEN */
> > - unsigned next;
> > unsigned flags;
> > unsigned pad0;
> > - struct mce entry[MCE_LOG_LEN];
> > + struct mce_log_cpu *mcelog_cpus;
> > };
>
> What is this *mcelog_cpus to be used for?
> It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
> if I have read the following mce_log_init() correctly.
It is mainly used by something like kdump, which can search
"MACHINECHEC2", and analyze mce_log. mcelog_cpus can help kdump find the
real mcelog storage.
> > @@ -642,6 +669,16 @@ static int mce_cap_init(void)
> > return 0;
> > }
> >
> > +/*
> > + * Initialize MCE per-CPU log buffer
> > + */
> > +static __cpuinit void mce_log_init(void)
> > +{
> > + if (mcelog.mcelog_cpus)
> > + return;
> > + mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
> > +}
> > +
> > static void mce_init(void)
> > {
> > mce_banks_t all_banks;
>
>
> Next,
>
> > +static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
> > +{
> > + int head, tail;
> > + head = mcelog_cpu->head;
> > + tail = mcelog_cpu->tail;
> > + return head == tail;
> > +}
>
> Are there any race condition?
> Why we cannot have "return (mcelog_cpu->head == mcelog_cpu->tail);"?
Yes, it is better, I will change this.
> Last,
>
> > +static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> > + loff_t *off)
> > +{
> > + char __user *ubuf = inubuf;
> > + struct mce_log_cpu *mcelog_cpu;
> > + int cpu, new_mce, err = 0;
> > + static DEFINE_MUTEX(mce_read_mutex);
> > +
> (snip)
> > + return err ? : ubuf - inubuf;
> > }
>
> It would be better to put "static DEFINE_MUTEX(mce_read_mutex)" to outside of
> the function.
>
> And it looks work, but I'd prefer:
> return err ? err : ubuf - inubuf;
OK, I will change this.
> Overall it looks good.
> Unfortunately there were updates on tip/mce3 so I could not apply this
> patch on the top of the branch. I'd appreciate it if you could rebase
> this patch again.
Yes. I will re-base it.
Best Regards,
Huang Ying
Huang Ying wrote:
> On Fri, 2009-06-05 at 15:01 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>> struct mce_log {
>>> - char signature[12]; /* "MACHINECHECK" */
>>> + char signature[12]; /* "MACHINECHEC2" */
>>> unsigned len; /* = MCE_LOG_LEN */
>>> - unsigned next;
>>> unsigned flags;
>>> unsigned pad0;
>>> - struct mce entry[MCE_LOG_LEN];
>>> + struct mce_log_cpu *mcelog_cpus;
>>> };
>> What is this *mcelog_cpus to be used for?
>> It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
>> if I have read the following mce_log_init() correctly.
>
> It is mainly used by something like kdump, which can search
> "MACHINECHEC2", and analyze mce_log. mcelog_cpus can help kdump find the
> real mcelog storage.
Hmm, but it helps tools only to find one of the buffers, not all of them.
I think it would be better help for tools if we have another signature
on struct mce_log_cpu, e.g.:
+struct mce_log_cpu {
+ char signature[**]; /* "MCE_LOG_CPU_VER_1" or so */
+ __u32 cpuid; /* cpuid or extcpu, same as struct mce */
+ int head;
+ int tail;
+ unsigned long flags;
+ struct mce entry[MCE_LOG_LEN];
+};
How about this?
Thanks,
H.Seto
On Fri, 2009-06-05 at 16:28 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-06-05 at 15:01 +0800, Hidetoshi Seto wrote:
> >> Huang Ying wrote:
> >>> struct mce_log {
> >>> - char signature[12]; /* "MACHINECHECK" */
> >>> + char signature[12]; /* "MACHINECHEC2" */
> >>> unsigned len; /* = MCE_LOG_LEN */
> >>> - unsigned next;
> >>> unsigned flags;
> >>> unsigned pad0;
> >>> - struct mce entry[MCE_LOG_LEN];
> >>> + struct mce_log_cpu *mcelog_cpus;
> >>> };
> >> What is this *mcelog_cpus to be used for?
> >> It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
> >> if I have read the following mce_log_init() correctly.
> >
> > It is mainly used by something like kdump, which can search
> > "MACHINECHEC2", and analyze mce_log. mcelog_cpus can help kdump find the
> > real mcelog storage.
>
> Hum, but it help tools only to find one of buffers, not to find all.
>
> I think it would be better help for tools if we have another signature
> on struct mce_log_cpu, e.g.:
>
> +struct mce_log_cpu {
> + char signature[**]; /* "MCE_LOG_CPU_VER_1" or so */
> + __u32 cpuid; /* cpuid or extcpu, same as struct mce */
> + int head;
> + int tail;
> + unsigned long flags;
> + struct mce entry[MCE_LOG_LEN];
> +};
>
> How about this?
+ mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
So mcelog.mcelog_cpus are pointed to the buffers of all CPUs, not just
that of one CPU. You can find them by analyzing PER cpu data structure.
Best Regards,
Huang Ying
Huang Ying wrote:
> On Fri, 2009-06-05 at 16:28 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>> On Fri, 2009-06-05 at 15:01 +0800, Hidetoshi Seto wrote:
>>>> Huang Ying wrote:
>>>>> struct mce_log {
>>>>> - char signature[12]; /* "MACHINECHECK" */
>>>>> + char signature[12]; /* "MACHINECHEC2" */
>>>>> unsigned len; /* = MCE_LOG_LEN */
>>>>> - unsigned next;
>>>>> unsigned flags;
>>>>> unsigned pad0;
>>>>> - struct mce entry[MCE_LOG_LEN];
>>>>> + struct mce_log_cpu *mcelog_cpus;
>>>>> };
>>>> What is this *mcelog_cpus to be used for?
>>>> It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
>>>> if I have read the following mce_log_init() correctly.
>>> It is mainly used by something like kdump, which can search
>>> "MACHINECHEC2", and analyze mce_log. mcelog_cpus can help kdump find the
>>> real mcelog storage.
>> Hum, but it help tools only to find one of buffers, not to find all.
>>
>> I think it would be better help for tools if we have another signature
>> on struct mce_log_cpu, e.g.:
>>
>> +struct mce_log_cpu {
>> + char signature[**]; /* "MCE_LOG_CPU_VER_1" or so */
>> + __u32 cpuid; /* cpuid or extcpu, same as struct mce */
>> + int head;
>> + int tail;
>> + unsigned long flags;
>> + struct mce entry[MCE_LOG_LEN];
>> +};
>>
>> How about this?
>
> + mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
>
> So mcelog.mcelog_cpus are pointed to the buffers of all CPUs, not just
> that of one CPU. You can find them by analyzing PER cpu data structure.
I thought that the signature is used to find the structure without such
analysis. If a tool can analyze per-CPU data, then it likely knows
where the mce_log_cpu is, so mcelog.mcelog_cpus will not be required.
#define per_cpu_var(var) per_cpu__##var
#define per_cpu(var, cpu) \
(*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
OK, I have mistook per_cpu_var() for per_cpu().
Then mcelog.mcelog_cpus does not point to any of the buffers, but tells where
the buffer is located in each CPU's per-cpu data area.
Tools have to get per_cpu_offset(cpu) to know where the PER cpu data
allocated. According to pcpu_alloc_bootmem(), it might consider NUMA.
Anyway It will take a bit more effort.
So still I think the mcelog.mcelog_cpus is mostly pointless.
Thanks,
H.Seto
On Mon, 2009-06-08 at 09:17 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-06-05 at 16:28 +0800, Hidetoshi Seto wrote:
> >> Huang Ying wrote:
> >>> On Fri, 2009-06-05 at 15:01 +0800, Hidetoshi Seto wrote:
> >>>> Huang Ying wrote:
> >>>>> struct mce_log {
> >>>>> - char signature[12]; /* "MACHINECHECK" */
> >>>>> + char signature[12]; /* "MACHINECHEC2" */
> >>>>> unsigned len; /* = MCE_LOG_LEN */
> >>>>> - unsigned next;
> >>>>> unsigned flags;
> >>>>> unsigned pad0;
> >>>>> - struct mce entry[MCE_LOG_LEN];
> >>>>> + struct mce_log_cpu *mcelog_cpus;
> >>>>> };
> >>>> What is this *mcelog_cpus to be used for?
> >>>> It seems it will point one of per-CPU buffers (maybe cpu#0's buffer)
> >>>> if I have read the following mce_log_init() correctly.
> >>> It is mainly used by something like kdump, which can search
> >>> "MACHINECHEC2", and analyze mce_log. mcelog_cpus can help kdump find the
> >>> real mcelog storage.
> >> Hum, but it help tools only to find one of buffers, not to find all.
> >>
> >> I think it would be better help for tools if we have another signature
> >> on struct mce_log_cpu, e.g.:
> >>
> >> +struct mce_log_cpu {
> >> + char signature[**]; /* "MCE_LOG_CPU_VER_1" or so */
> >> + __u32 cpuid; /* cpuid or extcpu, same as struct mce */
> >> + int head;
> >> + int tail;
> >> + unsigned long flags;
> >> + struct mce entry[MCE_LOG_LEN];
> >> +};
> >>
> >> How about this?
> >
> > + mcelog.mcelog_cpus = &per_cpu_var(mce_log_cpus);
> >
> > So mcelog.mcelog_cpus are pointed to the buffers of all CPUs, not just
> > that of one CPU. You can find them by analyzing PER cpu data structure.
>
> I thought that the signature is used to find the structure without such
> analyzing. If a tool can analyze PER cpu data, then it likely know
> where the mce_log_cpu is, so mcelog.mcelog_cpus will not be required.
>
> #define per_cpu_var(var) per_cpu__##var
>
> #define per_cpu(var, cpu) \
> (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
>
> OK, I have mistook per_cpu_var() for per_cpu().
> Then mcelog.mcelog_cpus does not point any of the buffers, but tells where
> the buffer is locating in each of PER cpu data.
> Tools have to get per_cpu_offset(cpu) to know where the PER cpu data
> allocated. According to pcpu_alloc_bootmem(), it might consider NUMA.
> Anyway It will take a bit more effort.
>
> So still I think the mcelog.mcelog_cpus is mostly pointless.
Emm, it seems hard to analyze mce_log_cpu from per_cpu_var(mce_log_cpus);
how about the following method:
struct mce_log {
char signature[12];
...
struct mce_log_cpu *mcelog_cpus[];
};
void mcelog_init(void)
{
int cpu;
mcelog.mcelog_cpus = kmalloc(sizeof(void *) * num_possible_cpus());
for_each_possible_cpu(cpu)
mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
}
Best Regards,
Huang Ying
Huang Ying wrote:
> Emm, it seem hard to analyze mce_log_cpu from per_cpu_var(mce_log_cpus),
> how about following method:
>
> struct mce_log {
> char signature[12];
> ...
> struct mce_log_cpu *mcelog_cpus[];
> };
>
> void mcelog_init(void)
> {
> int cpu;
> mcelog.mcelog_cpus = kmalloc(sizeof(void *) * num_possible_cpus();
> for_each_possible_cpu(cpu)
> mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> }
It would be good. Using kzalloc() will be better.
It might be a bit much, but how about adding:
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
Be careful that mcheck_init(), caller of mcelog_init(), will be called
for each booted CPU. You will need:
+ if (mcelog.mcelog_cpus)
+ return;
Thanks,
H.Seto