The current MCE log ring buffer has the following bugs and issues:
- On larger systems the 32-entry buffer easily overflows, losing events.
- We had some reports of events getting corrupted which were also
blamed on the ring buffer.
- There's a known livelock, now hit by more people, under high error
rates.
We fix these bugs and issues by turning the MCE log ring buffer into a
lock-less per-CPU ring buffer.
- One ring buffer for each CPU
+ MCEs are added to the corresponding local per-CPU buffer, instead of
one big global buffer. Contention/unfairness between CPUs is
eliminated.
+ MCE records are read out of and removed from the per-CPU buffers by a
mutex-protected global reader function, because in most cases there
are not many readers in the system to contend.
- Per-CPU ring buffer data structure
+ An array is used to hold MCE records. An integer "head" indicates
the next writing position and an integer "tail" indicates the next
reading position.
+ To distinguish buffer empty from full, head and tail wrap to 0 at
MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
position is head % MCE_LOG_LEN, and the real next reading position is
tail % MCE_LOG_LEN. If the buffer is empty, head == tail; if the
buffer is full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head !=
tail.
- Lock-less for writer side
+ The MCE log writer may run in NMI context, so the writer side must be
lock-less. For the per-CPU buffer of one CPU, writers may come from
process, IRQ or NMI context, so "head" is advanced with
cmpxchg_local() to allocate buffer space.
+ The reader side is protected with a mutex to guarantee that only one
reader is active in the whole system (see the sketch after this
list).
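To make the slot reservation and the wrap-at-2N convention concrete,
here is a minimal user-space sketch (an illustration only, not the
kernel code; ring_write() and log_index() are hypothetical names, GCC's
__sync_val_compare_and_swap() stands in for cmpxchg_local(), and the
memory barriers are omitted for brevity):

	#include <assert.h>
	#include <string.h>

	#define MCE_LOG_LEN	32
	#define MCE_LOG_LIMIT	(MCE_LOG_LEN * 2 - 1)

	struct record { int finished; int payload; };

	struct ring {
		int head;	/* next write slot, counts 0..MCE_LOG_LIMIT */
		int tail;	/* next read slot, counts 0..MCE_LOG_LIMIT */
		struct record entry[MCE_LOG_LEN];
	};

	static int log_index(int n)
	{
		return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
	}

	/* Writer: reserve a slot with a compare-and-swap loop, then fill it. */
	static int ring_write(struct ring *r, int payload)
	{
		int head, tail, next;

		do {
			head = r->head;
			tail = r->tail;
			/* Full: same slot index, but different counters. */
			if (log_index(head) == log_index(tail) && head != tail)
				return -1;	/* overflow: discard the new entry */
			next = head == MCE_LOG_LIMIT ? 0 : head + 1;
		} while (__sync_val_compare_and_swap(&r->head, head, next) != head);

		r->entry[log_index(head)].payload = payload;
		r->entry[log_index(head)].finished = 1;	/* publish after filling */
		return 0;
	}

	int main(void)
	{
		struct ring r;
		int i;

		memset(&r, 0, sizeof(r));
		for (i = 0; i < MCE_LOG_LEN; i++)
			assert(ring_write(&r, i) == 0);
		assert(ring_write(&r, 99) < 0);	/* the 33rd write overflows */
		return 0;
	}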
Performance tests show that the throughput of the per-CPU mcelog buffer
reaches 430k records/s, compared with 5.3k records/s for the original
implementation, on a 2-core 2.1GHz Core2 machine (roughly an 80x
improvement).
ChangeLog:
v7:
- Rebased on latest x86-tip.git/x86/mce
- Revise mce print code.
v6:
- Rebased on latest x86-tip.git/x86/mce + x86/tip
v5:
- Rebased on latest x86-tip.git/mce4.
- Fix parsing mce_log data structure in dumping kernel.
v4:
- Rebased on x86-tip.git x86/mce3 branch
- Fix a synchronization issue about mce.finished
- Fix comment style
v3:
- Use DEFINE_PER_CPU to allocate per_cpu mcelog buffer.
- Use cond_resched() to prevent a possible system-unresponsive issue
with large user buffers.
v2:
- Use alloc_percpu() to allocate per_cpu mcelog buffer.
- Use ndelay() to implement the writer timeout.
Signed-off-by: Huang Ying <[email protected]>
CC: Andi Kleen <[email protected]>
---
arch/x86/include/asm/mce.h | 18 +-
arch/x86/kernel/cpu/mcheck/mce.c | 295 +++++++++++++++++++++++----------------
2 files changed, 190 insertions(+), 123 deletions(-)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -76,20 +76,28 @@ struct mce {
* is set.
*/
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+#define MCE_LOG_LIMIT (MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+ return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
+
+struct mce_log_cpu;
struct mce_log {
- char signature[12]; /* "MACHINECHECK" */
+ char signature[12]; /* "MACHINECHEC2" */
unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
unsigned flags;
unsigned recordlen; /* length of struct mce */
- struct mce entry[MCE_LOG_LEN];
+ unsigned nr_mcelog_cpus;
+ struct mce_log_cpu **mcelog_cpus;
};
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
-#define MCE_LOG_SIGNATURE "MACHINECHECK"
+#define MCE_LOG_SIGNATURE "MACHINECHEC2"
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,6 +52,9 @@ int mce_disabled __read_mostly;
#define SPINUNIT 100 /* 100ns */
+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_NS NSEC_PER_MSEC
+
atomic_t mce_entry;
DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -125,42 +128,50 @@ static struct mce_log mcelog = {
.recordlen = sizeof(struct mce),
};
+struct mce_log_cpu {
+ int head;
+ int tail;
+ unsigned long flags;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
void mce_log(struct mce *mce)
{
- unsigned next, entry;
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
+ int head, ihead, tail, next;
mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
+ /*
+ * mce->finished must be set to 0 before written to ring
+ * buffer
+ */
+ smp_wmb();
+ do {
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+ ihead = mce_log_index(head);
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting.
+ */
+ if (ihead == mce_log_index(tail) && head != tail) {
+ set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
+ return;
}
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
+ next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+ } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
+ memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
+ /*
+ * ".finished" of MCE record in ring buffer must be set after
+ * copy
+ */
+ smp_wmb();
+ mcelog_cpu->entry[ihead].finished = 1;
+ /* bit 0 of notify_user should be set after finished be set */
+ smp_wmb();
mce->finished = 1;
set_bit(0, &mce_need_notify);
}
@@ -200,6 +211,26 @@ static void print_mce_tail(void)
KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+ int i;
+ struct mce_log_cpu *mcelog_cpu;
+
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog_cpu->entry[i];
+ if (!m->finished)
+ continue;
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if ((m->status & mask) != res)
+ continue;
+ if (final && !memcmp(m, final, sizeof(struct mce)))
+ continue;
+ print_mce(m);
+ }
+}
+
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -222,7 +253,7 @@ static void wait_for_panic(void)
static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int cpu;
if (!fake_panic) {
/*
@@ -241,23 +272,12 @@ static void mce_panic(char *msg, struct
}
print_mce_head();
/* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- print_mce(m);
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
- print_mce(m);
- }
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+ /* Print uncorrected but without the final one */
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+ /* Finally print the final mce */
if (final)
print_mce(final);
if (cpu_missing)
@@ -1207,6 +1227,22 @@ static int __cpuinit mce_cap_init(void)
return 0;
}
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+ int cpu;
+
+ if (mcelog.mcelog_cpus)
+ return;
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
+ mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+ GFP_KERNEL);
+ for_each_possible_cpu(cpu)
+ mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -1367,6 +1403,7 @@ void __cpuinit mcheck_init(struct cpuinf
return;
}
mce_cpu_quirks(c);
+ mce_log_init();
machine_check_vector = do_machine_check;
@@ -1415,94 +1452,116 @@ static int mce_release(struct inode *ino
return 0;
}
-static void collect_tscs(void *data)
+static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
+ char __user *inubuf, size_t usize)
{
- unsigned long *cpu_tsc = (unsigned long *)data;
+ char __user *ubuf = inubuf;
+ int head, tail, pos, i, err = 0;
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
-static DEFINE_MUTEX(mce_read_mutex);
+ if (head == tail)
+ return 0;
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
+ for (pos = tail; pos != head && usize >= sizeof(struct mce);
+ pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+ i = mce_log_index(pos);
+ if (!mcelog_cpu->entry[i].finished) {
+ int timeout = WRITER_TIMEOUT_NS;
+ while (!mcelog_cpu->entry[i].finished) {
+ if (timeout-- <= 0) {
+ memset(mcelog_cpu->entry + i, 0,
+ sizeof(struct mce));
+ head = mcelog_cpu->head;
+ printk(KERN_WARNING "mcelog: timeout "
+ "waiting for writer to finish!\n");
+ goto timeout;
+ }
+ ndelay(1);
+ }
+ }
+ /*
+ * finished field should be checked before
+ * copy_to_user()
+ */
+ smp_rmb();
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+ sizeof(struct mce));
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ mcelog_cpu->entry[i].finished = 0;
+timeout:
+ ;
+ }
+ /*
+ * mcelog_cpu->tail must be updated after ".finished" of
+ * corresponding MCE records are clear.
+ */
+ smp_wmb();
+ mcelog_cpu->tail = pos;
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
+ return err ? -EFAULT : ubuf - inubuf;
+}
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
+static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
+{
+ return mcelog_cpu->head == mcelog_cpu->tail;
+}
- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
+static int mce_empty(void)
+{
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
- return -EINVAL;
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ if (!mce_empty_cpu(mcelog_cpu))
+ return 0;
}
+ return 1;
+}
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *ubuf = inubuf;
+ struct mce_log_cpu *mcelog_cpu;
+ int cpu, new_mce, err = 0;
+ static DEFINE_MUTEX(mce_read_mutex);
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
+ mutex_lock(&mce_read_mutex);
+ do {
+ new_mce = 0;
+ for_each_possible_cpu(cpu) {
+ if (usize < sizeof(struct mce))
+ goto out;
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ err = mce_read_cpu(mcelog_cpu, ubuf,
+ sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ new_mce = 1;
+ err = 0;
+ } else if (err < 0)
+ goto out;
}
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ if (need_resched()) {
+ mutex_unlock(&mce_read_mutex);
+ cond_resched();
+ mutex_lock(&mce_read_mutex);
}
- }
+ } while (new_mce || !mce_empty());
+out:
mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return err ? -EFAULT : buf - ubuf;
+ return err ? err : ubuf - inubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (!mce_empty())
return POLLIN | POLLRDNORM;
return 0;
}
* Huang Ying <[email protected]> wrote:
> The current MCE log ring buffer has the following bugs and issues:
>
> - On larger systems the 32-entry buffer easily overflows, losing events.
>
> - We had some reports of events getting corrupted which were also
> blamed on the ring buffer.
>
> - There's a known livelock, now hit by more people, under high error
> rates.
>
> We fix these bugs and issues by turning the MCE log ring buffer into
> a lock-less per-CPU ring buffer.
I like the direction of this (the current MCE ring-buffer code is a bad
local hack that should never have been merged upstream in that form) -
but i'd like to see a MUCH more ambitious (and much more useful!)
approach instead of using an explicit ring-buffer.
Please define MCE generic tracepoints using TRACE_EVENT() and use
perfcounters to access them.
This approach solves all the problems you listed and it also adds a
large number of new features to MCE events:
- Multiple user-space agents can access MCE events. You can have an
mcelog daemon running but also a system-wide tracer capturing
important events in flight-recorder mode.
- Sampling support: the kernel and the user-space call-chain of MCE
events can be stored and analyzed as well. This way actual patterns
of bad behavior can be matched to precisely what kind of activity
happened in the kernel (and/or in the app) around that moment in
time.
- Coupling with other hardware and software events: the PMU can track a
number of other anomalies - monitoring software might choose to
monitor those plus the MCE events as well - in one coherent stream of
events.
- Discovery of MCE sources - tracepoints are enumerated and tools can
act upon the existence (or non-existence) of various channels of MCE
information.
- Filtering support: you just subscribe to and act upon the events you
are interested in. Then even on a per event source basis there's
in-kernel filter expressions available that can restrict the amount
of data that hits the event channel.
- Arbitrary deep per cpu buffering of events - you can buffer 32
entries or you can buffer as much as you want, as long as you have
the RAM.
- An NMI-safe ring-buffer implementation - mappable to user-space.
- Built-in support for timestamping of events, PID markers, CPU
markers, etc.
- A rich ABI accessible over system call interface. Per cpu, per task
and per workload monitoring of MCE events can be done this way. The
ABI itself has a nice, meaningful structure.
- Extensible ABI: new fields can be added without breaking tooling.
New tracepoints can be added as the hardware side evolves. There's
various parsers that can be used.
- Lots of scheduling/buffering/batching modes of operation for MCE
events. poll() support. mmap() support. read() support. You name it.
- Rich tooling support: even without any MCE specific extensions added
the 'perf' tool today offers various views of MCE data: perf report,
perf stat, perf trace can all be used to view logged MCE events and
perhaps correlate them to certain user-space usage patterns. But it
can be used directly as well, for user-space agents and policy action
in mcelog, etc.
- Significant code reduction and cleanup in the MCE code: the whole
mcelog facility can be dropped in essence.
- (these are the top of the list - there are more advantages as well.)
Such a design would basically propel the MCE code into the twenty-first
century. Once we have these facilities we can phase out /dev/mcelog for
good. It would turn Linux MCE events from a quirky hack that doesn't even
work after years of hacking into a modern, extensible event logging
facility that uses event sources and flexible transports to user-space.
It would actually be code that is not a problem child like today but one
that we can take pride in and which is fun to work on :-)
Now, an approach like this shouldn't just be a blind export of mce_log()
into a single artificial generic event [which is a pretty poor API to
begin with] - it should be the definition of meaningful
tracepoints/events that describe the hardware's structure.
I'd rather have a good enumeration of various sources of MCEs as
separate tracepoints than some badly jumbled mess of all MCE sources in
one inflexible ABI as /dev/mcelog does it today.
Note, if you need any perfcounter infrastructure extensions/help for
this then we'll be glad to provide that. I'm sure there's a few things
to enhance and a few things to fix - there always are with any
non-trivial new user :-) But heck would i take _those_ forward looking
problems over any of the current MCE design mess, any day of the week.
Thanks,
Ingo
Hi, Ingo,
I think it is an interesting idea to replace the original MCE logging
implementation (MCE log ring buffer + /dev/mcelog misc device) with
the tracing infrastructure. The tracing infrastructure definitely has
more features than the original method, but it is also more complex.
The tracing infrastructure is good at tracing. But I think it may be too
complex to be used as a (hardware) error processing mechanism. I think
error processing should be simple and relatively independent of the
rest of the kernel, to avoid triggering more (hardware) errors during
error processing. What do you think about this?
Even if we reach consensus at some point to re-write the MCE logging
part totally, we still need to fix the bugs in the original
implementation before the new implementation is ready. Do you agree?
Best Regards,
Huang Ying
* Huang Ying <[email protected]> wrote:
> Hi, Ingo,
>
> I think it is an interesting idea to replace the original MCE logging
> implementation (MCE log ring buffer + /dev/mcelog misc device)
> with the tracing infrastructure. The tracing infrastructure definitely
> has more features than the original method, but it is also more
> complex.
TRACE_EVENT() can be used to define the tracepoint itself - but most of
the benefit comes from using the performance events subsystem for event
reporting. It's a perfect match for MCE events - see below.
> The tracing infrastructure is good at tracing. But I think it may be
> too complex to be used as a (hardware) error processing mechanism. I
> think error processing should be simple and relatively independent
> of the rest of the kernel, to avoid triggering more (hardware)
> errors during error processing. What do you think about this?
The fact that event reporting is used for other things as well might
even make it more fault-resilient than open-coded MCE code: just due to
it possibly being hot in the cache already (and thus not needing extra
DRAM cycles to a possibly faulty piece of hardware) while the MCE code,
not used by anything else, has to be brought into the cache for error
processing - triggering more faults.
But ... that argument is even largely irrelevant: the overriding core
kernel principle was always that of reuse of facilities. We want to keep
performance event reporting simple and robust too, so re-using
facilities is a good thing and the needs of MCE logging will improve
perf events - and vice versa.
Furthermore, to me as x86 maintainer it matters a lot that a piece of
code i maintain is clean, well-designed and maintainable in the long
run. The current /dev/mcelog code is not clean and not maintainable,
and that needs to be fixed before it can be extended.
That MCE logging also becomes more robust and more feature-rich as a
result is a happy side-effect.
> Even if we reach consensus at some point to re-write the MCE logging
> part totally, we still need to fix the bugs in the original
> implementation before the new implementation is ready. Do you agree?
Well, this is all 2.6.33 stuff and that's plenty of time to do the
generic events + perf events approach.
Anyway, i think we can work on these MCE enhancements much better if we
move it to the level of actual patches. Let me give you a (very small)
demonstration of my requests.
The patch attached below adds two new events to the MCE code, to the
thermal throttling code:
mce_thermal_throttle_hot
mce_thermal_throttle_cold
If you apply the patch below to latest -tip and boot that kernel and do:
cd tools/perf/
make -j install
And type 'perf list', those two new MCE events will show up in the list
of available events:
# perf list
[...]
mce:mce_thermal_throttle_hot [Tracepoint event]
mce:mce_thermal_throttle_cold [Tracepoint event]
[...]
Initially you can check whether those events are occurring:
# perf stat -a -e mce:mce_thermal_throttle_hot sleep 1
Performance counter stats for 'sleep 1':
0 mce:mce_thermal_throttle_hot # 0.000 M/sec
1.000672214 seconds time elapsed
None on a normal system. Now, i tried it on a system that gets frequent
thermal events, and there it gives:
# perf stat -a -e mce:mce_thermal_throttle_hot sleep 1
Performance counter stats for 'sleep 1':
73 mce:mce_thermal_throttle_hot # 0.000 M/sec
1.000765260 seconds time elapsed
Now i can observe the frequency of those thermal throttling events:
# perf stat --repeat 10 -a -e mce:mce_thermal_throttle_hot sleep 1
Performance counter stats for 'sleep 1' (10 runs):
184 mce:mce_thermal_throttle_hot # 0.000 M/sec ( +- 1.971% )
1.006488821 seconds time elapsed ( +- 0.203% )
184/sec is pretty hefty, with a relatively stable rate.
Now i try to see exactly which code is affected by those thermal
throttling events. For that i use 'perf record', and profile the whole
system for 10 seconds:
# perf record -f -e mce:mce_thermal_throttle_hot -c 1 -a sleep 10
[ perf record: Woken up 4 times to write data ]
[ perf record: Captured and wrote 0.802 MB perf.data (~35026 samples) ]
And i used 'perf report' to analyze where the MCE events occurred:
# Samples: 2495
#
# Overhead Command Shared Object Symbol
# ........ .............. ........................ ......
#
55.39% hackbench [vdso] [.] 0x000000f77e1430
35.27% hackbench f77c2430 [.] 0x000000f77c2430
3.05% hackbench [kernel] [k] ia32_setup_sigcontext
0.88% bash [vdso] [.] 0x000000f776f430
0.64% bash f7731430 [.] 0x000000f7731430
0.52% hackbench ./hackbench [.] sender
0.48% nice f7753430 [.] 0x000000f7753430
0.36% hackbench ./hackbench [.] receiver
0.24% perf /lib64/libc-2.5.so [.] __GI_read
0.24% hackbench /lib/libc-2.5.so [.] __libc_read
Most of the events concentrate around the syscall entry
points. That's not because the system is doing that most of the time -
it's probably because the fast syscall entry point is a critical path of
the CPU and overheating events are likely to concentrate around
that place.
Now let's see how we can do very detailed MCE logs, with per-CPU data,
timestamps and other events mixed in. For that i use perf record again:
# perf record -R -f -e mce:mce_thermal_throttle_hot:r -e mce:mce_thermal_throttle_cold:r -e irq:irq_handler_entry:r -M -c 1 -a sleep 10
[ perf record: Woken up 12 times to write data ]
[ perf record: Captured and wrote 1.167 MB perf.data (~50990 samples) ]
And i use 'perf trace' to look at the result:
hackbench-5070 [000] 1542.044184301: mce_thermal_throttle_hot: throttle_count=150852
hackbench-5070 [000] 1542.044672817: mce_thermal_throttle_cold: throttle_count=150852
hackbench-5056 [000] 1542.047029212: mce_thermal_throttle_hot: throttle_count=150853
hackbench-5057 [000] 1542.047518733: mce_thermal_throttle_cold: throttle_count=150853
hackbench-5058 [000] 1542.049822672: mce_thermal_throttle_hot: throttle_count=150854
hackbench-5046 [000] 1542.050312612: mce_thermal_throttle_cold: throttle_count=150854
hackbench-5072 [000] 1542.052720357: mce_thermal_throttle_hot: throttle_count=150855
hackbench-5072 [000] 1542.053210612: mce_thermal_throttle_cold: throttle_count=150855
:5074-5074 [000] 1542.055689468: mce_thermal_throttle_hot: throttle_count=150856
hackbench-5011 [000] 1542.056179849: mce_thermal_throttle_cold: throttle_count=150856
:5061-5061 [000] 1542.056937152: mce_thermal_throttle_hot: throttle_count=150857
hackbench-5048 [000] 1542.057428970: mce_thermal_throttle_cold: throttle_count=150857
hackbench-5053 [000] 1542.060372255: mce_thermal_throttle_hot: throttle_count=150858
hackbench-5054 [000] 1542.060863006: mce_thermal_throttle_cold: throttle_count=150858
The first thing we can see from the trace is that only the first core
reports thermal events (the box is a dual-core box). The other thing is
the timing: the CPU is in 'hot' state for about ~490 microseconds, then
in 'cold' state for ~250 microseconds.
So the CPU spends about 65% of its time (~490 of every ~740
microseconds) in the hot-throttled state, to cool down enough to switch
back to cold mode again.
Also, most of the overheating events hit the 'hackbench' app. (which is
no surprise on this box as it is only running that workload - but might
be interesting on other systems with more mixed workloads.)
So even in its completely MCE-unaware form, and with this small patch
that took 5 minutes to write, the performance events subsystem and
tooling can already give far greater insight into characteristics of
these events than the MCE subsystem did before.
Obviously the reported events can be used for different, MCE-specific
things as well:
- Report an alert on the desktop (without having to scan the syslog all
the time)
- Mail the sysadmin
- Do some specific policy action such as soft-throttle the workload
until the situation and the log data can be examined.
As i outlined it in my previous mail, the perf events subsystem provides
a wide range of facilities that can be used for that purpose.
If there are aspects of MCE reporting usecases that necessitate the
extension of perf events, then we can do that - both sides will benefit
from it. What we don't want to do is to prolong the pain caused by the
inferior, buggy and poorly designed /dev/mcelog ABI and code. It's not
serving our users well and people are spending time on fixing something
that has been misdesigned from the get-go instead of concentrating on
improving the right facilities.
So this is really how i see the future of the MCE logging subsystem: use
meaningful generic events that empower users, admins and tools to do
something intelligent with the available stream of MCE data. The
hardware tells us a _lot_ of information - let's use and expose that in a
structured form.
Ingo
---
arch/x86/kernel/cpu/mcheck/therm_throt.c | 9 +++++
include/trace/events/mce.h | 47 +++++++++++++++++++++++++++++++
2 files changed, 55 insertions(+), 1 deletion(-)
Index: linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -31,6 +31,9 @@
#include <asm/mce.h>
#include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
@@ -118,8 +121,12 @@ static int therm_throt_process(bool is_t
was_throttled = state->is_throttled;
state->is_throttled = is_throttled;
- if (is_throttled)
+ if (is_throttled) {
state->throttle_count++;
+ trace_mce_thermal_throttle_hot(state->throttle_count);
+ } else {
+ trace_mce_thermal_throttle_cold(state->throttle_count);
+ }
if (time_before64(now, state->next_check) &&
state->throttle_count != state->last_throttle_count)
Index: linux/include/trace/events/mce.h
===================================================================
--- /dev/null
+++ linux/include/trace/events/mce.h
@@ -0,0 +1,47 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mce
+
+#if !defined(_TRACE_MCE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MCE_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mce_thermal_throttle_hot,
+
+ TP_PROTO(unsigned int throttle_count),
+
+ TP_ARGS(throttle_count),
+
+ TP_STRUCT__entry(
+ __field( u64, throttle_count )
+ ),
+
+ TP_fast_assign(
+ __entry->throttle_count = throttle_count;
+ ),
+
+ TP_printk("throttle_count=%Lu", __entry->throttle_count)
+);
+
+TRACE_EVENT(mce_thermal_throttle_cold,
+
+ TP_PROTO(unsigned int throttle_count),
+
+ TP_ARGS(throttle_count),
+
+ TP_STRUCT__entry(
+ __field( u64, throttle_count )
+ ),
+
+ TP_fast_assign(
+ __entry->throttle_count = throttle_count;
+ ),
+
+ TP_printk("throttle_count=%Lu", __entry->throttle_count)
+);
+
+#endif /* _TRACE_MCE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
Hi Huang,
Huang Ying wrote:
> The current MCE log ring buffer has the following bugs and issues:
>
> - On larger systems the 32-entry buffer easily overflows, losing events.
>
> - We had some reports of events getting corrupted which were also
> blamed on the ring buffer.
>
> - There's a known livelock, now hit by more people, under high error
> rates.
>
> We fix these bugs and issues by turning the MCE log ring buffer into
> a lock-less per-CPU ring buffer.
Now I have a real problem with the small MCE log buffer on my new large
system with Nehalem, which has many cpus/banks in one socket...
So I'd like to solve the problem asap. I think this problem might block
some distros from supporting the new processor.
Last week I reviewed your patch again and noticed that it is doing a lot
of changes at once. I suppose that this must be one of the reasons why
your patch seems to be so hard to review, and why it is taking a long
time to be accepted by the x86 maintainers.
Fortunately I had some spare time, so I carefully broke your patch into
purpose-designed pieces. The most significant change would be that
there are now 2 steps to convert the buffer structure - 1) make it
per-CPU and 2) make it a ring buffer.
Also I fixed some problems in your patch, found on the way to making
this patch set. I'll explain my changes later using a diff from your
change. Comments are welcome.
Thanks,
H.Seto
Hidetoshi Seto (10):
x86, mce: remove tsc handling from mce_read
x86, mce: mce_read can check args without mutex
x86, mce: change writer timeout in mce_read
x86, mce: use do-while in mce_log
x86, mce: make mce_log buffer to per-CPU, prep
x86, mce: make mce_log buffer to per-CPU
x86, mce: remove for-loop in mce_log
x86, mce: change barriers in mce_log
x86, mce: make mce_log buffer to ring buffer
x86, mce: move mce_log_init() into mce_cap_init()
arch/x86/include/asm/mce.h | 43 ++++--
arch/x86/kernel/cpu/mcheck/mce.c | 299 +++++++++++++++++++++++---------------
2 files changed, 211 insertions(+), 131 deletions(-)
This handling (which looks too buggy) is no longer required.
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 30 ------------------------------
1 files changed, 0 insertions(+), 30 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a6d5d4a..28e1e14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1449,34 +1449,21 @@ static int mce_release(struct inode *inode, struct file *file)
return 0;
}
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
static DEFINE_MUTEX(mce_read_mutex);
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
loff_t *off)
{
char __user *buf = ubuf;
- unsigned long *cpu_tsc;
unsigned prev, next;
int i, err;
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
mutex_lock(&mce_read_mutex);
next = rcu_dereference(mcelog.next);
/* Only supports full reads right now */
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
return -EINVAL;
}
@@ -1511,24 +1498,7 @@ timeout:
synchronize_sched();
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
- }
mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
return err ? -EFAULT : buf - ubuf;
}
--
1.6.4.3
Move the argument check to before mutex_lock().
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++-------
1 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 28e1e14..2d0f9d8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1458,15 +1458,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
unsigned prev, next;
int i, err;
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
-
/* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
-
+ if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
return -EINVAL;
- }
+
+ mutex_lock(&mce_read_mutex);
+ next = rcu_dereference(mcelog.next);
err = 0;
prev = 0;
--
1.6.4.3
Change the writer-timeout wait to use ndelay().
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++++++---
1 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2d0f9d8..9e8eabd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,6 +52,9 @@ int mce_disabled __read_mostly;
#define SPINUNIT 100 /* 100ns */
+/* Timeout log reader to wait writer to finish */
+#define WRITER_TIMEOUT_NS NSEC_PER_MSEC
+
atomic_t mce_entry;
DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -1469,15 +1472,17 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
prev = 0;
do {
for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
+ int timeout = WRITER_TIMEOUT_NS;
while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
+ if (timeout-- <= 0) {
memset(mcelog.entry + i, 0,
sizeof(struct mce));
+ printk(KERN_WARNING "mcelog: timeout "
+ "waiting for writer to finish!\n");
goto timeout;
}
- cpu_relax();
+ ndelay(1);
}
smp_rmb();
err |= copy_to_user(buf, mcelog.entry + i,
--
1.6.4.3
No changes in logic.
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9e8eabd..5f88ada 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -134,7 +134,8 @@ void mce_log(struct mce *mce)
mce->finished = 0;
wmb();
- for (;;) {
+
+ do {
entry = rcu_dereference(mcelog.next);
for (;;) {
/*
@@ -156,14 +157,13 @@ void mce_log(struct mce *mce)
}
smp_rmb();
next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
+ } while (cmpxchg(&mcelog.next, entry, next) != entry);
+
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+
wmb();
mcelog.entry[entry].finished = 1;
wmb();
-
mce->finished = 1;
set_bit(0, &mce_need_notify);
}
--
1.6.4.3
Move the core part of mce_read() into mce_read_buf(). Add mce_empty().
These changes are required to improve the readability of the next patch.
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 48 ++++++++++++++++++++++++++-----------
1 files changed, 34 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5f88ada..684b42e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1452,20 +1452,12 @@ static int mce_release(struct inode *inode, struct file *file)
return 0;
}
-static DEFINE_MUTEX(mce_read_mutex);
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
{
- char __user *buf = ubuf;
+ char __user *ubuf = inubuf;
unsigned prev, next;
int i, err;
- /* Only supports full reads right now */
- if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
- return -EINVAL;
-
- mutex_lock(&mce_read_mutex);
next = rcu_dereference(mcelog.next);
err = 0;
@@ -1485,9 +1477,9 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
ndelay(1);
}
smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
+ err |= copy_to_user(ubuf, mcelog.entry + i,
sizeof(struct mce));
- buf += sizeof(struct mce);
+ ubuf += sizeof(struct mce);
timeout:
;
}
@@ -1500,15 +1492,43 @@ timeout:
synchronize_sched();
+ return err ? -EFAULT : ubuf - inubuf;
+}
+
+static int mce_empty(void)
+{
+ return !rcu_dereference(mcelog.next);
+}
+
+static DEFINE_MUTEX(mce_read_mutex);
+
+static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *ubuf = inubuf;
+ int err;
+
+ /* Only supports full reads right now */
+ if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
+ return -EINVAL;
+
+ mutex_lock(&mce_read_mutex);
+
+ err = mce_read_buf(ubuf, usize);
+ if (err > 0) {
+ ubuf += err;
+ err = 0;
+ }
+
mutex_unlock(&mce_read_mutex);
- return err ? -EFAULT : buf - ubuf;
+ return err ? err : ubuf - inubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (!mce_empty())
return POLLIN | POLLRDNORM;
return 0;
}
--
1.6.4.3
On larger systems the global 32-entry buffer for mcelog easily
overflows, losing events. And there's a known livelock, now hit by more
people, under high error rates.
This patch fixes these issues by making the MCE log buffer per-CPU:
+ MCEs are added to the corresponding local per-CPU buffer, instead of
one big global buffer. Contention/unfairness between CPUs is
eliminated.
Reader/Writer convention is unchanged (= Lock-less for writer side):
+ MCE log writer may come from NMI, so the writer side must be
lock-less. For per-CPU buffer of one CPU, writers may come from
process, IRQ or NMI context, so cmpxchg_local() is used to allocate
buffer space.
+ MCE records are read out of and removed from the per-CPU buffers by
a mutex-protected global reader function, because in most cases
there are not many readers in the system to contend. In other
words, the reader side is protected with a mutex to guarantee only
one reader is active in the whole system (sketched below).
As a result each CPU now has its own local 32-entry buffer.
HS: Add a member header_len to struct mce_log to help debuggers know
where the array of records is.
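As a rough illustration of that reader convention, here is a minimal
user-space sketch (hypothetical code, not the kernel implementation;
struct mce is reduced to a stub, drain() is an invented name, and the
cmpxchg()/barrier handling of the real mce_read_cpu() is only noted in
comments):

	#include <pthread.h>
	#include <stdio.h>

	#define NR_CPUS		2
	#define MCE_LOG_LEN	32

	struct mce { int finished; unsigned long long status; };

	struct mce_log_cpu {
		unsigned next;			/* number of used entries */
		struct mce entry[MCE_LOG_LEN];
	};

	static struct mce_log_cpu log_cpus[NR_CPUS];
	static pthread_mutex_t read_mutex = PTHREAD_MUTEX_INITIALIZER;

	/* Copy out and clear all records; only one reader runs at a time. */
	static void drain(void)
	{
		unsigned i;
		int cpu;

		pthread_mutex_lock(&read_mutex);
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			struct mce_log_cpu *mlc = &log_cpus[cpu];

			for (i = 0; i < mlc->next; i++) {
				if (!mlc->entry[i].finished)
					continue;	/* real code waits for the writer */
				printf("cpu %d entry %u status %llx\n",
				       cpu, i, mlc->entry[i].status);
				mlc->entry[i].finished = 0;
			}
			mlc->next = 0;		/* real code uses cmpxchg() here */
		}
		pthread_mutex_unlock(&read_mutex);
	}

	int main(void)
	{
		log_cpus[0].entry[0] = (struct mce){ 1, 0xb200000000000000ULL };
		log_cpus[0].next = 1;
		drain();
		return 0;
	}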
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/include/asm/mce.h | 37 ++++++----
arch/x86/kernel/cpu/mcheck/mce.c | 139 +++++++++++++++++++++++++++-----------
2 files changed, 120 insertions(+), 56 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2f1c0ef..c5d4144 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -52,7 +52,7 @@
#define MCE_INJ_NMI_BROADCAST (1 << 2) /* do NMI broadcasting */
#define MCE_INJ_EXCEPTION (1 << 3) /* raise as exception */
-/* Fields are zero when not available */
+/* MCE log entry. Fields are zero when not available. */
struct mce {
__u64 status;
__u64 misc;
@@ -63,12 +63,12 @@ struct mce {
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
- __u16 pad;
+ __u16 pad;
__u32 cpuid; /* CPUID 1 EAX */
- __u8 cs; /* code segment */
+ __u8 cs; /* code segment */
__u8 bank; /* machine check bank */
__u8 cpu; /* cpu number; obsolete; use extcpu now */
- __u8 finished; /* entry is valid */
+ __u8 finished; /* 1 if write to entry is finished & entry is valid */
__u32 extcpu; /* linux cpu number that detected the error */
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
@@ -76,26 +76,33 @@ struct mce {
};
/*
- * This structure contains all data related to the MCE log. Also
- * carries a signature to make it easier to find from external
- * debugging tools. Each entry is only valid when its finished flag
- * is set.
+ * This structure contains all data related to the MCE log. Also carries
+ * a signature to make it easier to find from external debugging tools.
+ * Each entry is only valid when its finished flag is set.
*/
-#define MCE_LOG_LEN 32
+#define MCE_LOG_LEN 32
+
+struct mce_log_cpu;
struct mce_log {
- char signature[12]; /* "MACHINECHECK" */
- unsigned len; /* = MCE_LOG_LEN */
- unsigned next;
+ char signature[12]; /* "MACHINECHEC2" */
+
+ /* points the table of per-CPU buffers */
+ struct mce_log_cpu **mcelog_cpus;
+ unsigned int nr_mcelog_cpus; /* = num_possible_cpus() */
+
+ /* spec of per-CPU buffer */
+ unsigned int header_len; /* offset of array "entry" */
+ unsigned int nr_record; /* array size (= MCE_LOG_LEN) */
+ unsigned int record_len; /* length of struct mce */
+
unsigned flags;
- unsigned recordlen; /* length of struct mce */
- struct mce entry[MCE_LOG_LEN];
};
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
-#define MCE_LOG_SIGNATURE "MACHINECHECK"
+#define MCE_LOG_SIGNATURE "MACHINECHEC2"
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
#define MCE_GET_LOG_LEN _IOR('M', 2, int)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 684b42e..ad2eb89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -122,21 +122,30 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
+struct mce_log_cpu {
+ unsigned next;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
+
static struct mce_log mcelog = {
.signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
+ .header_len = offsetof(struct mce_log_cpu, entry),
+ .nr_record = MCE_LOG_LEN,
+ .record_len = sizeof(struct mce),
};
void mce_log(struct mce *mce)
{
+ struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
unsigned next, entry;
mce->finished = 0;
wmb();
do {
- entry = rcu_dereference(mcelog.next);
+ entry = mcelog_cpu->next;
for (;;) {
/*
* When the buffer fills up discard new entries.
@@ -149,7 +158,7 @@ void mce_log(struct mce *mce)
return;
}
/* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
+ if (mcelog_cpu->entry[entry].finished) {
entry++;
continue;
}
@@ -157,12 +166,12 @@ void mce_log(struct mce *mce)
}
smp_rmb();
next = entry + 1;
- } while (cmpxchg(&mcelog.next, entry, next) != entry);
+ } while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
wmb();
- mcelog.entry[entry].finished = 1;
+ mcelog_cpu->entry[entry].finished = 1;
wmb();
mce->finished = 1;
set_bit(0, &mce_need_notify);
@@ -210,6 +219,26 @@ static void print_mce_tail(void)
"Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
+static void print_mce_cpu(int cpu, struct mce *final, u64 mask, u64 res)
+{
+ int i;
+ struct mce_log_cpu *mcelog_cpu;
+
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog_cpu->entry[i];
+ if (!m->finished)
+ continue;
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if ((m->status & mask) != res)
+ continue;
+ if (final && !memcmp(m, final, sizeof(struct mce)))
+ continue;
+ print_mce(m);
+ }
+}
+
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -232,7 +261,7 @@ static void wait_for_panic(void)
static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int cpu;
if (!fake_panic) {
/*
@@ -251,23 +280,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
}
print_mce_head();
/* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- print_mce(m);
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
- print_mce(m);
- }
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, 0);
+ /* Print uncorrected but without the final one */
+ for_each_online_cpu(cpu)
+ print_mce_cpu(cpu, final, MCI_STATUS_UC, MCI_STATUS_UC);
+ /* Finally print the final mce */
if (final)
print_mce(final);
if (cpu_missing)
@@ -1234,6 +1252,22 @@ static int __cpuinit mce_cap_init(void)
return 0;
}
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit void mce_log_init(void)
+{
+ int cpu;
+
+ if (mcelog.mcelog_cpus)
+ return;
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
+ mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+ GFP_KERNEL);
+ for_each_possible_cpu(cpu)
+ mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+}
+
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -1404,6 +1438,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
mce_disabled = 1;
return;
}
+ mce_log_init();
machine_check_vector = do_machine_check;
@@ -1452,13 +1487,16 @@ static int mce_release(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
+static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
{
+ struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
char __user *ubuf = inubuf;
unsigned prev, next;
int i, err;
- next = rcu_dereference(mcelog.next);
+ next = mcelog_cpu->next;
+ if (!next)
+ return 0;
err = 0;
prev = 0;
@@ -1466,9 +1504,9 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
for (i = prev; i < next; i++) {
int timeout = WRITER_TIMEOUT_NS;
- while (!mcelog.entry[i].finished) {
+ while (!mcelog_cpu->entry[i].finished) {
if (timeout-- <= 0) {
- memset(mcelog.entry + i, 0,
+ memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
printk(KERN_WARNING "mcelog: timeout "
"waiting for writer to finish!\n");
@@ -1477,27 +1515,33 @@ static ssize_t mce_read_buf(char __user *inubuf, size_t usize)
ndelay(1);
}
smp_rmb();
- err |= copy_to_user(ubuf, mcelog.entry + i,
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
sizeof(struct mce));
ubuf += sizeof(struct mce);
timeout:
;
}
- memset(mcelog.entry + prev, 0,
+ memset(mcelog_cpu->entry + prev, 0,
(next - prev) * sizeof(struct mce));
prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
+ next = cmpxchg(&mcelog_cpu->next, prev, 0);
} while (next != prev);
- synchronize_sched();
-
return err ? -EFAULT : ubuf - inubuf;
}
static int mce_empty(void)
{
- return !rcu_dereference(mcelog.next);
+ int cpu;
+ struct mce_log_cpu *mcelog_cpu;
+
+ for_each_possible_cpu(cpu) {
+ mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
+ if (mcelog_cpu->next)
+ return 0;
+ }
+ return 1;
}
static DEFINE_MUTEX(mce_read_mutex);
@@ -1506,7 +1550,7 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
loff_t *off)
{
char __user *ubuf = inubuf;
- int err;
+ int cpu, err = 0;
/* Only supports full reads right now */
if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
@@ -1514,12 +1558,25 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
mutex_lock(&mce_read_mutex);
- err = mce_read_buf(ubuf, usize);
- if (err > 0) {
- ubuf += err;
- err = 0;
+ while (!mce_empty()) {
+ for_each_possible_cpu(cpu) {
+ if (usize < MCE_LOG_LEN * sizeof(struct mce))
+ goto out;
+ err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
+ if (err > 0) {
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ err = 0;
+ } else if (err < 0)
+ goto out;
+ }
+ if (need_resched()) {
+ mutex_unlock(&mce_read_mutex);
+ cond_resched();
+ mutex_lock(&mce_read_mutex);
+ }
}
-
+out:
mutex_unlock(&mce_read_mutex);
return err ? err : ubuf - inubuf;
--
1.6.4.3
The new per-CPU buffer allows us to drop this weird code.
Note: This reverts a part of the following old commit:
commit 673242c10d535bfe238d9d8e82ac93432d35b88e
Author: Andi Kleen <[email protected]>
Date: Mon Sep 12 18:49:24 2005 +0200
[PATCH] x86-64: Make lockless machine check record passing a bit more robust.
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 25 ++++++++-----------------
1 files changed, 8 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ad2eb89..87b2e29 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -146,23 +146,14 @@ void mce_log(struct mce *mce)
do {
entry = mcelog_cpu->next;
- for (;;) {
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog_cpu->entry[entry].finished) {
- entry++;
- continue;
- }
- break;
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+ return;
}
smp_rmb();
next = entry + 1;
--
1.6.4.3
A long time ago, smp_wmb() was replaced with wmb() by the following commit:
commit 7644143cd6f7e029f3a8ea64f5fb0ab33ec39f72
Author: Mike Waychison <[email protected]>
Date: Fri Sep 30 00:01:27 2005 +0200
[PATCH] x86_64: Fix mce_log
> AK: turned the smp_wmbs into true wmbs to make sure they are not
> reordered by the compiler on UP.
Change them back to their original form, and add comments.
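For reference, here is a sketch of the publish/consume pairing these
barriers implement, expressed with C11 atomics as a user-space stand-in
(an illustration under that analogy only; publish() and consume() are
invented names, release ordering plays the role of the writer's
smp_wmb() and acquire ordering the role of the reader's smp_rmb()):

	#include <stdatomic.h>

	struct rec {
		int data;
		atomic_int finished;	/* 1 once data is valid */
	};

	/* Writer: make data visible before publishing finished = 1. */
	static void publish(struct rec *r, int v)
	{
		r->data = v;
		atomic_store_explicit(&r->finished, 1, memory_order_release);
	}

	/* Reader: only touch data after seeing finished = 1. */
	static int consume(struct rec *r, int *out)
	{
		if (!atomic_load_explicit(&r->finished, memory_order_acquire))
			return 0;	/* writer not done yet */
		*out = r->data;
		return 1;
	}

	int main(void)
	{
		struct rec r = { 0, 0 };
		int v;

		publish(&r, 42);
		return consume(&r, &v) && v == 42 ? 0 : 1;
	}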
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++++---
1 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 87b2e29..655915b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -141,8 +141,9 @@ void mce_log(struct mce *mce)
struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
unsigned next, entry;
+ /* mce->finished must be set to 0 before written to buffer */
mce->finished = 0;
- wmb();
+ smp_wmb();
do {
entry = mcelog_cpu->next;
@@ -161,9 +162,12 @@ void mce_log(struct mce *mce)
memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
- wmb();
+ /* ".finished" of MCE record in buffer must be set after copy */
+ smp_wmb();
mcelog_cpu->entry[entry].finished = 1;
- wmb();
+
+ /* bit 0 of notify_user should be set after finished be set */
+ smp_wmb();
mce->finished = 1;
set_bit(0, &mce_need_notify);
}
--
1.6.4.3
This patch implements the per-CPU ring buffer data structure:
+ An array is used to hold MCE records. An integer "head" indicates
the next writing position and an integer "tail" indicates the next
reading position.
+ To distinguish buffer empty from full, head and tail wrap to 0 at
MCE_LOG_LIMIT instead of MCE_LOG_LEN. Then the real next writing
position is head % MCE_LOG_LEN, and the real next reading position is
tail % MCE_LOG_LEN. If the buffer is empty, head == tail; if the
buffer is full, head % MCE_LOG_LEN == tail % MCE_LOG_LEN and head !=
tail. (A small worked example follows below.)
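A small worked example of this convention (a hypothetical test harness;
with MCE_LOG_LEN = 32 the head/tail counters take the values 0..63):

	#include <assert.h>

	#define MCE_LOG_LEN	32
	#define MCE_LOG_LIMIT	(MCE_LOG_LEN * 2 - 1)

	static int mce_log_index(int n)
	{
		return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
	}

	static int buf_empty(int head, int tail)
	{
		return head == tail;
	}

	static int buf_full(int head, int tail)
	{
		return mce_log_index(head) == mce_log_index(tail) && head != tail;
	}

	int main(void)
	{
		/* Fresh buffer: empty, not full. */
		assert(buf_empty(0, 0) && !buf_full(0, 0));

		/* 32 writes from empty: head = 32, tail = 0 -> full. */
		assert(buf_full(32, 0));

		/* Wrapped counters: head = 10, tail = 42 is also full. */
		assert(buf_full(10, 42));

		/* head = 40, tail = 35: 5 pending records at array indexes 3..7. */
		assert(!buf_empty(40, 35) && !buf_full(40, 35));
		assert(mce_log_index(35) == 3 && mce_log_index(40) == 8);

		return 0;
	}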
(This piece originates from Huang's patch, titled:
"x86, MCE: Fix bugs and issues of MCE log ring buffer")
Originally-From: Huang Ying <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/include/asm/mce.h | 6 +++
arch/x86/kernel/cpu/mcheck/mce.c | 77 +++++++++++++++++++++----------------
2 files changed, 50 insertions(+), 33 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c5d4144..4b5ef3c 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -82,6 +82,12 @@ struct mce {
*/
#define MCE_LOG_LEN 32
+#define MCE_LOG_LIMIT (MCE_LOG_LEN * 2 - 1)
+
+static inline int mce_log_index(int n)
+{
+ return n >= MCE_LOG_LEN ? n - MCE_LOG_LEN : n;
+}
struct mce_log_cpu;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 655915b..63a7820 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -123,7 +123,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
*/
struct mce_log_cpu {
- unsigned next;
+ int head;
+ int tail;
struct mce entry[MCE_LOG_LEN];
};
@@ -139,32 +140,34 @@ static struct mce_log mcelog = {
void mce_log(struct mce *mce)
{
struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
- unsigned next, entry;
+ int head, ihead, tail, next;
/* mce->finished must be set to 0 before written to buffer */
mce->finished = 0;
smp_wmb();
do {
- entry = mcelog_cpu->next;
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+ ihead = mce_log_index(head);
+
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
- * interesting ones:
+ * interesting.
*/
- if (entry >= MCE_LOG_LEN) {
+ if (ihead == mce_log_index(tail) && head != tail) {
set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
return;
}
- smp_rmb();
- next = entry + 1;
- } while (cmpxchg_local(&mcelog_cpu->next, entry, next) != entry);
+ next = head == MCE_LOG_LIMIT ? 0 : head + 1;
+ } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
- memcpy(mcelog_cpu->entry + entry, mce, sizeof(struct mce));
+ memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
/* ".finished" of MCE record in buffer must be set after copy */
smp_wmb();
- mcelog_cpu->entry[entry].finished = 1;
+ mcelog_cpu->entry[ihead].finished = 1;
/* bit 0 of notify_user should be set after finished be set */
smp_wmb();
@@ -1486,42 +1489,50 @@ static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
{
struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
char __user *ubuf = inubuf;
- unsigned prev, next;
- int i, err;
+ int head, tail, pos, i, err = 0;
- next = mcelog_cpu->next;
- if (!next)
+ head = mcelog_cpu->head;
+ tail = mcelog_cpu->tail;
+ if (head == tail)
return 0;
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
+ for (pos = tail; pos != head && usize >= sizeof(struct mce);
+ pos = pos == MCE_LOG_LIMIT ? 0 : pos+1) {
+ i = mce_log_index(pos);
+ if (!mcelog_cpu->entry[i].finished) {
int timeout = WRITER_TIMEOUT_NS;
while (!mcelog_cpu->entry[i].finished) {
if (timeout-- <= 0) {
memset(mcelog_cpu->entry + i, 0,
sizeof(struct mce));
+ head = mcelog_cpu->head;
printk(KERN_WARNING "mcelog: timeout "
"waiting for writer to finish!\n");
goto timeout;
}
ndelay(1);
}
- smp_rmb();
- err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
- sizeof(struct mce));
- ubuf += sizeof(struct mce);
-timeout:
- ;
}
-
- memset(mcelog_cpu->entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog_cpu->next, prev, 0);
- } while (next != prev);
+ /*
+ * finished field should be checked before
+ * copy_to_user()
+ */
+ smp_rmb();
+ err |= copy_to_user(ubuf, mcelog_cpu->entry + i,
+ sizeof(struct mce));
+ ubuf += sizeof(struct mce);
+ usize -= sizeof(struct mce);
+ mcelog_cpu->entry[i].finished = 0;
+timeout:
+ ;
+ }
+ /*
+ * mcelog_cpu->tail must be updated after ".finished" of
+ * corresponding MCE records are clear.
+ */
+ smp_wmb();
+ mcelog_cpu->tail = pos;
return err ? -EFAULT : ubuf - inubuf;
}
@@ -1533,7 +1544,7 @@ static int mce_empty(void)
for_each_possible_cpu(cpu) {
mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
- if (mcelog_cpu->next)
+ if (mcelog_cpu->head != mcelog_cpu->tail)
return 0;
}
return 1;
@@ -1548,14 +1559,14 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
int cpu, err = 0;
/* Only supports full reads right now */
- if (*off != 0 || usize < sizeof(struct mce) * MCE_LOG_LEN)
+ if (*off != 0 || usize < sizeof(struct mce))
return -EINVAL;
mutex_lock(&mce_read_mutex);
while (!mce_empty()) {
for_each_possible_cpu(cpu) {
- if (usize < MCE_LOG_LEN * sizeof(struct mce))
+ if (usize < sizeof(struct mce))
goto out;
err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
if (err > 0) {
--
1.6.4.3
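As a sanity check on the head/tail arithmetic in the patch above, here
is a minimal standalone C sketch (an editorial illustration, not part of
the patch) of the same scheme: both counters run up to MCE_LOG_LIMIT
before wrapping, so "empty" is head == tail while "full" is equal slot
indices with unequal counters.

#include <assert.h>
#include <stdio.h>

#define LOG_LEN		32			/* MCE_LOG_LEN */
#define LOG_LIMIT	(LOG_LEN * 2 - 1)	/* MCE_LOG_LIMIT */

/* same mapping as mce_log_index() */
static int log_index(int n)
{
	return n >= LOG_LEN ? n - LOG_LEN : n;
}

/* same wrap as the writer: next = head == MCE_LOG_LIMIT ? 0 : head + 1 */
static int log_next(int n)
{
	return n == LOG_LIMIT ? 0 : n + 1;
}

int main(void)
{
	int head = 0, tail = 0, i;

	assert(head == tail);			/* empty */

	for (i = 0; i < LOG_LEN; i++)		/* write LOG_LEN records */
		head = log_next(head);

	/* full: same slot index, but the counters differ */
	assert(log_index(head) == log_index(tail) && head != tail);

	tail = log_next(tail);			/* reader consumes one */
	assert(log_index(head) != log_index(tail));	/* not full now */

	printf("head=%d tail=%d used=%d\n", head, tail,
	       head >= tail ? head - tail : head + 2 * LOG_LEN - tail);
	return 0;
}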
The mce_log_init() should be able to return -ENOMEM, to abort all following
initialization in mce.c. It can be done in the same style as mce_banks_init(),
so I moved mce_log_init() into mce_cap_init() too.
Also put __cpuinit on mce_banks_init().
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 48 +++++++++++++++++++++++--------------
1 files changed, 30 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 63a7820..fad3daa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1194,19 +1194,40 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
-static int mce_banks_init(void)
+static __cpuinit int mce_banks_init(void)
{
int i;
mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
+
for (i = 0; i < banks; i++) {
struct mce_bank *b = &mce_banks[i];
b->ctl = -1ULL;
b->init = 1;
}
+
+ return 0;
+}
+
+/*
+ * Initialize MCE per-CPU log buffer
+ */
+static __cpuinit int mce_log_init(void)
+{
+ int cpu;
+
+ mcelog.nr_mcelog_cpus = num_possible_cpus();
+ mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
+ GFP_KERNEL);
+ if (!mcelog.mcelog_cpus)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
+
return 0;
}
@@ -1233,6 +1254,7 @@ static int __cpuinit mce_cap_init(void)
/* Don't support asymmetric configurations today */
WARN_ON(banks != 0 && b != banks);
banks = b;
+
if (!mce_banks) {
int err = mce_banks_init();
@@ -1240,6 +1262,13 @@ static int __cpuinit mce_cap_init(void)
return err;
}
+ if (!mcelog.mcelog_cpus) {
+ int err = mce_log_init();
+
+ if (err)
+ return err;
+ }
+
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
@@ -1250,22 +1279,6 @@ static int __cpuinit mce_cap_init(void)
return 0;
}
-/*
- * Initialize MCE per-CPU log buffer
- */
-static __cpuinit void mce_log_init(void)
-{
- int cpu;
-
- if (mcelog.mcelog_cpus)
- return;
- mcelog.nr_mcelog_cpus = num_possible_cpus();
- mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
- GFP_KERNEL);
- for_each_possible_cpu(cpu)
- mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
-}
-
static void mce_init(void)
{
mce_banks_t all_banks;
@@ -1436,7 +1449,6 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
mce_disabled = 1;
return;
}
- mce_log_init();
machine_check_vector = do_machine_check;
--
1.6.4.3
>
> (This piece originates from Huang's patch, titled:
> "x86, MCE: Fix bugs and issues of MCE log ring buffer")
You should use proper From: headers then for correct attribution.
-Andi
This is the diff between Huang's original change and the result of the
changes made by my patch set.
I'm going to explain what I changed from the original.
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 2d5c42a..4b5ef3c 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -52,7 +52,7 @@
> #define MCE_INJ_NMI_BROADCAST (1 << 2) /* do NMI broadcasting */
> #define MCE_INJ_EXCEPTION (1 << 3) /* raise as exception */
>
> -/* Fields are zero when not available */
> +/* MCE log entry. Fields are zero when not available. */
> struct mce {
> __u64 status;
> __u64 misc;
> @@ -63,12 +63,12 @@ struct mce {
> __u64 time; /* wall time_t when error was detected */
> __u8 cpuvendor; /* cpu vendor as encoded in system.h */
> __u8 inject_flags; /* software inject flags */
> - __u16 pad;
> + __u16 pad;
> __u32 cpuid; /* CPUID 1 EAX */
> - __u8 cs; /* code segment */
> + __u8 cs; /* code segment */
> __u8 bank; /* machine check bank */
> __u8 cpu; /* cpu number; obsolete; use extcpu now */
> - __u8 finished; /* entry is valid */
> + __u8 finished; /* 1 if write to entry is finished & entry is valid */
> __u32 extcpu; /* linux cpu number that detected the error */
> __u32 socketid; /* CPU socket ID */
> __u32 apicid; /* CPU initial apic ID */
> @@ -76,10 +76,9 @@ struct mce {
> };
>
> /*
> - * This structure contains all data related to the MCE log. Also
> - * carries a signature to make it easier to find from external
> - * debugging tools. Each entry is only valid when its finished flag
> - * is set.
> + * This structure contains all data related to the MCE log. Also carries
> + * a signature to make it easier to find from external debugging tools.
> + * Each entry is only valid when its finished flag is set.
> */
Small clean up.
>
> #define MCE_LOG_LEN 32
> @@ -93,12 +92,18 @@ static inline int mce_log_index(int n)
> struct mce_log_cpu;
>
> struct mce_log {
> - char signature[12]; /* "MACHINECHEC2" */
> - unsigned len; /* = MCE_LOG_LEN */
> - unsigned flags;
> - unsigned recordlen; /* length of struct mce */
> - unsigned nr_mcelog_cpus;
> + char signature[12]; /* "MACHINECHEC2" */
> +
> + /* points the table of per-CPU buffers */
> struct mce_log_cpu **mcelog_cpus;
> + unsigned int nr_mcelog_cpus; /* = num_possible_cpus() */
> +
> + /* spec of per-CPU buffer */
> + unsigned int header_len; /* offset of array "entry" */
> + unsigned int nr_record; /* array size (= MCE_LOG_LEN) */
> + unsigned int record_len; /* length of struct mce */
> +
> + unsigned flags;
> };
Added a member header_len, and renamed "len" to "nr_record", which is easier
to understand.  Please refer to the description of patch 6/10.  A sketch of
how an external tool could use these spec fields follows.
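To illustrate why carrying header_len/nr_record/record_len in struct
mce_log helps, here is a hedged sketch of how an external tool (e.g. a
crash-dump parser) might locate a record using only these spec fields;
the function name and the void * base are hypothetical, not from the
patch set:

/* Hypothetical dump-side walker; "base" points at a struct mce_log_cpu
 * image read out of a dump, "log" at the mce_log found via its signature. */
static struct mce *mce_log_cpu_record(void *base, struct mce_log *log,
				      unsigned int n)
{
	if (n >= log->nr_record)
		return NULL;
	/* entries start header_len bytes in, record_len bytes apart */
	return (struct mce *)((char *)base + log->header_len
			      + n * log->record_len);
}

Because the offsets travel with the data, such a tool keeps working even
if struct mce or the per-CPU buffer header grows.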
>
> #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 5db3f5b..fad3daa 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -122,54 +122,53 @@ EXPORT_PER_CPU_SYMBOL_GPL(mce_fake_banks);
> * separate MCEs from kernel messages to avoid bogus bug reports.
> */
>
> -static struct mce_log mcelog = {
> - .signature = MCE_LOG_SIGNATURE,
> - .len = MCE_LOG_LEN,
> - .recordlen = sizeof(struct mce),
> -};
> -
> struct mce_log_cpu {
> int head;
> int tail;
> - unsigned long flags;
> struct mce entry[MCE_LOG_LEN];
> };
Removed "flags" from per-CPU structure since mce_ioctl only cares global flags
in struct mce_log.
>
> DEFINE_PER_CPU(struct mce_log_cpu, mce_log_cpus);
>
> +static struct mce_log mcelog = {
> + .signature = MCE_LOG_SIGNATURE,
> + .header_len = offsetof(struct mce_log_cpu, entry),
> + .nr_record = MCE_LOG_LEN,
> + .record_len = sizeof(struct mce),
> +};
> +
> void mce_log(struct mce *mce)
> {
> struct mce_log_cpu *mcelog_cpu = &__get_cpu_var(mce_log_cpus);
> int head, ihead, tail, next;
>
> + /* mce->finished must be set to 0 before written to buffer */
> mce->finished = 0;
> - /*
> - * mce->finished must be set to 0 before written to ring
> - * buffer
> - */
> smp_wmb();
> +
> do {
> head = mcelog_cpu->head;
> tail = mcelog_cpu->tail;
> ihead = mce_log_index(head);
> +
> /*
> * When the buffer fills up discard new entries.
> * Assume that the earlier errors are the more
> * interesting.
> */
> if (ihead == mce_log_index(tail) && head != tail) {
> - set_bit(MCE_OVERFLOW, &mcelog_cpu->flags);
> + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
Use the global flags, which mce_ioctl cares about.
> return;
> }
> next = head == MCE_LOG_LIMIT ? 0 : head + 1;
> } while (cmpxchg_local(&mcelog_cpu->head, head, next) != head);
> +
> memcpy(mcelog_cpu->entry + ihead, mce, sizeof(struct mce));
> - /*
> - * ".finished" of MCE record in ring buffer must be set after
> - * copy
> - */
> +
> + /* ".finished" of MCE record in buffer must be set after copy */
> smp_wmb();
> mcelog_cpu->entry[ihead].finished = 1;
> +
> /* bit 0 of notify_user should be set after finished be set */
> smp_wmb();
> mce->finished = 1;
Changes from here to ....
> @@ -1195,19 +1194,40 @@ int mce_notify_irq(void)
> }
> EXPORT_SYMBOL_GPL(mce_notify_irq);
>
> -static int mce_banks_init(void)
> +static __cpuinit int mce_banks_init(void)
> {
> int i;
>
> mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
> if (!mce_banks)
> return -ENOMEM;
> +
> for (i = 0; i < banks; i++) {
> struct mce_bank *b = &mce_banks[i];
>
> b->ctl = -1ULL;
> b->init = 1;
> }
> +
> + return 0;
> +}
> +
> +/*
> + * Initialize MCE per-CPU log buffer
> + */
> +static __cpuinit int mce_log_init(void)
> +{
> + int cpu;
> +
> + mcelog.nr_mcelog_cpus = num_possible_cpus();
> + mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> + GFP_KERNEL);
> + if (!mcelog.mcelog_cpus)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu)
> + mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> +
> return 0;
> }
>
> @@ -1234,6 +1254,7 @@ static int __cpuinit mce_cap_init(void)
> /* Don't support asymmetric configurations today */
> WARN_ON(banks != 0 && b != banks);
> banks = b;
> +
> if (!mce_banks) {
> int err = mce_banks_init();
>
> @@ -1241,6 +1262,13 @@ static int __cpuinit mce_cap_init(void)
> return err;
> }
>
> + if (!mcelog.mcelog_cpus) {
> + int err = mce_log_init();
> +
> + if (err)
> + return err;
> + }
> +
> /* Use accurate RIP reporting if available. */
> if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
> rip_msr = MSR_IA32_MCG_EIP;
> @@ -1251,22 +1279,6 @@ static int __cpuinit mce_cap_init(void)
> return 0;
> }
>
> -/*
> - * Initialize MCE per-CPU log buffer
> - */
> -static __cpuinit void mce_log_init(void)
> -{
> - int cpu;
> -
> - if (mcelog.mcelog_cpus)
> - return;
> - mcelog.nr_mcelog_cpus = num_possible_cpus();
> - mcelog.mcelog_cpus = kzalloc(sizeof(void *) * num_possible_cpus(),
> - GFP_KERNEL);
> - for_each_possible_cpu(cpu)
> - mcelog.mcelog_cpus[cpu] = &per_cpu(mce_log_cpus, cpu);
> -}
> -
> static void mce_init(void)
> {
> mce_banks_t all_banks;
> @@ -1437,7 +1449,6 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
> mce_disabled = 1;
> return;
> }
> - mce_log_init();
>
> machine_check_vector = do_machine_check;
>
... here is what was done in patch 10/10.
> @@ -1486,15 +1497,14 @@ static int mce_release(struct inode *inode, struct file *file)
> return 0;
> }
>
> -static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
> - char __user *inubuf, size_t usize)
> +static ssize_t mce_read_cpu(int cpu, char __user *inubuf, size_t usize)
> {
> + struct mce_log_cpu *mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
Changed the 1st arg to the cpu number.
> char __user *ubuf = inubuf;
> int head, tail, pos, i, err = 0;
>
> head = mcelog_cpu->head;
> tail = mcelog_cpu->tail;
> -
> if (head == tail)
> return 0;
>
> @@ -1503,13 +1513,14 @@ static ssize_t mce_read_cpu(struct mce_log_cpu *mcelog_cpu,
> i = mce_log_index(pos);
> if (!mcelog_cpu->entry[i].finished) {
> int timeout = WRITER_TIMEOUT_NS;
> +
> while (!mcelog_cpu->entry[i].finished) {
> if (timeout-- <= 0) {
> memset(mcelog_cpu->entry + i, 0,
> sizeof(struct mce));
> head = mcelog_cpu->head;
> printk(KERN_WARNING "mcelog: timeout "
> - "waiting for writer to finish!\n");
> + "waiting for writer to finish!\n");
> goto timeout;
> }
> ndelay(1);
> @@ -1538,11 +1549,6 @@ timeout:
> return err ? -EFAULT : ubuf - inubuf;
> }
>
> -static int mce_empty_cpu(struct mce_log_cpu *mcelog_cpu)
> -{
> - return mcelog_cpu->head == mcelog_cpu->tail;
> -}
> -
> static int mce_empty(void)
> {
> int cpu;
> @@ -1550,33 +1556,34 @@ static int mce_empty(void)
>
> for_each_possible_cpu(cpu) {
> mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> - if (!mce_empty_cpu(mcelog_cpu))
> + if (mcelog_cpu->head != mcelog_cpu->tail)
Inlined.  Or it would be better to have static inlines in the header
file; a sketch of what they might look like follows after this hunk.
> return 0;
> }
> return 1;
> }
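For what it's worth, the static inlines suggested above might look like
this in mce.h (a sketch only, assuming the head/tail fields introduced
by this series):

/* Sketch of possible header-file helpers; not part of the posted patches. */
static inline int mce_log_cpu_empty(const struct mce_log_cpu *log)
{
	return log->head == log->tail;
}

static inline int mce_log_cpu_full(const struct mce_log_cpu *log)
{
	return mce_log_index(log->head) == mce_log_index(log->tail) &&
	       log->head != log->tail;
}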
>
> +static DEFINE_MUTEX(mce_read_mutex);
> +
> static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> loff_t *off)
> {
> char __user *ubuf = inubuf;
> - struct mce_log_cpu *mcelog_cpu;
> - int cpu, new_mce, err = 0;
> - static DEFINE_MUTEX(mce_read_mutex);
> + int cpu, err = 0;
> +
> + /* Only supports full reads right now */
> + if (*off != 0 || usize < sizeof(struct mce))
> + return -EINVAL;
Picked up what was implicitly dropped.
>
> mutex_lock(&mce_read_mutex);
> - do {
> - new_mce = 0;
> +
> + while (!mce_empty()) {
> for_each_possible_cpu(cpu) {
> if (usize < sizeof(struct mce))
> goto out;
> - mcelog_cpu = &per_cpu(mce_log_cpus, cpu);
> - err = mce_read_cpu(mcelog_cpu, ubuf,
> - sizeof(struct mce));
> + err = mce_read_cpu(cpu, ubuf, sizeof(struct mce));
> if (err > 0) {
> ubuf += sizeof(struct mce);
> usize -= sizeof(struct mce);
> - new_mce = 1;
> err = 0;
> } else if (err < 0)
> goto out;
> @@ -1586,9 +1593,10 @@ static ssize_t mce_read(struct file *filp, char __user *inubuf, size_t usize,
> cond_resched();
> mutex_lock(&mce_read_mutex);
> }
> - } while (new_mce || !mce_empty());
> + }
I could not catch the intent of "new_mce", so I replaced the do-while
with a simple while statement.
> out:
> mutex_unlock(&mce_read_mutex);
> +
> return err ? err : ubuf - inubuf;
> }
>
That's all.
Thanks,
H.Seto
Andi Kleen wrote:
>> (This piece originates from Huang's patch, titled:
>> "x86, MCE: Fix bugs and issues of MCE log ring buffer")
>
> You should use proper From: headers then for correct attribution.
>
> -Andi
I just referred to the following commit we already have:
> commit 77e26cca20013e9352a8df86a54640543304a23a
> Author: Hidetoshi Seto <[email protected]>
> Date: Thu Jun 11 16:04:35 2009 +0900
>
> x86, mce: Fix mce printing
>
> This patch:
>
> - Adds print_mce_head() instead of first flag
> - Makes the header to be printed always
> - Stops double printing of corrected errors
>
> [ This portion originates from Huang Ying's patch ]
>
> Originally-From: Huang Ying <[email protected]>
> Signed-off-by: Hidetoshi Seto <[email protected]>
> LKML-Reference: <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
You mean s/Originally-From/From/, right?
I'll update the attributions if this set needs to be revised, or if the
maintainer (who is expected to be able to set proper attributions)
requests me to do so.
Thanks,
H.Seto
2009/10/5 Hidetoshi Seto <[email protected]>:
> Hi Huang,
>
> Huang Ying wrote:
>> Current MCE log ring buffer has following bugs and issues:
>>
>> - On larger systems the 32 size buffer easily overflow, losing events.
>>
>> - We had some reports of events getting corrupted which were also
>>   blamed on the ring buffer.
>>
>> - There's a known livelock, now hit by more people, under high error
>>   rate.
>>
>> We fix these bugs and issues via making MCE log ring buffer as
>> lock-less per-CPU ring buffer.
>
> Now I have a real problem on the small MCE log buffer on my new large
> system with Nehalem which has many cpus/banks in one socket...
> So I'd like to solve the problem asap. I think this problem might block
> some distros to support new processor.
>
> Last week I reviewed your patch again and noticed that it is doing a lot
> of changes at once. I suppose that this method must be one of reasons
> why your patch seems to be so hard to review, and why it is taking long
> time to be accepted by x86 maintainers.
>
> Fortunately I had some spare time so I carefully broke your patch into
> some purpose-designed pieces. It would be the most significant change
> that now there are 2 steps to convert the buffer structure - 1) to make
> it per-CPU and 2) to make it ring buffer.
>
> Also I fixed some problem in your patch, found on the way to make this
> patch set. I'll explain about my changes later using diff from your
> change. Comments are welcomed.
>
> Thanks,
> H.Seto
>
Looks like the conversion of the MCE log into a TRACE_EVENT is still in
discussion, whereas the current issues are urgent.
So the need is to have a more stable ring buffer.  But this one is an
ad-hoc one.  We already have a general-purpose per-cpu/lockless ring
buffer implementation in kernel/trace/ring_buffer.c, and it's not only
used by tracing; it's generally available.
I think it would be nicer to use it, to avoid a proliferation of
unstable ring buffers inside the kernel.
Fr?d?ric Weisbecker <[email protected]> writes:
> So the need is to have a more stable ring buffer. But this one is an ad-hoc one.
It was designed to do one job and does it well with minimal overhead.
That seems all positive to me.
> We already have a general purpose per-cpu/lockless ring buffer implementation in
> kernel/trace/ring_buffer.c
> And it's not only used by tracing, it's generally available.
That trace ring buffer is significantly larger than all the rest of the
MCE code put together:
andi@basil:~/lsrc/git/obj> size kernel/trace/ring_buffer.o
text data bss dec hex filename
14241 21 12 14274 37c2 kernel/trace/ring_buffer.o
andi@basil:~/lsrc/git/obj> size arch/x86/kernel/cpu/mcheck/mce.o
text data bss dec hex filename
11098 4480 224 15802 3dba arch/x86/kernel/cpu/mcheck/mce.o
Basically you're proposing to more than double the code footprint of the
MCE handler for a small kernel configuration, just to avoid a few lines
of code.
It might be that all of its features are needed for tracing, but they
are definitely not needed for MCE handling, and it would need to be put
on a significant diet before core code like the MCE handler could even
think of relying on it.  In addition, ring_buffer.c currently doesn't
directly do what mce needs and would need significant glue code to fit
into the existing interfaces, and likely modifications to the ring
buffer itself too (e.g. to stop trace_stop from messing with error
handling).
Error handling code like the MCE handler always has to be compiled in,
unlike tracing and other debugging options, and I don't think it can
really afford such oversized code, especially since it doesn't need any
of the fancy features.  Linux has enough problems with code size growth
recently; no need to make it worse with proposals like this.  Also, the
requirements of error handling are quite different from those of tracing
and debugging, and the ring buffer doesn't fit them well.
If the code should be converted to use a generic buffer kernel/kfifo.o
would be the right target and size:
andi@basil:~/lsrc/git/obj> size kernel/kfifo.o
text data bss dec hex filename
734 0 0 734 2de kernel/kfifo.o
Unfortunately this needs Stefanie's lockless kfifo changes first to
happen, and they didn't make it into 2.6.32-rc*.
If they make it into 2.6.33 and there's really a strong desire to use
some generalized buffer I think that would be a suitable solution.
But frankly, just keeping Ying's buffer around would also be quite
reasonable.  Short term, Ying's simple alternative is the only good
solution.
-Andi
--
[email protected] -- Speaking for myself only.
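For concreteness, here is a rough sketch of what a per-CPU MCE buffer on
top of kfifo might look like.  It assumes a record-typed kfifo interface
(DECLARE_KFIFO/INIT_KFIFO, kfifo_in), which postdates this thread, so
treat all names here as illustrative rather than as a working proposal:

/* Illustrative only: assumes a later record-typed kfifo interface. */
#include <linux/kfifo.h>
#include <linux/percpu.h>

struct mce_fifo {
	DECLARE_KFIFO(fifo, struct mce, MCE_LOG_LEN);
};
static DEFINE_PER_CPU(struct mce_fifo, mce_fifos);

static void mce_log_fifo(struct mce *mce)
{
	struct mce_fifo *mf = &__get_cpu_var(mce_fifos);

	/*
	 * Note: kfifo is lockless only for one producer and one
	 * consumer; an NMI interrupting an IRQ-context writer on the
	 * same CPU would still need a cmpxchg-style reservation, which
	 * is part of the glue Andi mentions above.
	 */
	if (!kfifo_in(&mf->fifo, mce, 1))
		set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
}

static __cpuinit int mce_fifo_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_KFIFO(per_cpu(mce_fifos, cpu).fifo);
	return 0;
}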
Frédéric Weisbecker wrote:
> Looks like the conversion of MCE log into a TRACE_EVENT is still in
> discussion whereas the current issues are urgent.
>
> So the need is to have a more stable ring buffer. But this one is an ad-hoc one.
> We already have a general purpose per-cpu/lockless ring buffer implementation in
> kernel/trace/ring_buffer.c
> And it's not only used by tracing, it's generally available.
Thank you for this comment.
First of all, we have to make clear what the urgent issue is.
IMHO there are 2 issues: one is the "small buffer" and the other is the
"non ring buffer."
I think that the "small buffer" is definitely a scalability issue and
that this problem should be fixed asap to support large servers with
modern processors.
But I think that the "non ring buffer" is more of a performance issue,
which is a relatively minor problem in the MCE code.
So frankly I approve of the per-CPU buffering, but am skeptical about
the ring buffering here.
In other words, I don't mind dropping 9/10 of my patch set to postpone
converting the buffer to a ring buffer, because even without it the rest
of my patch set can convert the buffer to per-CPU style, which will
solve the urgent issue I have now.  That is why I divided the original
patch into these pieces and made separate steps in the buffer conversion.
How about converting the MCE buffer to per-CPU as the 1st step, to solve
an urgent scalability issue, x86 maintainers? > Ingo, Peter?
> I think it would be nicer to use it to avoid a proliferation of
> unstable ring buffers inside the kernel.
I tend to agree with Andi.
I think the generic ring buffer implementation is too rich for the
current use in the MCE code.
Maybe I could invent an alternative to my patch 9/10 that uses the
generic ring buffer from the MCE code.  But as I mentioned above, I
think that patch would solve only the minor issue.  Also, it might bring
unexpected impacts on the traditional interfaces.
Therefore, if there are still "use the generic one if you want a ring
buffer" comments, then I'd like to simply drop 9/10 rather than
reinvent it.
Any other suggestions?
BTW, I'd like to make clear what the main complaint here is.
Is it kind of "don't duplicate code"?
The requirement in the MCE code is quite simple, so the relatively
complex generic ring buffer doesn't fit it.  I'm not sure, but as Andi
proposed, the next version of kernel/kfifo may be the better choice
here.  I can wait for the release of the new kfifo, either without
making the MCE log buffer a ring, or with a temporary ad-hoc one.
Is it kind of "provide generic interface"?
I think it makes a lot of sense. I can't say the /dev/mcelog is the best
interface for MCE logging. Moreover there are many other hardware error
sources in a system such as PCIe and power unit etc., so it would be nice
for monitoring applications if kernel can provide a single channel (other
than /var/log/messages) to gather all kind of hardware errors.
The TRACE_EVENT would be one of possible options (or parts) to implement
such feature.
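For illustration only, a minimal sketch of such an event.  The macro
shape follows the standard TRACE_EVENT machinery; the field selection
here is mine and purely hypothetical, not a concrete proposal:

/* Sketch of a possible MCE trace event; field choice is illustrative. */
#include <linux/tracepoint.h>

TRACE_EVENT(mce_record,

	TP_PROTO(struct mce *m),

	TP_ARGS(m),

	TP_STRUCT__entry(
		__field(u64, status)
		__field(u64, addr)
		__field(u64, misc)
		__field(u8,  bank)
		__field(u32, extcpu)
	),

	TP_fast_assign(
		__entry->status = m->status;
		__entry->addr   = m->addr;
		__entry->misc   = m->misc;
		__entry->bank   = m->bank;
		__entry->extcpu = m->extcpu;
	),

	TP_printk("cpu %u bank %u status %llx addr %llx misc %llx",
		  __entry->extcpu, __entry->bank,
		  (unsigned long long)__entry->status,
		  (unsigned long long)__entry->addr,
		  (unsigned long long)__entry->misc)
);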
Oh, and don't forget "please keep the traditional interfaces."
To be honest, I think the current interfaces look quite ugly.  But I
know that there are a few real users, such as mcelog (and mced?), and
that at this time we have no choice but to use these tools.  So we
cannot cast away the current interfaces immediately, not until new
interfaces are invented and new user-land applications can provide
enough functionality.  Or we need to improve the current interface
gradually, in sync with improvements to the tools... but I suppose in
that way there will never be drastic changes.
Well, then I think what we have to do is invent a competitive new
interface for MCE logging, one that can live together with the current
interface for a while, isn't it?
Thanks,
H.Seto
On Mon, 2009-10-05 at 15:50 +0800, Hidetoshi Seto wrote:
> Andi Kleen wrote:
> >> (This piece originates from Huang's patch, titled:
> >> "x86, MCE: Fix bugs and issues of MCE log ring buffer")
> >
> > You should use proper From: headers then for correct attribution.
> >
> > -Andi
>
> I just referred to the following commit we already have:
>
> > commit 77e26cca20013e9352a8df86a54640543304a23a
> > Author: Hidetoshi Seto <[email protected]>
> > Date: Thu Jun 11 16:04:35 2009 +0900
> >
> > x86, mce: Fix mce printing
> >
> > This patch:
> >
> > - Adds print_mce_head() instead of first flag
> > - Makes the header to be printed always
> > - Stops double printing of corrected errors
> >
> > [ This portion originates from Huang Ying's patch ]
> >
> > Originally-From: Huang Ying <[email protected]>
> > Signed-off-by: Hidetoshi Seto <[email protected]>
> > LKML-Reference: <[email protected]>
> > Signed-off-by: Ingo Molnar <[email protected]>
>
> You mean s/Originally-From/From/, right?
>
> I'll update the attributions if this set needs to be revised, or if the
> maintainer (who is expected to be able to set proper attributions)
> requests me to do so.
I don't think it is a good collaboration style to copy others' patches,
do some modification and send it out, instead:
- comment on original patches
- communicate with the original author firstly
- provide another patch on top of original patches
Best Regards,
Huang Ying
In general it is not; there are exceptions for cases where either the
original patch needs to be significantly modified to apply to the
current codebase, or where it would otherwise cause breakage.
In those cases, use the tag Originally-by:.
"Huang Ying" <[email protected]> wrote:
>On Mon, 2009-10-05 at 15:50 +0800, Hidetoshi Seto wrote:
>> Andi Kleen wrote:
>> >> (This piece originates from Huang's patch, titled:
>> >> "x86, MCE: Fix bugs and issues of MCE log ring buffer")
>> >
>> > You should use proper From: headers then for correct attribution.
>> >
>> > -Andi
>>
>> I just referred following commit we already have:
>>
>> > commit 77e26cca20013e9352a8df86a54640543304a23a
>> > Author: Hidetoshi Seto <[email protected]>
>> > Date: Thu Jun 11 16:04:35 2009 +0900
>> >
>> > x86, mce: Fix mce printing
>> >
>> > This patch:
>> >
>> > - Adds print_mce_head() instead of first flag
>> > - Makes the header to be printed always
>> > - Stops double printing of corrected errors
>> >
>> > [ This portion originates from Huang Ying's patch ]
>> >
>> > Originally-From: Huang Ying <[email protected]>
>> > Signed-off-by: Hidetoshi Seto <[email protected]>
>> > LKML-Reference: <[email protected]>
>> > Signed-off-by: Ingo Molnar <[email protected]>
>>
>> You mean s/Originally-From/From/, right?
>>
>> I'll update attributes if this set need to be revised or if maintainer
>> (who expected to be able to set proper attributes) request me to do so.
>
>I don't think it is a good collaboration style to copy others' patches,
>do some modification and send it out, instead:
>
>- comment on original patches
>- communicate with the original author firstly
>- provide another patch on top of original patches
>
>Best Regards,
>Huang Ying
>
>
--
Sent from my mobile phone. Please excuse any lack of formatting.
Huang Ying wrote:
> I don't think it is a good collaboration style to copy others' patches,
> do some modification and send it out, instead:
Sorry, I could not stop myself, since I have been frustrated over a
trivial matter in this area these days.  And I could not wait for your
next post any longer, and didn't want to see another abandoned "rebased"
patch, so I took this kind of aggressive action, in bad manner, to push
your change forward.
Sorry again.
> - comment on original patches
I think your patch contains various changes, so how about breaking it
down into small pieces and making patches with proper, clearer
descriptions?
I believe that the urgent problem can be solved by making the buffer
per-CPU.  Or do you have any reason to rush it into a ring buffer?
> - communicate with the original author firstly
Certainly I wrote the following line at first:
> Hi Huang,
> - provide another patch on top of original patches
Since what I wanted to demonstrate to you was another style for your
patch, I could not provide patches in the usual incremental form.
Thanks,
H.Seto