From: Borislav Petkov <[email protected]>
Hi,
as previously discussed, here's the first part of the RAS daemon code
in smaller, broken-out, digestible pieces. No functionality change from
the last time, only commit message fixups, clarifications and comment
adjustments.
Concerning risk, I don't see any since patch 4/4 leaves the persistent
event disabled for now - sort of like a chicken bit - and can only be
enabled if explicitly stated so on the kernel command line.
Please review,
thanks.
From: Borislav Petkov <[email protected]>
mv kernel/perf_event.c -> kernel/events/core.c. From there, all further
sensible splitting can happen. The idea is that due to perf_event.c
becoming pretty sizable and with the advent of the marriage with ftrace,
splitting functionality into its logical parts should help speeding up
the unification and managing the complexity of the subsystem.
Signed-off-by: Borislav Petkov <[email protected]>
---
kernel/Makefile | 5 +++--
kernel/events/Makefile | 5 +++++
kernel/{perf_event.c => events/core.c} | 0
3 files changed, 8 insertions(+), 2 deletions(-)
create mode 100644 kernel/events/Makefile
rename kernel/{perf_event.c => events/core.c} (100%)
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..7981530 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
@@ -103,7 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 0000000..26c00e4
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,5 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = -pg
+endif
+
+obj-y += core.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
similarity index 100%
rename from kernel/perf_event.c
rename to kernel/events/core.c
--
1.7.4.rc2
From: Borislav Petkov <[email protected]>
Add a barebones implementation for registering persistent events with
perf. For that, we don't destroy the buffers when they're unmapped;
also, we map them read-only so that multiple agents can access them.
Signed-off-by: Borislav Petkov <[email protected]>
---
include/linux/perf_event.h | 22 +++++++++++++++-
kernel/events/Makefile | 2 +-
kernel/events/core.c | 29 ++++++++++++++++++---
kernel/events/persistent.c | 59 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 105 insertions(+), 7 deletions(-)
create mode 100644 kernel/events/persistent.c
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9eec53d..18f2a68 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -216,8 +216,9 @@ struct perf_event_attr {
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
sample_id_all : 1, /* sample_type all events */
+ persistent : 1, /* event always on */
- __reserved_1 : 45;
+ __reserved_1 : 44;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1158,6 +1159,15 @@ extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void perf_buffer_put(struct perf_buffer *buffer);
+extern struct perf_event *
+perf_enable_persistent_event(struct perf_event_attr *attr,
+ int cpu, unsigned bufsz);
+extern void perf_disable_persistent_event(struct perf_event *event, int cpu);
+extern int perf_persistent_open(struct inode *inode, struct file *file);
+extern const struct file_operations perf_pers_fops;
#else
static inline void
perf_event_task_sched_in(struct task_struct *task) { }
@@ -1192,6 +1202,16 @@ static inline void perf_swevent_put_recursion_context(int rctx) { }
static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event) { }
static inline void perf_event_task_tick(void) { }
+static inline struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { return NULL; }
+static inline void perf_buffer_put(struct perf_buffer *buffer) {}
+static inline struct perf_event *
+perf_enable_persistent_event(struct perf_event_attr *attr, int cpu,
+ unsigned bufsz) { return -EINVAL; }
+static inline void
+perf_disable_persistent_event(struct perf_event *event, int cpu) {}
+static inline int
+perf_persistent_open(struct inode *inode, struct file *file) { return -1; }
#endif
#define perf_output_put(handle, x) \
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 26c00e4..e2d4d8e 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,4 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
-obj-y += core.o
+obj-y += core.o persistent.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 440bc48..2d8b1af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2898,7 +2898,7 @@ static void free_event_rcu(struct rcu_head *head)
kfree(event);
}
-static void perf_buffer_put(struct perf_buffer *buffer);
+void perf_buffer_put(struct perf_buffer *buffer);
static void free_event(struct perf_event *event)
{
@@ -3458,7 +3458,7 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
-static struct perf_buffer *
+struct perf_buffer *
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct perf_buffer *buffer;
@@ -3575,7 +3575,7 @@ static void perf_buffer_free(struct perf_buffer *buffer)
schedule_work(&buffer->work);
}
-static struct perf_buffer *
+struct perf_buffer *
perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct perf_buffer *buffer;
@@ -3676,7 +3676,7 @@ static struct perf_buffer *perf_buffer_get(struct perf_event *event)
return buffer;
}
-static void perf_buffer_put(struct perf_buffer *buffer)
+void perf_buffer_put(struct perf_buffer *buffer)
{
if (!atomic_dec_and_test(&buffer->refcount))
return;
@@ -3695,6 +3695,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ if (event->attr.persistent) {
+ atomic_dec(&event->mmap_count);
+ return;
+ }
+
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
unsigned long size = perf_data_size(event->buffer);
struct user_struct *user = event->mmap_user;
@@ -3737,7 +3742,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (event->cpu == -1 && event->attr.inherit)
return -EINVAL;
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_SHARED) && !event->attr.persistent)
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
@@ -3846,6 +3851,16 @@ static const struct file_operations perf_fops = {
.fasync = perf_fasync,
};
+const struct file_operations perf_pers_fops = {
+ .llseek = no_llseek,
+ .open = perf_persistent_open,
+ .poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
+ .fasync = perf_fasync,
+};
+
/*
* Perf event wakeup
*
@@ -6756,6 +6771,10 @@ __perf_event_exit_task(struct perf_event *child_event,
raw_spin_unlock_irq(&child_ctx->lock);
}
+ /* do not remove persistent events on task exit */
+ if (child_event->attr.persistent)
+ return;
+
perf_remove_from_context(child_event);
/*
diff --git a/kernel/events/persistent.c b/kernel/events/persistent.c
new file mode 100644
index 0000000..e52752a
--- /dev/null
+++ b/kernel/events/persistent.c
@@ -0,0 +1,59 @@
+#include <linux/perf_event.h>
+
+/*
+ * Create and enable the perf event described by @attr.
+ *
+ * Returns the @event pointer which receives the allocated event from
+ * perf on success. Make sure to check return code before touching @event
+ * any further.
+ *
+ * @attr: perf attr template
+ * @cpu: on which cpu
+ * @nr_pages: perf buffer size in pages
+ *
+ */
+struct perf_event *perf_enable_persistent_event(struct perf_event_attr *attr,
+ int cpu, unsigned nr_pages)
+{
+ struct perf_buffer *buffer;
+ struct perf_event *event;
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL);
+ if (IS_ERR(event))
+ return event;
+
+ buffer = perf_buffer_alloc(nr_pages, 0, cpu, PERF_BUFFER_WRITABLE);
+ if (IS_ERR(buffer))
+ goto err;
+
+ rcu_assign_pointer(event->buffer, buffer);
+ perf_event_enable(event);
+
+ return event;
+
+err:
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+}
+
+void perf_disable_persistent_event(struct perf_event *event, int cpu)
+{
+ if (!event)
+ return;
+
+ perf_event_disable(event);
+
+ if (event->buffer) {
+ perf_buffer_put(event->buffer);
+ rcu_assign_pointer(event->buffer, NULL);
+ }
+
+ perf_event_release_kernel(event);
+}
+
+int perf_persistent_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+
+ return 0;
+}
--
1.7.4.rc2
From: Borislav Petkov <[email protected]>
Add the necessary glue to enable the mce_record tracepoint on boot,
turning it into a persistent event. This exports the MCE buffer through
a debugfs per-CPU file which a userspace daemon can read and then
process the received error data further.
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/mce.h | 8 +++
arch/x86/kernel/cpu/mcheck/mce.c | 89 ++++++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb16e94..40cc9bc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -115,6 +115,14 @@ struct mce_log {
#ifdef __KERNEL__
+/*
+ * a per-CPU descriptor of the persistent MCE tracepoint
+ */
+struct mce_tp_desc {
+ struct perf_event *event;
+ struct dentry *debugfs_entry;
+};
+
extern struct atomic_notifier_head x86_mce_decoder_chain;
#include <linux/percpu.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea2..9589ebf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -96,6 +96,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
+static DEFINE_PER_CPU(struct mce_tp_desc, mce_event);
static int cpu_missing;
/*
@@ -2055,6 +2056,91 @@ static void __cpuinit mce_reenable_cpu(void *h)
}
}
+static struct perf_event_attr pattr = {
+ .type = PERF_TYPE_TRACEPOINT,
+ .size = sizeof(pattr),
+ .sample_type = PERF_SAMPLE_RAW,
+ .persistent = 1,
+};
+
+static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
+{
+ char buf[14];
+
+ sprintf(buf, "mce_record%d", cpu);
+
+ return debugfs_create_file(buf, S_IRUGO | S_IWUSR,
+ mce_get_debugfs_dir(),
+ event, &perf_pers_fops);
+}
+
+#define MCE_BUF_PAGES 4
+
+static int mce_enable_perf_event_on_cpu(int cpu)
+{
+ struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
+ int err = -EINVAL;
+
+ d->event = perf_enable_persistent_event(&pattr, cpu, MCE_BUF_PAGES);
+ if (IS_ERR(d->event)) {
+ printk(KERN_ERR "MCE: Error enabling event on cpu %d\n", cpu);
+ goto ret;
+ }
+
+ d->debugfs_entry = mce_add_event_debugfs(d->event, cpu);
+ if (!d->debugfs_entry) {
+ printk(KERN_ERR "MCE: Error adding event debugfs entry on cpu %d\n", cpu);
+ goto disable;
+ }
+
+ return 0;
+
+disable:
+ perf_disable_persistent_event(d->event, cpu);
+
+ret:
+ return err;
+}
+
+static void mce_disable_perf_event_on_cpu(int cpu)
+{
+ struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
+ debugfs_remove(d->debugfs_entry);
+ perf_disable_persistent_event(d->event, cpu);
+}
+
+static __init int mcheck_init_persistent_event(void)
+{
+ int cpu, err = 0;
+
+ get_online_cpus();
+
+ pattr.config = event_mce_record.event.type;
+ pattr.sample_period = 1;
+ pattr.wakeup_events = 1;
+
+ for_each_online_cpu(cpu)
+ if (mce_enable_perf_event_on_cpu(cpu))
+ goto err_unwind;
+
+ goto unlock;
+
+err_unwind:
+ err = -EINVAL;
+ for (--cpu; cpu >= 0; cpu--)
+ mce_disable_perf_event_on_cpu(cpu);
+
+unlock:
+ put_online_cpus();
+
+ return err;
+}
+
+/*
+ * This has to run after event_trace_init()
+ */
+device_initcall(mcheck_init_persistent_event);
+
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -2068,6 +2154,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
mce_create_device(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
+ mce_enable_perf_event_on_cpu(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -2077,6 +2164,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
+ mce_disable_perf_event_on_cpu(cpu);
del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
break;
@@ -2088,6 +2176,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
add_timer_on(t, cpu);
}
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_enable_perf_event_on_cpu(cpu);
break;
case CPU_POST_DEAD:
/* intentionally ignoring frozen here */
--
1.7.4.rc2
From: Borislav Petkov <[email protected]>
This is new functionality and it affects all of x86 so we want to be
very conservative about it and have it off by default for now, in case
something goes awry. You can always enable it by supplying "ras" on the
kernel command line.
Also, depending on whether it is enabled or not, we emit the tracepoint
from a different place in the code to pick up additional decoded info.
Signed-off-by: Borislav Petkov <[email protected]>
---
Documentation/kernel-parameters.txt | 2 ++
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++++--
drivers/edac/mce_amd.c | 5 +++++
4 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc85a92..f09438a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2165,6 +2165,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/blockdev/ramdisk.txt.
+ ras [X86] Enable RAS daemon supporting functionality.
+
rcupdate.blimit= [KNL,BOOT]
Set maximum number of finished RCU callbacks to process
in one batch.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 40cc9bc..649d47a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -131,6 +131,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
extern int mce_disabled;
extern int mce_p5_enabled;
+extern int ras;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9589ebf..3fb22cb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -59,6 +59,8 @@ static DEFINE_MUTEX(mce_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(mce_record);
+
int mce_disabled __read_mostly;
#define MISC_MCELOG_MINOR 227
@@ -86,6 +88,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
+int ras __read_mostly;
struct mce_bank *mce_banks __read_mostly;
@@ -105,6 +108,7 @@ static int cpu_missing;
*/
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(ras);
static int default_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
@@ -163,8 +167,9 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;
- /* Emit the trace record: */
- trace_mce_record(mce);
+ if (!ras)
+ /* Emit the trace record: */
+ trace_mce_record(mce);
mce->finished = 0;
wmb();
@@ -1721,6 +1726,19 @@ static int __init mcheck_enable(char *str)
}
__setup("mce", mcheck_enable);
+static int __init ras_enable(char *str)
+{
+ /*
+ * We enable the persistent event only if "ras" is supplied on the
+ * command line.
+ */
+ ras = 1;
+
+ return 0;
+}
+
+__setup("ras", ras_enable);
+
int __init mcheck_init(void)
{
atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
@@ -2081,6 +2099,9 @@ static int mce_enable_perf_event_on_cpu(int cpu)
struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
int err = -EINVAL;
+ if (!ras)
+ return 0;
+
d->event = perf_enable_persistent_event(&pattr, cpu, MCE_BUF_PAGES);
if (IS_ERR(d->event)) {
printk(KERN_ERR "MCE: Error enabling event on cpu %d\n", cpu);
@@ -2105,6 +2126,10 @@ ret:
static void mce_disable_perf_event_on_cpu(int cpu)
{
struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
+
+ if (!ras)
+ return;
+
debugfs_remove(d->debugfs_entry);
perf_disable_persistent_event(d->event, cpu);
}
@@ -2113,6 +2138,9 @@ static __init int mcheck_init_persistent_event(void)
{
int cpu, err = 0;
+ if (!ras)
+ return -EBUSY;
+
get_online_cpus();
pattr.config = event_mce_record.event.type;
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 795cfbc..e329335 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1,5 +1,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <asm/mce.h>
+#include <trace/events/mce.h>
#include "mce_amd.h"
@@ -829,6 +831,9 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
amd_decode_err_code(m->status & 0xffff);
+ if (ras)
+ trace_mce_record(m);
+
return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);
--
1.7.4.rc2
* Borislav Petkov <[email protected]> wrote:
> From: Borislav Petkov <[email protected]>
>
> Add a barebones implementation for registering persistent events with
> perf. For that, we don't destroy the buffers when they're unmapped;
> also, we map them read-only so that multiple agents can access them.
>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> include/linux/perf_event.h | 22 +++++++++++++++-
> kernel/events/Makefile | 2 +-
> kernel/events/core.c | 29 ++++++++++++++++++---
> kernel/events/persistent.c | 59 ++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 105 insertions(+), 7 deletions(-)
> create mode 100644 kernel/events/persistent.c
I really like this new kernel/events/ subdirectory you created - we could move
the other perf_events.c facilities there as well, and maybe split up the file a
bit, it's 7.5 KLOC now which is a bit excessive.
We should probably do that before adding kernel/events/persistent.c - wanna do
those changes? Initial commit should be a mostly-rename thing:
kernel/perf_events.c could move into kernel/events/core.c or so - and then we
could split the core up some more, into various facilities.
[ kernel/hw_breakpoint.c could possibly move into kernel/events/hw_breakpoint.c
file as well - if Frederic acks it. ]
Thanks,
Ingo
* Borislav Petkov <[email protected]> wrote:
> +static struct perf_event_attr pattr = {
> + .type = PERF_TYPE_TRACEPOINT,
> + .size = sizeof(pattr),
> + .sample_type = PERF_SAMPLE_RAW,
> + .persistent = 1,
> +};
> +
> +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> +{
> + char buf[14];
> +
> + sprintf(buf, "mce_record%d", cpu);
> +
> + return debugfs_create_file(buf, S_IRUGO | S_IWUSR,
> + mce_get_debugfs_dir(),
> + event, &perf_pers_fops);
> +}
> +
> +#define MCE_BUF_PAGES 4
> +
> +static int mce_enable_perf_event_on_cpu(int cpu)
> +{
> + struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
> + int err = -EINVAL;
> +
> + d->event = perf_enable_persistent_event(&pattr, cpu, MCE_BUF_PAGES);
> + if (IS_ERR(d->event)) {
> + printk(KERN_ERR "MCE: Error enabling event on cpu %d\n", cpu);
> + goto ret;
> + }
> +
> + d->debugfs_entry = mce_add_event_debugfs(d->event, cpu);
> + if (!d->debugfs_entry) {
> + printk(KERN_ERR "MCE: Error adding event debugfs entry on cpu %d\n", cpu);
> + goto disable;
> + }
> +
> + return 0;
> +
> +disable:
> + perf_disable_persistent_event(d->event, cpu);
> +
> +ret:
> + return err;
> +}
> +
> +static void mce_disable_perf_event_on_cpu(int cpu)
> +{
> + struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
> + debugfs_remove(d->debugfs_entry);
> + perf_disable_persistent_event(d->event, cpu);
> +}
> +
> +static __init int mcheck_init_persistent_event(void)
> +{
> + int cpu, err = 0;
> +
> + get_online_cpus();
> +
> + pattr.config = event_mce_record.event.type;
> + pattr.sample_period = 1;
> + pattr.wakeup_events = 1;
> +
> + for_each_online_cpu(cpu)
> + if (mce_enable_perf_event_on_cpu(cpu))
> + goto err_unwind;
> +
> + goto unlock;
> +
> +err_unwind:
> + err = -EINVAL;
> + for (--cpu; cpu >= 0; cpu--)
> + mce_disable_perf_event_on_cpu(cpu);
> +
> +unlock:
> + put_online_cpus();
> +
> + return err;
> +}
> +
> +/*
> + * This has to run after event_trace_init()
> + */
> +device_initcall(mcheck_init_persistent_event);
Looks quite generic - shouldnt this bit be generalized a bit more into
kernel/events/? When other places (and other platforms) want to add a
persistent event they would thus have your new facility available as well.
Except this bit:
> /* Get notified when a cpu comes on/off. Be hotplug friendly. */
> static int __cpuinit
> mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> @@ -2068,6 +2154,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> mce_create_device(cpu);
> if (threshold_cpu_callback)
> threshold_cpu_callback(action, cpu);
> + mce_enable_perf_event_on_cpu(cpu);
> break;
> case CPU_DEAD:
> case CPU_DEAD_FROZEN:
> @@ -2077,6 +2164,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> break;
> case CPU_DOWN_PREPARE:
> case CPU_DOWN_PREPARE_FROZEN:
> + mce_disable_perf_event_on_cpu(cpu);
> del_timer_sync(t);
> smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> break;
> @@ -2088,6 +2176,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> add_timer_on(t, cpu);
> }
> smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
> + mce_enable_perf_event_on_cpu(cpu);
> break;
> case CPU_POST_DEAD:
> /* intentionally ignoring frozen here */
which looks a bit x86 specific.
Thanks,
Ingo
* Borislav Petkov <[email protected]> wrote:
> From: Borislav Petkov <[email protected]>
>
> This is new functionality and it affects all of x86 so we want to be
> very conservative about it and have it off by default for now, in case
> something goes awry. You can always enable it by supplying "ras" on the
> kernel command line.
>
> Also, depending on whether it is enabled or not, we emit the tracepoint
> from a different place in the code to pick up additional decoded info.
>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 2 ++
> arch/x86/include/asm/mce.h | 1 +
> arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++++--
> drivers/edac/mce_amd.c | 5 +++++
> 4 files changed, 38 insertions(+), 2 deletions(-)
the boot flag is fine - but please keep it enabled by default if MCE support is
enabled in the .config, we do not chicken out when it comes to testing new
code! :-)
Thanks,
Ingo
* Ingo Molnar <[email protected]> wrote:
> > include/linux/perf_event.h | 22 +++++++++++++++-
> > kernel/events/Makefile | 2 +-
> > kernel/events/core.c | 29 ++++++++++++++++++---
> > kernel/events/persistent.c | 59 ++++++++++++++++++++++++++++++++++++++++++++
> > 4 files changed, 105 insertions(+), 7 deletions(-)
> > create mode 100644 kernel/events/persistent.c
>
> I really like this new kernel/events/ subdirectory you created - we could move
> the other perf_events.c facilities there as well, and maybe split up the file a
> bit, it's 7.5 KLOC now which is a bit excessive.
>
> We should probably do that before adding kernel/events/persistent.c - wanna do
> those changes? Initial commit should be a mostly-rename thing:
> kernel/perf_events.c could move into kernel/events/core.c or so - and then we
> could split the core up some more, into various facilities.
Looks like you have a time machine and after reading my feedback you have have
sneaked this exact change into patch 1/4! :-)
( The alternative interpretation is that i should start reading series at the
beginning - not in the most interesting looking (largest) places where i
would stupidly ask for changes you've put into 1/4 already. )
Thanks,
Ingo
On Tue, May 03, 2011 at 02:40:46AM -0400, Ingo Molnar wrote:
> I really like this new kernel/events/ subdirectory you created -
Yeah, this was your suggestion a couple of months ago, I just picked it
up.
> we could move the other perf_events.c facilities there as well, and
> maybe split up the file a bit, it's 7.5 KLOC now which is a bit
> excessive.
Right, so I left perf_event.c as a whole on purpose (only renamed)
because it makes more sense IMHO for someone who actually wrote it to
split it in logical parts (^hint hint^). I mean, I could do it too but
it'll take me much more time :).
> We should probably do that before adding kernel/events/persistent.c - wanna do
> those changes? Initial commit should be a mostly-rename thing:
> kernel/perf_events.c could move into kernel/events/core.c or so - and then we
> could split the core up some more, into various facilities.
>
> [ kernel/hw_breakpoint.c could possibly move into kernel/events/hw_breakpoint.c
> file as well - if Frederic acks it. ]
Yeah, no problem.
Thanks for the review, btw.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
>
> * Borislav Petkov <[email protected]> wrote:
>
> > +static struct perf_event_attr pattr = {
> > + .type = PERF_TYPE_TRACEPOINT,
> > + .size = sizeof(pattr),
> > + .sample_type = PERF_SAMPLE_RAW,
> > + .persistent = 1,
> > +};
> > +
> > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > +{
> > + char buf[14];
> > +
> > + sprintf(buf, "mce_record%d", cpu);
> > +
> > + return debugfs_create_file(buf, S_IRUGO | S_IWUSR,
> > + mce_get_debugfs_dir(),
> > + event, &perf_pers_fops);
> > +}
> > +
> > +#define MCE_BUF_PAGES 4
> > +
> > +static int mce_enable_perf_event_on_cpu(int cpu)
> > +{
> > + struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
> > + int err = -EINVAL;
> > +
> > + d->event = perf_enable_persistent_event(&pattr, cpu, MCE_BUF_PAGES);
> > + if (IS_ERR(d->event)) {
> > + printk(KERN_ERR "MCE: Error enabling event on cpu %d\n", cpu);
> > + goto ret;
> > + }
> > +
> > + d->debugfs_entry = mce_add_event_debugfs(d->event, cpu);
> > + if (!d->debugfs_entry) {
> > + printk(KERN_ERR "MCE: Error adding event debugfs entry on cpu %d\n", cpu);
> > + goto disable;
> > + }
> > +
> > + return 0;
> > +
> > +disable:
> > + perf_disable_persistent_event(d->event, cpu);
> > +
> > +ret:
> > + return err;
> > +}
> > +
> > +static void mce_disable_perf_event_on_cpu(int cpu)
> > +{
> > + struct mce_tp_desc *d = &per_cpu(mce_event, cpu);
> > + debugfs_remove(d->debugfs_entry);
> > + perf_disable_persistent_event(d->event, cpu);
> > +}
> > +
> > +static __init int mcheck_init_persistent_event(void)
> > +{
> > + int cpu, err = 0;
> > +
> > + get_online_cpus();
> > +
> > + pattr.config = event_mce_record.event.type;
> > + pattr.sample_period = 1;
> > + pattr.wakeup_events = 1;
> > +
> > + for_each_online_cpu(cpu)
> > + if (mce_enable_perf_event_on_cpu(cpu))
> > + goto err_unwind;
> > +
> > + goto unlock;
> > +
> > +err_unwind:
> > + err = -EINVAL;
> > + for (--cpu; cpu >= 0; cpu--)
> > + mce_disable_perf_event_on_cpu(cpu);
> > +
> > +unlock:
> > + put_online_cpus();
> > +
> > + return err;
> > +}
> > +
> > +/*
> > + * This has to run after event_trace_init()
> > + */
> > +device_initcall(mcheck_init_persistent_event);
>
> Looks quite generic - shouldnt this bit be generalized a bit more into
> kernel/events/? When other places (and other platforms) want to add a
> persistent event they would thus have your new facility available as well.
Right, so the position of those seemed kinda arbitrary to me too due to
their partial genericity (that's not a real word :)).
But, the difference between persistent events and "normal" perf
events is that we export the former through debugfs, thus bypassing
sys_perf_event_open syscall and make them a rather different beast.
I'm guessing that when we move those to /sysfs, this mechanism will
have to change too so it might make sense to move that functionality to
persistent.c. Let me see what I can do.
>
> Except this bit:
>
> > /* Get notified when a cpu comes on/off. Be hotplug friendly. */
> > static int __cpuinit
> > mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> > @@ -2068,6 +2154,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> > mce_create_device(cpu);
> > if (threshold_cpu_callback)
> > threshold_cpu_callback(action, cpu);
> > + mce_enable_perf_event_on_cpu(cpu);
> > break;
> > case CPU_DEAD:
> > case CPU_DEAD_FROZEN:
> > @@ -2077,6 +2164,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> > break;
> > case CPU_DOWN_PREPARE:
> > case CPU_DOWN_PREPARE_FROZEN:
> > + mce_disable_perf_event_on_cpu(cpu);
> > del_timer_sync(t);
> > smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> > break;
> > @@ -2088,6 +2176,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> > add_timer_on(t, cpu);
> > }
> > smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
> > + mce_enable_perf_event_on_cpu(cpu);
> > break;
> > case CPU_POST_DEAD:
> > /* intentionally ignoring frozen here */
>
> which looks a bit x86 specific.
Yep.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, May 03, 2011 at 02:45:05AM -0400, Ingo Molnar wrote:
>
> * Borislav Petkov <[email protected]> wrote:
>
> > From: Borislav Petkov <[email protected]>
> >
> > This is new functionality and it affects all of x86 so we want to be
> > very conservative about it and have it off by default for now, in case
> > something goes awry. You can always enable it by supplying "ras" on the
> > kernel command line.
> >
> > Also, depending on whether it is enabled or not, we emit the tracepoint
> > from a different place in the code to pick up additional decoded info.
> >
> > Signed-off-by: Borislav Petkov <[email protected]>
> > ---
> > Documentation/kernel-parameters.txt | 2 ++
> > arch/x86/include/asm/mce.h | 1 +
> > arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++++--
> > drivers/edac/mce_amd.c | 5 +++++
> > 4 files changed, 38 insertions(+), 2 deletions(-)
>
> the boot flag is fine - but please keep it enabled by default if MCE support is
> enabled in the .config, we do not chicken out when it comes to testing new
> code! :-)
Ok, the problem I see with it is that people without a RAS daemon
running will have the mechanism collecting MCEs in the background, using
up resources (4 pages per CPU is the buffer) and not doing anything (in
the best case that is, when we're not broken otherwise).
Do we want to have that on all x86 turned on by default? Or maybe invert
the flag semantics so that people can disable it on boot?
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
* Borislav Petkov <[email protected]> wrote:
> On Tue, May 03, 2011 at 02:45:05AM -0400, Ingo Molnar wrote:
> >
> > * Borislav Petkov <[email protected]> wrote:
> >
> > > From: Borislav Petkov <[email protected]>
> > >
> > > This is new functionality and it affects all of x86 so we want to be
> > > very conservative about it and have it off by default for now, in case
> > > something goes awry. You can always enable it by supplying "ras" on the
> > > kernel command line.
> > >
> > > Also, depending on whether it is enabled or not, we emit the tracepoint
> > > from a different place in the code to pick up additional decoded info.
> > >
> > > Signed-off-by: Borislav Petkov <[email protected]>
> > > ---
> > > Documentation/kernel-parameters.txt | 2 ++
> > > arch/x86/include/asm/mce.h | 1 +
> > > arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++++++++++++++++++++++--
> > > drivers/edac/mce_amd.c | 5 +++++
> > > 4 files changed, 38 insertions(+), 2 deletions(-)
> >
> > the boot flag is fine - but please keep it enabled by default if MCE support is
> > enabled in the .config, we do not chicken out when it comes to testing new
> > code! :-)
>
> Ok, the problem I see with it is that people without a RAS daemon
> running will have the mechanism collecting MCEs in the background, using
> up resources (4 pages per CPU is the buffer) and not doing anything (in
> the best case that is, when we're not broken otherwise).
Well, i'd put it behind a new Kconfig option which is default enabled if MCE
support is enabled. That way people can still disable it both in the .config
and on the boot command line as well.
> Do we want to have that on all x86 turned on by default? Or maybe invert the
> flag semantics so that people can disable it on boot?
Boot flagsshould have feature=on/off kind of obvious semantics.
Thanks,
Ingo
* Borislav Petkov <[email protected]> wrote:
> > we could move the other perf_events.c facilities there as well, and maybe
> > split up the file a bit, it's 7.5 KLOC now which is a bit excessive.
>
> Right, so I left perf_event.c as a whole on purpose (only renamed) because it
> makes more sense IMHO for someone who actually wrote it to split it in
> logical parts (^hint hint^). I mean, I could do it too but it'll take me much
> more time :).
Well you are now writing new events code as well so spending that time might be
a good investment in terms of getting persistent.c right as well.
Could you send a Git pull request against perf/core that does the obvious file
movement and renaming? We can do that first step straight away and get the ball
rolling.
Thanks,
Ingo
* Borislav Petkov <[email protected]> wrote:
> I'm guessing that when we move those to /sysfs, this mechanism will have to
> change too so it might make sense to move that functionality to persistent.c.
> Let me see what I can do.
Yes, we really do not want to increase the debugfs mess. This looks like an
implementational detail but from a tooling POV it's quite essential.
As a starting point check out Peter's patch from November 2010:
[RFC][PATCH 8/8] perf: Sysfs events
https://lkml.org/lkml/2010/11/17/489
That patch should still apply cleanly to the latest tree.
In the end we want all events (including regular hw events) to be easily
available via sysfs enumeration. The goal is that ideally when you type 'perf
list' it should only access /sys.
Thanks,
Ingo
On Tue, May 03, 2011 at 04:22:34AM -0400, Ingo Molnar wrote:
> Could you send a Git pull request against perf/core that does the
> obvious file movement and renaming? We can do that first step straight
> away and get the ball rolling.
Sure, here we go:
The following changes since commit ac0a3260f37b8616da8d33488ec94b94e6ae5b31:
Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/core (2011-05-01 19:11:42 +0200)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git perf-event-rename
Borislav Petkov (1):
perf: Start the restructuring
kernel/Makefile | 5 +++--
kernel/events/Makefile | 5 +++++
kernel/{perf_event.c => events/core.c} | 0
3 files changed, 8 insertions(+), 2 deletions(-)
create mode 100644 kernel/events/Makefile
rename kernel/{perf_event.c => events/core.c} (100%)
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, May 03, 2011 at 08:40:46AM +0200, Ingo Molnar wrote:
> [ kernel/hw_breakpoint.c could possibly move into kernel/events/hw_breakpoint.c
> file as well - if Frederic acks it. ]
Makes sense yeah.
Thanks.
>
> Thanks,
>
> Ingo
On Tue, May 03, 2011 at 08:59:35AM -0400, Frederic Weisbecker wrote:
> On Tue, May 03, 2011 at 08:40:46AM +0200, Ingo Molnar wrote:
> > [ kernel/hw_breakpoint.c could possibly move into kernel/events/hw_breakpoint.c
> > file as well - if Frederic acks it. ]
>
> Makes sense yeah.
Ok, looks trivial enough. Ingo, let me know if I should add it to the
pull request from earlier.
--
>From 48dbb6dc86ca5d1b2224937d774c7ba98bc3a485 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <[email protected]>
Date: Tue, 3 May 2011 15:26:43 +0200
Subject: [PATCH] hw breakpoints: Move to kernel/events/
As part of the events sybsystem unification, relocate hw_breakpoint.c
into its new destination.
Cc: Frederic Weisbecker <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
kernel/Makefile | 1 -
kernel/events/Makefile | 3 ++-
kernel/{ => events}/hw_breakpoint.c | 0
3 files changed, 2 insertions(+), 2 deletions(-)
rename kernel/{ => events}/hw_breakpoint.c (100%)
diff --git a/kernel/Makefile b/kernel/Makefile
index 7981530..e9cf191 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -105,7 +105,6 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += events/
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 26c00e4..1ce23d3 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
-obj-y += core.o
+obj-y := core.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
similarity index 100%
rename from kernel/hw_breakpoint.c
rename to kernel/events/hw_breakpoint.c
--
1.7.4.rc2
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, May 03, 2011 at 09:30:57AM -0400, Borislav Petkov wrote:
> On Tue, May 03, 2011 at 08:59:35AM -0400, Frederic Weisbecker wrote:
> > On Tue, May 03, 2011 at 08:40:46AM +0200, Ingo Molnar wrote:
> > > [ kernel/hw_breakpoint.c could possibly move into kernel/events/hw_breakpoint.c
> > > file as well - if Frederic acks it. ]
> >
> > Makes sense yeah.
>
> Ok, looks trivial enough. Ingo, let me know if I should add it to the
> pull request from earlier.
I slapped it into a new branch along with the previous patch:
git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git perf-event-rename-2
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, 2011-05-03 at 09:18 +0200, Borislav Petkov wrote:
> On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
> > * Borislav Petkov <[email protected]> wrote:
[]
> > > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > > +{
> > > + char buf[14];
> > > +
> > > + sprintf(buf, "mce_record%d", cpu);
trivia:
buf[14] doesn't fit 10k+ cpus.
Isn't that an unnecessary future/current problem?
Maybe something bigger?
char buf[sizeof("mce_record") + sizeof(int) * 3]
On Tue, May 03, 2011 at 11:14:16AM -0400, Joe Perches wrote:
> On Tue, 2011-05-03 at 09:18 +0200, Borislav Petkov wrote:
> > On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
> > > * Borislav Petkov <[email protected]> wrote:
> []
> > > > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > > > +{
> > > > + char buf[14];
> > > > +
> > > > + sprintf(buf, "mce_record%d", cpu);
>
> trivia:
>
> buf[14] doesn't fit 10k+ cpus.
I'll fix that when someone comes up with a x86 system that supports 10k+
CPUs with a single system image :).
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
On Tue, 2011-05-03 at 17:22 +0200, Borislav Petkov wrote:
> On Tue, May 03, 2011 at 11:14:16AM -0400, Joe Perches wrote:
> > On Tue, 2011-05-03 at 09:18 +0200, Borislav Petkov wrote:
> > > On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
> > > > * Borislav Petkov <[email protected]> wrote:
> > []
> > > > > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > > > > +{
> > > > > + char buf[14];
> > > > > +
> > > > > + sprintf(buf, "mce_record%d", cpu);
> > trivia:
> > buf[14] doesn't fit 10k+ cpus.
> I'll fix that when someone comes up with a x86 system that supports 10k+
> CPUs with a single system image :).
Your choice.
I think leaving trivially fixable known defects around
isn't good engineering, but maybe that's just me.
On Tue, 2011-05-03 at 17:22 +0200, Borislav Petkov wrote:
> On Tue, May 03, 2011 at 11:14:16AM -0400, Joe Perches wrote:
> > On Tue, 2011-05-03 at 09:18 +0200, Borislav Petkov wrote:
> > > On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
> > > > * Borislav Petkov <[email protected]> wrote:
> > []
> > > > > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > > > > +{
> > > > > + char buf[14];
There should be no harm in upping that number to 32. (I've seen much
worse on stacks).
> > > > > +
> > > > > + sprintf(buf, "mce_record%d", cpu);
At least change this to:
snprintf(buf, 14, ....)
> >
> > trivia:
> >
> > buf[14] doesn't fit 10k+ cpus.
>
> I'll fix that when someone comes up with a x86 system that supports 10k+
> CPUs with a single system image :).
>
Best to fix it now, to save the trouble of finding this bug for those
that are maintaining this code long after we have retired or are dead ;)
-- Steve
On Tue, May 03, 2011 at 11:34:32AM -0400, Steven Rostedt wrote:
> On Tue, 2011-05-03 at 17:22 +0200, Borislav Petkov wrote:
> > On Tue, May 03, 2011 at 11:14:16AM -0400, Joe Perches wrote:
> > > On Tue, 2011-05-03 at 09:18 +0200, Borislav Petkov wrote:
> > > > On Tue, May 03, 2011 at 02:44:01AM -0400, Ingo Molnar wrote:
> > > > > * Borislav Petkov <[email protected]> wrote:
> > > []
> > > > > > +static struct dentry *mce_add_event_debugfs(struct perf_event *event, int cpu)
> > > > > > +{
> > > > > > + char buf[14];
>
> There should be no harm in upping that number to 32. (I've seen much
> worse on stacks).
>
> > > > > > +
> > > > > > + sprintf(buf, "mce_record%d", cpu);
>
> At least change this to:
>
> snprintf(buf, 14, ....)
Yep, this is the only sane suggestion I've got, thanks.
Other than that, we shouldn't be concentrating too much on this since
we're moving to /sysfs anyway and there we have the per-CPU hierarchy
with /sys/devices/system/cpu/cpu%d...
> > I'll fix that when someone comes up with a x86 system that supports 10k+
> > CPUs with a single system image :).
> >
>
> Best to fix it now, to save the trouble of finding this bug for those
> that are maintaining this code long after we have retired or are dead ;)
Nah, I'll leave it in so that they can have a blast from the past and
can boast on the slashdot of the future or on ((LWN++)..)++ that they've
fixed the oldest bug in Linux when booting it on quantum x86 computers
and tried running their RAS daemon :-).
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
> Ok, the problem I see with it is that people without a RAS daemon
> running will have the mechanism collecting MCEs in the background, using
> up resources (4 pages per CPU is the buffer) and not doing anything (in
> the best case that is, when we're not broken otherwise).
Can the kernel detect whether anyone is listening to the
persistent MCE event? If so, then the kernel could printk()
something to let the user with no RAS daemon (or a dead
daemon) that stuff is happening that they might like to
know about.
Probably make some sense to delay such a message (so that in
the boot case we give the daemon a chance to get started before
complaining that it hasn't shown up for work).
-Tony
On Tue, May 03, 2011 at 01:17:50PM -0400, Luck, Tony wrote:
> > Ok, the problem I see with it is that people without a RAS daemon
> > running will have the mechanism collecting MCEs in the background, using
> > up resources (4 pages per CPU is the buffer) and not doing anything (in
> > the best case that is, when we're not broken otherwise).
>
> Can the kernel detect whether anyone is listening to the
> persistent MCE event? If so, then the kernel could printk()
> something to let the user with no RAS daemon (or a dead
> daemon) that stuff is happening that they might like to
> know about.
Right, so I have a primitive way to do that when you enable ras over the
command line, i.e. boot with "ras=on." But that doesn't help in cases
where the daemon dies for some reason.
Maybe the decoding path should look at whether the event descriptor is
still mmapped or whether the event is enabled; let me think about it a
bit longer, good point btw!
> Probably make some sense to delay such a message (so that in
> the boot case we give the daemon a chance to get started before
> complaining that it hasn't shown up for work).
Yep, that and also I need to address the case for catching earlybird
MCEs, when perf hasn't been initialized yet. I'm thinking we could
reuse the mcelog buffer and feed those into the RAS daemon after init.
Something like that.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
* Borislav Petkov <[email protected]> wrote:
> > Probably make some sense to delay such a message (so that in the boot case
> > we give the daemon a chance to get started before complaining that it
> > hasn't shown up for work).
>
> Yep, that and also I need to address the case for catching earlybird MCEs,
> when perf hasn't been initialized yet. I'm thinking we could reuse the mcelog
> buffer and feed those into the RAS daemon after init. Something like that.
I'd suggest that to solve that problem we initialize events earlier and allow
something like a small static buffer backed event which does not need any
dynamic memory so it can be enabled early on and can receive events, at the
same time the MCE hw bits are enabled.
That event can get a larger buffer once the system is more fully up - without
losing whatever events it is already holding.
Thanks,
Ingo
* Luck, Tony <[email protected]> wrote:
> > Ok, the problem I see with it is that people without a RAS daemon
> > running will have the mechanism collecting MCEs in the background, using
> > up resources (4 pages per CPU is the buffer) and not doing anything (in
> > the best case that is, when we're not broken otherwise).
>
> Can the kernel detect whether anyone is listening to the
> persistent MCE event? If so, then the kernel could printk()
> something to let the user with no RAS daemon (or a dead
> daemon) that stuff is happening that they might like to
> know about.
>
> Probably make some sense to delay such a message (so that in
> the boot case we give the daemon a chance to get started before
> complaining that it hasn't shown up for work).
Yes, i definitely think a gateway to printk would be useful, so that the system
can log MCE events the syslog way as well. This probably makes sense for
persistent events in general, not just MCE events.
Btw., as a sidenote, the much more interesting direction is the reverse
direction: we want a gateway of printk into the RAS daemon as well - in form of
a special 'printk events' that contain:
- the log level of the kernel when the message was generated
- the log level of the message
- the printk timestamp
- plus the printk message itself, as a free-form string
This would allow RAS functionality to dispatch off printk events immediately
and transparently, without having to separately worry about how to talk to
syslogd/klogd how to get its logs ...
printk itself could become a persistent event. (Transparently and without
breaking compatible syslogd/klogd functionality.)
This would also allow the RAS daemon to log printk messages around suspicious
MCE events, in a time-serialized way via a single event channel - so post
mortem can be done using a single facility.
There's ongoing work to timestamp perf events with GTOD timestamps - that way
global log analysis becomes possible as well.
Thanks,
Ingo
> Yes, i definitely think a gateway to printk would be useful, so that the system
> can log MCE events the syslog way as well. This probably makes sense for
> persistent events in general, not just MCE events.
s/as well/instead/ ??? If the persistent event mechanism is correctly feeding
data to a mart daemon, I don't think we need any printk() chatter. It is only
if this is not working that we'd want to see some console logging.
I agree that this isn't just a property of the MCE persistent event - other
persistent events would very likely want a way to shout for help if the events
are piling up with no listener.
> printk itself could become a persistent event. (Transparently and without
> breaking compatible syslogd/klogd functionality.)
Someone from Google was very skeptical of printk() remaining stable from
release to release ... a big issue when you have some heavy duty infrastructure
trying to parse and consume these messages. We should really consider such
stuff a user visible ABI, and thus not subject to random breakage - which
is a radical departure from our current attitude to printk().
-Tony
Em Wed, May 04, 2011 at 02:40:37PM -0700, Luck, Tony escreveu:
> > Yes, i definitely think a gateway to printk would be useful, so that the system
> > can log MCE events the syslog way as well. This probably makes sense for
> > persistent events in general, not just MCE events.
>
> s/as well/instead/ ??? If the persistent event mechanism is correctly feeding
> data to a mart daemon, I don't think we need any printk() chatter. It is only
> if this is not working that we'd want to see some console logging.
>
> I agree that this isn't just a property of the MCE persistent event - other
> persistent events would very likely want a way to shout for help if the events
> are piling up with no listener.
>
> > printk itself could become a persistent event. (Transparently and without
> > breaking compatible syslogd/klogd functionality.)
>
> Someone from Google was very skeptical of printk() remaining stable from
> release to release ... a big issue when you have some heavy duty infrastructure
> trying to parse and consume these messages. We should really consider such
> stuff a user visible ABI, and thus not subject to random breakage - which
> is a radical departure from our current attitude to printk().
what is the problem with adding free form additional info to whatever
turns into heavenly for ever unchanged dogma? :-)
- Arnaldo
* Luck, Tony <[email protected]> wrote:
> > Yes, i definitely think a gateway to printk would be useful, so that the system
> > can log MCE events the syslog way as well. This probably makes sense for
> > persistent events in general, not just MCE events.
>
> s/as well/instead/ ??? If the persistent event mechanism is correctly feeding
> data to a mart daemon, I don't think we need any printk() chatter. It is only
> if this is not working that we'd want to see some console logging.
That could certainly be the default incarnation of it, but flexibly allowing
all the variations does not look particularly bothersome either.
I have no problem with only offering the sanest variations though.
> I agree that this isn't just a property of the MCE persistent event - other
> persistent events would very likely want a way to shout for help if the
> events are piling up with no listener.
Yeah. Basically a fallback mechanism and would also inform users about the
availability of a nice RAS daemon out there.
> > printk itself could become a persistent event. (Transparently and without
> > breaking compatible syslogd/klogd functionality.)
>
> Someone from Google was very skeptical of printk() remaining stable from
> release to release ... [...]
Yeah, the printk messages themselves are not ABI nor will they ever be -
although spurious changes are rare so they might provide a bridge to structured
events.
printk events are a compatibility wrapper to allow RAS functionality to have
easy and unified access to all system events that matter. The structure of
printk events is obviously the log level plus a free-form ASCII string,
something like:
1- the printk timestamp
2- the log level of the kernel when the message was generated
3- the log level of the message
4- the printk message itself, as a free-form string
> [...] a big issue when you have some heavy duty infrastructure trying to
> parse and consume these messages. We should really consider such stuff a
> user visible ABI, and thus not subject to random breakage - which is a
> radical departure from our current attitude to printk().
Indeed, turning printk into an ABI clearly wont fly upstream although i'd
expect upstream to *care more* about good printk messages if the RAS daemon
starts making good use of it. Any printk message that turns out to be useful
can be turned into an ABI by defining a proper structured event out of it, via
TRACE_EVENT() et al.
This does not mean that it's not *useful* to allow the streaming of all print
evnts to the RAS daemon. They are available, they get generated and they
clearly look useful to me, and it will be useful when a sysadmin looks at the
RAS log to figure out an incident.
Consider an example of two logs, one with just pure RAS events, the other with
printk lines (and user-space events, see my patch a couple of months ago that
allows event injection for critical user-space events as well) embedded:
The MCE-only log:
Subsystem | Time | event
------------------------------------------------------------------
[MCE] May 5 05:23:56 correctable MCE event on memory bank X
[MCE] May 5 06:19:59 correctable MCE event on memory bank X
Versus a broader, unified log (all events come via the perf event mmap
ring-buffer, ordered properly and delivered quickly and transparently):
Subsystem | Time | event
------------------------------------------------------------------
[MCE] May 5 05:23:56 correctable MCE event on memory bank X
[printk] May 5 06:19:53 thermal trip triggered
[MCE] May 5 06:19:59 correctable MCE event on memory bank X
[fault] May 5 06:20:00 delivered SIGSEGV to task 'httpd'
[httpd] May 5 06:20:00 unexpected restart
[printk] May 5 06:20:01 EXT4-fs (9345): group descriptors corrupted!
As a sysadmin i might misinterpret the first one as a low and still acceptable
rate of correctable MCE errors: roughly one event per hour.
I'd take the second log *much* more seriously and would prioritize this
incident as it likely indicates bad (overheating?) hardware and user-visible
crashes and possible uncorrected data corruption.
Note that we made use of printk events, fault events and user-space injected
events as well, in addition to the primary MCE events.
And yes, some of the printk events, if they are relied on frequently and
programmatically, will be turned into proper events - and this process is
helped by printk events.
As i understood it, being useful in such a way is one of the main goals of the
new RAS daemon.
Thanks,
Ingo
On Thu, May 05, 2011 at 02:39:51AM -0400, Ingo Molnar wrote:
> printk events are a compatibility wrapper to allow RAS functionality to have
> easy and unified access to all system events that matter. The structure of
> printk events is obviously the log level plus a free-form ASCII string,
> something like:
>
> 1- the printk timestamp
Yeah, we want here the timestamp when the event happened.
> 2- the log level of the kernel when the message was generated
> 3- the log level of the message
> 4- the printk message itself, as a free-form string
>
> > [...] a big issue when you have some heavy duty infrastructure trying to
> > parse and consume these messages. We should really consider such stuff a
> > user visible ABI, and thus not subject to random breakage - which is a
> > radical departure from our current attitude to printk().
>
> Indeed, turning printk into an ABI clearly wont fly upstream although i'd
> expect upstream to *care more* about good printk messages if the RAS daemon
> starts making good use of it. Any printk message that turns out to be useful
> can be turned into an ABI by defining a proper structured event out of it, via
> TRACE_EVENT() et al.
Actually let's have the RAS printk's as TRACE_EVENT's from the start
- it's not like we're going to convert every printk call into a RAS
printk event. We only want relevant ones from traps.c, maybe some power
management events, fs, maybe some critical security stuff, etc.
> This does not mean that it's not *useful* to allow the streaming of all print
> evnts to the RAS daemon. They are available, they get generated and they
> clearly look useful to me, and it will be useful when a sysadmin looks at the
> RAS log to figure out an incident.
>
> Consider an example of two logs, one with just pure RAS events, the other with
> printk lines (and user-space events, see my patch a couple of months ago that
> allows event injection for critical user-space events as well) embedded:
>
> The MCE-only log:
>
> Subsystem | Time | event
> ------------------------------------------------------------------
> [MCE] May 5 05:23:56 correctable MCE event on memory bank X
> [MCE] May 5 06:19:59 correctable MCE event on memory bank X
>
> Versus a broader, unified log (all events come via the perf event mmap
> ring-buffer, ordered properly and delivered quickly and transparently):
Yes, especially since we can get it out to userspace even faster than
printk :).
> Subsystem | Time | event
> ------------------------------------------------------------------
> [MCE] May 5 05:23:56 correctable MCE event on memory bank X
> [printk] May 5 06:19:53 thermal trip triggered
> [MCE] May 5 06:19:59 correctable MCE event on memory bank X
> [fault] May 5 06:20:00 delivered SIGSEGV to task 'httpd'
> [httpd] May 5 06:20:00 unexpected restart
> [printk] May 5 06:20:01 EXT4-fs (9345): group descriptors corrupted!
^^ I wouldn't even call it "printk" since this is obviously [fs].
The printk event should have a field that says from which subsys the
printk comes from and thus make it so integrated that it is even
invisible :).
And
> [printk] May 5 06:19:53 thermal trip triggered
could be
> [pm] May 5 06:19:53 thermal trip triggered
for power management.
> As a sysadmin i might misinterpret the first one as a low and still acceptable
> rate of correctable MCE errors: roughly one event per hour.
>
> I'd take the second log *much* more seriously and would prioritize this
> incident as it likely indicates bad (overheating?) hardware and user-visible
> crashes and possible uncorrected data corruption.
>
> Note that we made use of printk events, fault events and user-space injected
> events as well, in addition to the primary MCE events.
>
> And yes, some of the printk events, if they are relied on frequently and
> programmatically, will be turned into proper events - and this process is
> helped by printk events.
>
> As i understood it, being useful in such a way is one of the main goals of the
> new RAS daemon.
Yep, this is absolutely what we want to do - we want to have RAS
events not only coupled with hardware events but actually collect all
_relevant_ events into a common RAS log that is very lightweight and
doesn't rely on printk. It can, _however_, be turned into printk's (and
it should) if no daemon is running. And yes, we can have special tags
like "[[..]]" or whatever to be able to grep it out even from dmesg.
Cool,
thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
* Borislav Petkov <[email protected]> wrote:
> On Thu, May 05, 2011 at 02:39:51AM -0400, Ingo Molnar wrote:
> > printk events are a compatibility wrapper to allow RAS functionality to have
> > easy and unified access to all system events that matter. The structure of
> > printk events is obviously the log level plus a free-form ASCII string,
> > something like:
> >
> > 1- the printk timestamp
>
> Yeah, we want here the timestamp when the event happened.
>
> > 2- the log level of the kernel when the message was generated
> > 3- the log level of the message
> > 4- the printk message itself, as a free-form string
> >
> > > [...] a big issue when you have some heavy duty infrastructure trying to
> > > parse and consume these messages. We should really consider such stuff a
> > > user visible ABI, and thus not subject to random breakage - which is a
> > > radical departure from our current attitude to printk().
> >
> > Indeed, turning printk into an ABI clearly wont fly upstream although i'd
> > expect upstream to *care more* about good printk messages if the RAS daemon
> > starts making good use of it. Any printk message that turns out to be useful
> > can be turned into an ABI by defining a proper structured event out of it, via
> > TRACE_EVENT() et al.
>
> Actually let's have the RAS printk's as TRACE_EVENT's from the start
> - it's not like we're going to convert every printk call into a RAS
> printk event. [...]
Fully agreed that printk should be a TRACE_EVENT() from the get go.
What i meant was that it also gives an opportunity for the introduction of new
TRACE_EVENT()s: if RAS tooling sees problems with some important printk
changing its format all the time then such problems can be addressed by
'upgrading' that printk event to a TRACE_EVENT().
> [...] We only want relevant ones from traps.c, maybe some power management
> events, fs, maybe some critical security stuff, etc.
Yeah. And what that 'critical stuff' is will probably be found out gradually.
In the meanwhile we'll have printk events as a starting point.
The printk event is also useful for a practical reason: if you add it right now
you can test the RAS daemon and provoke a steady stream of events easily by
catching (and generating) printk events.
( We also want event injection to work, to be able to simulate real MCE
events. )
Thanks,
Ingo