From: Borislav Petkov <[email protected]>
Hi all,
off and on I get some free time to work on that, here's the latest
incarnation. It contains review feedback from the earlier round.
Patch 1/4 adds a trace_add_file() interface which adds an additional
file to debugfs, in this case the "persistent" file which contains the
normal perf file descriptor sys_perf_event_open gives to the perf tool.
IOW, one gets:
/mnt/dbg/tracing/events/mce/mce_record/
|-- enable
|-- filter
|-- format
|-- id
`-- persistent1
0 directories, 5 files
[ 1 is the CPU number so sticking all per-CPU descriptors in this
directory could get a little cluttered and ugly so I'll have to think
about that a bit more. ]
3/4 is the meat which adds <kernel/events/persistent.c> and 4/4 shows
how one can init a persistent event on a CPU.
What remains is adding code which can enable events on boot from the
kernel cmdline and more testing.
As always, comments and suggestions are appreciated.
Thanks.
Borislav Petkov (4):
trace events: Interface to add files to debugfs
perf: Add persistent events
perf: Add persistent event facilities
persistent test
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mcheck/mce.c | 5 ++
include/linux/ftrace_event.h | 3 +
include/linux/perf_event.h | 24 +++++-
kernel/events/Makefile | 2 +-
kernel/events/core.c | 18 +++--
kernel/events/internal.h | 2 +
kernel/events/persistent.c | 168 +++++++++++++++++++++++++++++++++++++++
kernel/trace/trace.c | 4 +
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 35 ++++++++
11 files changed, 254 insertions(+), 9 deletions(-)
create mode 100644 kernel/events/persistent.c
--
1.7.11.rc1
From: Borislav Petkov <[email protected]>
Add the needed pieces for persistent events which are process-agnostic.
Also, make their buffers read-only when mmaping them from userspace.
Add a descriptor struct collecting all that is needed for a persistent
event, for ease of handling in the code later.
Signed-off-by: Borislav Petkov <[email protected]>
---
include/linux/perf_event.h | 17 ++++++++++++++++-
kernel/events/core.c | 5 ++++-
2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7602ccb3f40e..252aab74e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -254,8 +254,9 @@ struct perf_event_attr {
exclude_host : 1, /* don't count in host */
exclude_guest : 1, /* don't count in guest */
+ persistent : 1, /* always-on event */
- __reserved_1 : 43;
+ __reserved_1 : 42;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1078,6 +1079,20 @@ struct perf_output_handle {
int page;
};
+/*
+ * perf event descriptor to collect all attributes belonging to an event
+ */
+struct perf_event_desc {
+ struct perf_event *event;
+ struct perf_event_attr *pattr;
+ const struct file_operations *fops;
+ /* debugfs entry */
+ struct dentry *dentry;
+ const char *dir_name;
+ const char *fname;
+ int fd;
+};
+
#ifdef CONFIG_PERF_EVENTS
extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b7935fcec7d9..95026f2b3d55 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3589,7 +3589,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (event->cpu == -1 && event->attr.inherit)
return -EINVAL;
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_SHARED) && !event->attr.persistent)
+ return -EINVAL;
+
+ if (event->attr.persistent && (vma->vm_flags & VM_WRITE))
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
--
1.7.11.rc1
From: Borislav Petkov <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mcheck/mce.c | 5 +++++
kernel/events/persistent.c | 41 +++++++++++++++++++++++++++++++++++++++-
3 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index a3ac52b29cbf..e46b05dff64a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -256,5 +256,6 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);
+extern int perf_get_mce_event_id(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 292d0258311c..cacd45bfac01 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2445,3 +2445,8 @@ static int __init mcheck_debugfs_init(void)
}
late_initcall(mcheck_debugfs_init);
#endif
+
+int perf_get_mce_event_id(void)
+{
+ return event_mce_record.event.type;
+}
diff --git a/kernel/events/persistent.c b/kernel/events/persistent.c
index 33d45396dc02..edb07640d5f1 100644
--- a/kernel/events/persistent.c
+++ b/kernel/events/persistent.c
@@ -10,7 +10,7 @@
#include <asm/mce.h>
-static int pers_open_generic(struct inode *inode, struct file *filp)
+int pers_open_generic(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
return 0;
@@ -127,3 +127,42 @@ void perf_rm_persistent_event(struct perf_event_desc *desc)
perf_event_release_kernel(event);
}
+
+static struct perf_event_attr pattr = {
+ .type = PERF_TYPE_TRACEPOINT,
+ .size = sizeof(pattr),
+ .sample_type = PERF_SAMPLE_RAW,
+ .persistent = 1,
+};
+
+static struct perf_event_desc pdesc = {
+ .dir_name = "mce_record",
+};
+
+static int __init pers_init(void)
+{
+ struct perf_event *event;
+ char *fname;
+ int cpu = 1;
+
+ fname = kzalloc(10 + 4, GFP_KERNEL);
+ if (!fname)
+ return -ENOMEM;
+
+ snprintf(fname, 14, "persistent%d", cpu);
+ pdesc.fname = fname;
+
+ pattr.config = perf_get_mce_event_id();
+ pattr.sample_period = 1;
+ pattr.wakeup_events = 1,
+ pdesc.pattr = &pattr;
+
+ event = perf_add_persistent_on_cpu(cpu, &pdesc, 4);
+ if (IS_ERR(event)) {
+ pr_err("%s: Error adding persistent event!\n", __func__);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+late_initcall(pers_init);
--
1.7.11.rc1
From: Borislav Petkov <[email protected]>
This generic interface is supposed to be used to add files to
(debugfs)/tracing/events/<subsys>/<event_name>/ which are supposed to
export additional events functionality to users.
For example, the file descriptor of a persistent event can be exported
like this in the 'pers' node:
/mnt/dbg/tracing/events/mce/mce_record/
|-- enable
|-- filter
|-- format
|-- id
`-- persistent
So that userspace tools can read it out and map it directly.
Cc: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
include/linux/ftrace_event.h | 3 +++
kernel/trace/trace.c | 4 ++++
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 35 +++++++++++++++++++++++++++++++++++
4 files changed, 43 insertions(+)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 642928cf57b4..f4f283954cbf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -265,6 +265,9 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type,
int is_signed, int filter_type);
extern int trace_add_event_call(struct ftrace_event_call *call);
extern void trace_remove_event_call(struct ftrace_event_call *call);
+extern struct dentry *
+trace_add_file(const char *dir_name, const char *fname, umode_t mode,
+ void *data, const struct file_operations *fops);
#define is_signed_type(type) (((type)(-1)) < 0)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c38c81496ce..e45479572328 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4641,6 +4641,10 @@ struct dentry *trace_create_file(const char *name,
return ret;
}
+void trace_remove_file(struct dentry *d)
+{
+ return debugfs_remove(d);
+}
static struct dentry *trace_options_init_dentry(void)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 55e1f7f0db12..fd639f45c425 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -338,6 +338,7 @@ struct dentry *trace_create_file(const char *name,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+void trace_remove_file(struct dentry *d);
struct dentry *tracing_init_dentry(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29111da1d100..5eda4f584380 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1199,6 +1199,41 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return 0;
}
+static struct dentry *__trace_add_file(struct ftrace_event_call *call,
+ const char *fname, umode_t mode,
+ void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *ret;
+
+ mutex_lock(&event_mutex);
+ ret = trace_create_file(fname, mode, call->dir, data, fops);
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+/*
+ * Assumptions:
+ * - event debugfs dir is already initialized
+ * - trace event is not declared in a module
+ */
+struct dentry *trace_add_file(const char *dir_name, const char *fname,
+ umode_t mode, void *data,
+ const struct file_operations *fops)
+{
+ struct ftrace_event_call *call;
+
+ if (!strlen(fname))
+ return NULL;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!strncmp(call->name, dir_name, strlen(dir_name)))
+ return __trace_add_file(call, fname, mode, data, fops);
+ }
+ return NULL;
+}
+
static int
__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
const struct file_operations *id,
--
1.7.11.rc1
From: Borislav Petkov <[email protected]>
Add a barebones implementation for registering persistent events with
perf. For that, we don't destroy the buffers when they're unmapped;
also, we map them read-only so that multiple agents can access them.
Also, we allocate event buffer at event allocation time and not at mmap
time so that we can log samples into it regardless of whether there are
readers in userspace.
Signed-off-by: Borislav Petkov <[email protected]>
---
include/linux/perf_event.h | 7 +++
kernel/events/Makefile | 2 +-
kernel/events/core.c | 13 ++---
kernel/events/internal.h | 2 +
kernel/events/persistent.c | 129 +++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 146 insertions(+), 7 deletions(-)
create mode 100644 kernel/events/persistent.c
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 252aab74e64d..95073ac4186f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1312,6 +1312,10 @@ extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern struct perf_event *perf_add_persistent_on_cpu(unsigned int,
+ struct perf_event_desc *,
+ unsigned nr_pages);
+extern void perf_rm_persistent_event(struct perf_event_desc *desc);
#else
static inline void
perf_event_task_sched_in(struct task_struct *prev,
@@ -1350,6 +1354,9 @@ static inline void perf_swevent_put_recursion_context(int rctx) { }
static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event) { }
static inline void perf_event_task_tick(void) { }
+static inline struct perf_event *perf_add_persistent_on_cpu
+(unsigned int cpu, struct perf_event_desc *desc, unsigned nr_pages) { return NULL; }
+static inline void perf_rm_persistent_event(struct perf_event_desc *) { }
#endif
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..70990d5a2037 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,7 +2,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
-obj-y := core.o ring_buffer.o callchain.o
+obj-y := core.o ring_buffer.o callchain.o persistent.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95026f2b3d55..e59579d442ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2854,8 +2854,6 @@ static void free_event_rcu(struct rcu_head *head)
kfree(event);
}
-static void ring_buffer_put(struct ring_buffer *rb);
-
static void free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3225,8 +3223,6 @@ unlock:
return ret;
}
-static const struct file_operations perf_fops;
-
static struct perf_event *perf_fget_light(int fd, int *fput_needed)
{
struct file *file;
@@ -3517,7 +3513,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
{
struct perf_event *event, *n;
unsigned long flags;
@@ -3546,6 +3542,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ if (event->attr.persistent) {
+ atomic_dec(&event->mmap_count);
+ return;
+ }
+
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
unsigned long size = perf_data_size(event->rb);
struct user_struct *user = event->mmap_user;
@@ -3694,7 +3695,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
return 0;
}
-static const struct file_operations perf_fops = {
+const struct file_operations perf_fops = {
.llseek = no_llseek,
.release = perf_release,
.read = perf_read,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index a096c19f2c2a..a13834f26d93 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -37,6 +37,7 @@ struct ring_buffer {
extern void rb_free(struct ring_buffer *rb);
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void ring_buffer_put(struct ring_buffer *rb);
extern void perf_event_wakeup(struct perf_event *event);
extern void
@@ -134,4 +135,5 @@ static inline void put_recursion_context(int *recursion, int rctx)
recursion[rctx]--;
}
+extern const struct file_operations perf_fops;
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/persistent.c b/kernel/events/persistent.c
new file mode 100644
index 000000000000..33d45396dc02
--- /dev/null
+++ b/kernel/events/persistent.c
@@ -0,0 +1,129 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
+#include <linux/perf_event.h>
+#include <linux/anon_inodes.h>
+#include <linux/ftrace_event.h>
+
+#include "../trace/trace.h"
+#include "internal.h"
+
+#include <asm/mce.h>
+
+static int pers_open_generic(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+static ssize_t
+pers_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct perf_event_desc *d = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+ trace_seq_printf(s, "%d\n", d->fd);
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ kfree(s);
+ return r;
+}
+
+static const struct file_operations persistent_fops = {
+ .open = pers_open_generic,
+ .read = pers_event_read,
+ .llseek = default_llseek,
+};
+
+/*
+ * Create and enable the persistent version of the perf event described by
+ * @desc->pattr.
+ *
+ * Returns the @event pointer which receives the allocated event from
+ * perf on success. Make sure to check return code before touching @event
+ * any further.
+ *
+ * @cpu: on which cpu
+ * @desc: perf event descriptor
+ * @nr_pages: size in pages
+ */
+struct perf_event
+*perf_add_persistent_on_cpu(unsigned int cpu, struct perf_event_desc *desc,
+ unsigned nr_pages)
+{
+ struct perf_event *event = ERR_PTR(-EINVAL);
+ struct file *event_file = NULL;
+ struct ring_buffer *buf;
+ struct dentry *dentry;
+ int event_fd;
+
+ event_fd = get_unused_fd_flags(O_RDWR);
+ if (event_fd < 0)
+ return NULL;
+
+ dentry = trace_add_file(desc->dir_name, desc->fname, 0444,
+ desc, &persistent_fops);
+ if (!dentry) {
+ pr_err("Error adding trace file %s\n", desc->fname);
+ goto err_trace_file;
+ }
+
+ desc->dentry = dentry;
+
+ event_file = anon_inode_getfile("[pers_event]", &perf_fops, event, O_RDWR);
+ if (IS_ERR(event_file))
+ goto err_inode;
+
+ event = perf_event_create_kernel_counter(desc->pattr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event))
+ goto err_inode;
+
+ buf = rb_alloc(nr_pages, 0, cpu, RING_BUFFER_WRITABLE);
+ if (!buf)
+ goto err_counter;
+
+ rcu_assign_pointer(event->rb, buf);
+ fd_install(event_fd, event_file);
+ event->filp = event_file;
+ desc->fd = event_fd;
+
+ perf_event_enable(event);
+
+ return event;
+
+ err_counter:
+ perf_event_release_kernel(event);
+
+ err_inode:
+ trace_remove_file(desc->dentry);
+
+ err_trace_file:
+ put_unused_fd(event_fd);
+ return event;
+}
+
+void perf_rm_persistent_event(struct perf_event_desc *desc)
+{
+ struct perf_event *event = desc->event;
+
+ if (!event)
+ return;
+
+ perf_event_disable(event);
+
+ if (event->rb) {
+ ring_buffer_put(event->rb);
+ rcu_assign_pointer(event->rb, NULL);
+ }
+
+ perf_event_release_kernel(event);
+}
--
1.7.11.rc1
On Thu, 16 Aug 2012 19:45:19 +0200
Borislav Petkov <[email protected]> wrote:
> off and on I get some free time to work on that, here's the latest
> incarnation. It contains review feedback from the earlier round.
Can I add one small request for the next round? I've looked through the
patches and the code a bit, and I still have absolutely no clue of what a
"persistent event" is or why I might want one. Now, admittedly, I'm
slower than most, but it still might not hurt to add some overall
description of what this is for...?
Thanks,
jon
Hi Jon,
On Thu, Aug 16, 2012 at 02:12:53PM -0600, Jonathan Corbet wrote:
> On Thu, 16 Aug 2012 19:45:19 +0200
> Borislav Petkov <[email protected]> wrote:
>
> > off and on I get some free time to work on that, here's the latest
> > incarnation. It contains review feedback from the earlier round.
>
> Can I add one small request for the next round? I've looked through the
> patches and the code a bit, and I still have absolutely no clue of what a
> "persistent event" is or why I might want one. Now, admittedly, I'm
> slower than most, but it still might not hurt to add some overall
> description of what this is for...?
yeah, maybe the naming is not the most fitting one but here's the basic
idea:
Normally, you enable perf events for a duration of time where you trace
your workload and then you disable them. In contrast, persistent events
are perf events which you enable at system boot and they remain enabled
- thus persistent - during the whole system lifetime.
The machinery logs all events and carries them out to userspace. Much
like the blackbox of a plane which records system events during the
whole flight.
And the usecase here is that we want the machine check code to log
machine checks during the whole system lifetime using the perf/trace
events infrastructure. And this happens regardless of whether there's a
userspace consumer of the logged data. Normally, we'll have one though
:).
MCA and the trace_mce_record() tracepoint is thus is the first user of
the persistent events but there might be others.
Please follow this thread:
http://marc.info/?l=linux-kernel&m=133234057614885 for some more
thoughts on the idea.
Of course, this whole thing is basically a rough draft and we're still
hammering out the details as we go.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Thu, 2012-08-16 at 22:55 +0200, Borislav Petkov wrote:
> Hi Jon,
>
> On Thu, Aug 16, 2012 at 02:12:53PM -0600, Jonathan Corbet wrote:
> > On Thu, 16 Aug 2012 19:45:19 +0200
> > Borislav Petkov <[email protected]> wrote:
> >
> > > off and on I get some free time to work on that, here's the latest
> > > incarnation. It contains review feedback from the earlier round.
> >
> > Can I add one small request for the next round? I've looked through the
> > patches and the code a bit, and I still have absolutely no clue of what a
> > "persistent event" is or why I might want one. Now, admittedly, I'm
> > slower than most, but it still might not hurt to add some overall
> > description of what this is for...?
>
> yeah, maybe the naming is not the most fitting one but here's the basic
> idea:
>
> Normally, you enable perf events for a duration of time where you trace
> your workload and then you disable them. In contrast, persistent events
> are perf events which you enable at system boot and they remain enabled
> - thus persistent - during the whole system lifetime.
>
> The machinery logs all events and carries them out to userspace. Much
> like the blackbox of a plane which records system events during the
> whole flight.
>
> And the usecase here is that we want the machine check code to log
> machine checks during the whole system lifetime using the perf/trace
> events infrastructure. And this happens regardless of whether there's a
> userspace consumer of the logged data. Normally, we'll have one though
> :).
>
> MCA and the trace_mce_record() tracepoint is thus is the first user of
> the persistent events but there might be others.
>
> Please follow this thread:
> http://marc.info/?l=linux-kernel&m=133234057614885 for some more
> thoughts on the idea.
>
> Of course, this whole thing is basically a rough draft and we're still
> hammering out the details as we go.
BTW, we already have a persistent buffering in the kernel. It's used by
ftrace. What about having perf use that buffering for persistent events?
Or is there some other issues about using it.
I'm currently working on having perf read ftrace data, so in the near
future, I plan on having some RFC patches to have perf reading from this
buffer anyway.
-- Steve
On Thu, Aug 16, 2012 at 05:13:33PM -0400, Steven Rostedt wrote:
> BTW, we already have a persistent buffering in the kernel. It's used
> by ftrace. What about having perf use that buffering for persistent
> events? Or is there some other issues about using it.
Not that I know of.
I coded this with the perf ring buffer now. If only there was one ring
buffer in the kernel.. /me ducks and hides.
Ok, with perf I can read out the events programmatically by mmaping the
per-CPU buffer with perf_mmap. How do you do that in ftrace? Any code
pointers I can stare at?
> I'm currently working on having perf read ftrace data, so in the near
> future, I plan on having some RFC patches to have perf reading from
> this buffer anyway.
Are you saying the ftrace buffer would be mmappable too now?
Btw, I wanted to hear your opinion on patch 1/4 since it touches
ftrace/trace_events code. Can you please look at it and tell me if its
ok?
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Thu, 2012-08-16 at 23:41 +0200, Borislav Petkov wrote:
> On Thu, Aug 16, 2012 at 05:13:33PM -0400, Steven Rostedt wrote:
> > BTW, we already have a persistent buffering in the kernel. It's used
> > by ftrace. What about having perf use that buffering for persistent
> > events? Or is there some other issues about using it.
>
> Not that I know of.
>
> I coded this with the perf ring buffer now. If only there was one ring
> buffer in the kernel.. /me ducks and hides.
:)
>
> Ok, with perf I can read out the events programmatically by mmaping the
> per-CPU buffer with perf_mmap. How do you do that in ftrace? Any code
> pointers I can stare at?
Do these need to be mmapped, or are they copied into another location
(disk or network)? I had code to allow the ftrace ring buffers to be
mmapped, but stopped that work a while ago.
>
> > I'm currently working on having perf read ftrace data, so in the near
> > future, I plan on having some RFC patches to have perf reading from
> > this buffer anyway.
>
> Are you saying the ftrace buffer would be mmappable too now?
No, the ftrace buffer was optimized for the splice command. I would have
perf do what trace-cmd does. That is, write directly into a file as it
reads it, without ever needing the info to come into userspace. I wasn't
planning on replacing perf buffers, I was just planning on giving perf a
tracing boost.
Another alternative, which would be a bit slower, would be to copy the
pages, either into userspace (as a read) or into a location to mmap the
buffers.
I could also continue the work on allowing ftrace buffers to be mmapped.
But if the best thing to do is just use the perf buffers, then we could
just stick with that.
>
> Btw, I wanted to hear your opinion on patch 1/4 since it touches
> ftrace/trace_events code. Can you please look at it and tell me if its
> ok?
>
I'll take a look.
-- Steve
On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> This generic interface is supposed to be used to add files to
> (debugfs)/tracing/events/<subsys>/<event_name>/ which are supposed to
> export additional events functionality to users.
>
> For example, the file descriptor of a persistent event can be exported
> like this in the 'pers' node:
>
> /mnt/dbg/tracing/events/mce/mce_record/
> |-- enable
> |-- filter
> |-- format
> |-- id
> `-- persistent
This looks rather hacky :-/
Can't this be added to the perf syscall, and have the user query the
event instead?
Note, this code will change drastically soon, and this change will
break. I'll talk more about what will change at plumbers.
-- Steve
>
> So that userspace tools can read it out and map it directly.
>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> include/linux/ftrace_event.h | 3 +++
> kernel/trace/trace.c | 4 ++++
> kernel/trace/trace.h | 1 +
> kernel/trace/trace_events.c | 35 +++++++++++++++++++++++++++++++++++
> 4 files changed, 43 insertions(+)
>
> diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
> index 642928cf57b4..f4f283954cbf 100644
> --- a/include/linux/ftrace_event.h
> +++ b/include/linux/ftrace_event.h
> @@ -265,6 +265,9 @@ extern int trace_define_field(struct ftrace_event_call *call, const char *type,
> int is_signed, int filter_type);
> extern int trace_add_event_call(struct ftrace_event_call *call);
> extern void trace_remove_event_call(struct ftrace_event_call *call);
> +extern struct dentry *
> +trace_add_file(const char *dir_name, const char *fname, umode_t mode,
> + void *data, const struct file_operations *fops);
>
> #define is_signed_type(type) (((type)(-1)) < 0)
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 5c38c81496ce..e45479572328 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -4641,6 +4641,10 @@ struct dentry *trace_create_file(const char *name,
> return ret;
> }
>
> +void trace_remove_file(struct dentry *d)
> +{
> + return debugfs_remove(d);
> +}
>
> static struct dentry *trace_options_init_dentry(void)
> {
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 55e1f7f0db12..fd639f45c425 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -338,6 +338,7 @@ struct dentry *trace_create_file(const char *name,
> struct dentry *parent,
> void *data,
> const struct file_operations *fops);
> +void trace_remove_file(struct dentry *d);
>
> struct dentry *tracing_init_dentry(void);
>
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 29111da1d100..5eda4f584380 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1199,6 +1199,41 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
> return 0;
> }
>
> +static struct dentry *__trace_add_file(struct ftrace_event_call *call,
> + const char *fname, umode_t mode,
> + void *data,
> + const struct file_operations *fops)
> +{
> + struct dentry *ret;
> +
> + mutex_lock(&event_mutex);
> + ret = trace_create_file(fname, mode, call->dir, data, fops);
> + mutex_unlock(&event_mutex);
> +
> + return ret;
> +}
> +
> +/*
> + * Assumptions:
> + * - event debugfs dir is already initialized
> + * - trace event is not declared in a module
> + */
> +struct dentry *trace_add_file(const char *dir_name, const char *fname,
> + umode_t mode, void *data,
> + const struct file_operations *fops)
> +{
> + struct ftrace_event_call *call;
> +
> + if (!strlen(fname))
> + return NULL;
> +
> + list_for_each_entry(call, &ftrace_events, list) {
> + if (!strncmp(call->name, dir_name, strlen(dir_name)))
> + return __trace_add_file(call, fname, mode, data, fops);
> + }
> + return NULL;
> +}
> +
> static int
> __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
> const struct file_operations *id,
On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
>
> -static void ring_buffer_put(struct ring_buffer *rb)
> +void ring_buffer_put(struct ring_buffer *rb)
> {
> struct perf_event *event, *n;
> unsigned long flags;
I have to say that it is really unfortunate that we have two ring
buffers in the kernel, called struct ring_buffer. One being global and
one being local to events. And now we are exporting the name
"ring_buffer_*" too! This is going to confuse the hell out of people.
Perhaps this should have been called perf_buffer, as that is what it's
used for.
-- Steve
On Thu, Aug 16, 2012 at 06:06:48PM -0400, Steven Rostedt wrote:
> > /mnt/dbg/tracing/events/mce/mce_record/
> > |-- enable
> > |-- filter
> > |-- format
> > |-- id
> > `-- persistent
>
> This looks rather hacky :-/
Ya think?! :-)
Well, this whole machinery is only for telling to userspace which is the
file descriptor of the persistent event. And yes, it could probably be
done with a lot less code.
Hell, I'm even fine with issuing it in dmesg and userspace parsing the
output - talk about efficient implementation.
> Can't this be added to the perf syscall, and have the user query the
> event instead?
Do you mean perf_ioctl? For that to work I need to have a file
descriptor of the "normal" event first and then query it to get the
persistent file descriptor... Or do you mean something easier?
> Note, this code will change drastically soon, and this change will
> break. I'll talk more about what will change at plumbers.
I'll try to get a video or something... :)
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Thu, Aug 16, 2012 at 06:12:35PM -0400, Steven Rostedt wrote:
> On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
> >
> > -static void ring_buffer_put(struct ring_buffer *rb)
> > +void ring_buffer_put(struct ring_buffer *rb)
> > {
> > struct perf_event *event, *n;
> > unsigned long flags;
>
> I have to say that it is really unfortunate that we have two ring
> buffers in the kernel, called struct ring_buffer. One being global and
> one being local to events. And now we are exporting the name
> "ring_buffer_*" too! This is going to confuse the hell out of people.
>
> Perhaps this should have been called perf_buffer, as that is what it's
> used for.
Right, I guess the easiest would be to rename it for now.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Thu, Aug 16, 2012 at 06:00:18PM -0400, Steven Rostedt wrote:
> Do these need to be mmapped, or are they copied into another location
> (disk or network)? I had code to allow the ftrace ring buffers to be
> mmapped, but stopped that work a while ago.
Well, my angle here is that since those are persistent, i.e. always-on
events, they should be as low overhead as possible. So mmaping the
buffer is probably that since it doesn't involve any data shuffling to
and fro...
> > > I'm currently working on having perf read ftrace data, so in the near
> > > future, I plan on having some RFC patches to have perf reading from
> > > this buffer anyway.
> >
> > Are you saying the ftrace buffer would be mmappable too now?
>
> No, the ftrace buffer was optimized for the splice command.
Ah, the splice thing moving data between two fds.
> I would have perf do what trace-cmd does. That is, write directly into
> a file as it reads it, without ever needing the info to come into
> userspace.
Is that a real file?
Because what I was doing in the RAS daemon is mmap the buffer, read out
the logged events in userspace and work on them.
Having them go to a file which I need to open in userspace adds one more
step and I don't know how that would pan out in a critical situation of
you getting a machine check where every insn counts. And besides, in a
MCE situation you cannot be sure that the events would actually go to
file... So my current impression is that reading them direct from memory
is the fastest we can do. But this is only me and I have been wrong in
the past. Lotsa times :).
> I wasn't planning on replacing perf buffers, I was just planning on
> giving perf a tracing boost.
Cool.
> Another alternative, which would be a bit slower, would be to copy the
> pages, either into userspace (as a read) or into a location to mmap the
> buffers.
Yeah, copying is not a good idea, IMHO, due to the said above.
> I could also continue the work on allowing ftrace buffers to be
> mmapped. But if the best thing to do is just use the perf buffers,
> then we could just stick with that.
The easiest for now, I'd say, since it is already there.
> > Btw, I wanted to hear your opinion on patch 1/4 since it touches
> > ftrace/trace_events code. Can you please look at it and tell me if its
> > ok?
> >
> I'll take a look.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Fri, 2012-08-17 at 09:38 +0200, Borislav Petkov wrote:
> On Thu, Aug 16, 2012 at 06:00:18PM -0400, Steven Rostedt wrote:
> > Do these need to be mmapped, or are they copied into another location
> > (disk or network)? I had code to allow the ftrace ring buffers to be
> > mmapped, but stopped that work a while ago.
>
> Well, my angle here is that since those are persistent, i.e. always-on
> events, they should be as low overhead as possible. So mmaping the
> buffer is probably that since it doesn't involve any data shuffling to
> and fro...
Understood.
>
> > > > I'm currently working on having perf read ftrace data, so in the near
> > > > future, I plan on having some RFC patches to have perf reading from
> > > > this buffer anyway.
> > >
> > > Are you saying the ftrace buffer would be mmappable too now?
> >
> > No, the ftrace buffer was optimized for the splice command.
>
> Ah, the splice thing moving data between two fds.
Yep.
>
> > I would have perf do what trace-cmd does. That is, write directly into
> > a file as it reads it, without ever needing the info to come into
> > userspace.
>
> Is that a real file?
It's anything that needs a file descriptor. But yeah.
>
> Because what I was doing in the RAS daemon is mmap the buffer, read out
> the logged events in userspace and work on them.
OK, so you're not storing the data, your just using it to manipulate it.
If it wasn't for sparc, we could have done this with the ftrace buffers,
but sparc has some crazy requirement that physical pages can only be
mapped at certain locations, and you can't just take a bunch of random
pages and map them at the same location.
>
> Having them go to a file which I need to open in userspace adds one more
> step and I don't know how that would pan out in a critical situation of
> you getting a machine check where every insn counts. And besides, in a
> MCE situation you cannot be sure that the events would actually go to
> file... So my current impression is that reading them direct from memory
> is the fastest we can do. But this is only me and I have been wrong in
> the past. Lotsa times :).
>
> > I wasn't planning on replacing perf buffers, I was just planning on
> > giving perf a tracing boost.
>
> Cool.
>
> > Another alternative, which would be a bit slower, would be to copy the
> > pages, either into userspace (as a read) or into a location to mmap the
> > buffers.
>
> Yeah, copying is not a good idea, IMHO, due to the said above.
I agree.
>
> > I could also continue the work on allowing ftrace buffers to be
> > mmapped. But if the best thing to do is just use the perf buffers,
> > then we could just stick with that.
>
> The easiest for now, I'd say, since it is already there.
Yeah, I agree with this too. As the perf buffer is better for
manipulating of data and not storing it.
-- Steve
On Fri, 2012-08-17 at 11:20 -0400, Steven Rostedt wrote:
> If it wasn't for sparc, we could have done this with the ftrace buffers,
> but sparc has some crazy requirement that physical pages can only be
> mapped at certain locations, and you can't just take a bunch of random
> pages and map them at the same location.
same problem for perf.. I've meant to make it either mmap or splice but
not both, but I've never gotten around to actually implement any of
that.
On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
> + err_counter:
> + perf_event_release_kernel(event);
> +
> + err_inode:
> + trace_remove_file(desc->dentry);
> +
> + err_trace_file:
> + put_unused_fd(event_fd);
> + return event;
> +}
Note that perf code has unindented labels.
On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
> +struct perf_event
> +*perf_add_persistent_on_cpu(unsigned int cpu, struct perf_event_desc *desc,
> + unsigned nr_pages)
> +{
Oh, weird style that, the * is very much part of the type and would thus
be on the earlier line, I'm not sure I've ever seen it split like this.
On Thu, 2012-08-16 at 19:45 +0200, Borislav Petkov wrote:
>
> off and on I get some free time to work on that, here's the latest
> incarnation. It contains review feedback from the earlier round.
>
> Patch 1/4 adds a trace_add_file() interface which adds an additional
> file to debugfs, in this case the "persistent" file which contains the
> normal perf file descriptor sys_perf_event_open gives to the perf tool.
>
> IOW, one gets:
>
> /mnt/dbg/tracing/events/mce/mce_record/
> |-- enable
> |-- filter
> |-- format
> |-- id
> `-- persistent1
>
> 0 directories, 5 files
>
> [ 1 is the CPU number so sticking all per-CPU descriptors in this
> directory could get a little cluttered and ugly so I'll have to think
> about that a bit more. ]
>
> 3/4 is the meat which adds <kernel/events/persistent.c> and 4/4 shows
> how one can init a persistent event on a CPU.
>
> What remains is adding code which can enable events on boot from the
> kernel cmdline and more testing.
>
> As always, comments and suggestions are appreciated.
Good progress there, there's still a few things though:
- the point also raised by Steven, I'm pretty sure that the placing of
the debugfs files unfortunate. I would much rather see something
like /debug/perf/persistent/$foo, also dropping your
perf_event_desc::dir_name.
- I would make perf_add_persistent_on_cpu() static and create something
like perf_add_persistent() which iterates all CPUs and creates:
"%s-%04d", perf_event_desc::fname, cpu. This needs a little extra for
cpu-hotplug, not sure what to do there.
- related to the first point, by not tying them to actual events you
can create a persistent 'event' that contains multiple events. Its
quite possible to create multiple kernel events and use the
equivalent of PERF_EVENT_IOC_SET_OUTPUT on them to the exposed FD.
- It might be good to provide means of changing the persistent event's
buffer size, or maybe even 'destroy' persistent buffers.
On Tue, Aug 21, 2012 at 12:30:50PM +0200, Peter Zijlstra wrote:
> Good progress there, there's still a few things though:
>
> - the point also raised by Steven, I'm pretty sure that the placing of
> the debugfs files unfortunate. I would much rather see something
> like /debug/perf/persistent/$foo, also dropping your
> perf_event_desc::dir_name.
Ok, how do we want to do the per-CPU layout there? Like this:
/debug/perf/persistent/mce_record0
/debug/perf/persistent/mce_record1
...
or rather
/debug/perf/persistent/cpu0/mce_record
/debug/perf/persistent/cpu1/mce_record
?
The thing is, I'm not sure we want to make persistent events per-CPU
implicitly. If yes, then the second layout would fit better. Hmm.
> - I would make perf_add_persistent_on_cpu() static and create
> something like perf_add_persistent() which iterates all CPUs and
> creates: "%s-%04d", perf_event_desc::fname, cpu.
Ok.
> This needs a little extra for cpu-hotplug, not sure what to do there.
get/put_online_cpus()?
and then maybe check whether some of the CPUs are offline and warn if
so?
> - related to the first point, by not tying them to actual events
> you can create a persistent 'event' that contains multiple events.
> Its quite possible to create multiple kernel events and use the
> equivalent of PERF_EVENT_IOC_SET_OUTPUT on them to the exposed FD.
Hm, interesting.
So, in that case, the persistent thing would have a per-CPU buffer and
a file descriptor connected to it and one would be able to add events
which would log into the per-CPU buffer. Ok.
> - It might be good to provide means of changing the persistent
> event's buffer size, or maybe even 'destroy' persistent buffers.
ioctl? Or something else?
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Tue, 2012-08-21 at 15:11 +0200, Borislav Petkov wrote:
> On Tue, Aug 21, 2012 at 12:30:50PM +0200, Peter Zijlstra wrote:
> > Good progress there, there's still a few things though:
> >
> > - the point also raised by Steven, I'm pretty sure that the placing of
> > the debugfs files unfortunate. I would much rather see something
> > like /debug/perf/persistent/$foo, also dropping your
> > perf_event_desc::dir_name.
>
> Ok, how do we want to do the per-CPU layout there? Like this:
>
> /debug/perf/persistent/mce_record0
> /debug/perf/persistent/mce_record1
> ...
>
> or rather
>
> /debug/perf/persistent/cpu0/mce_record
> /debug/perf/persistent/cpu1/mce_record
>
> ?
Definitely the second one. The first one is just ugly. The second is
more in line to the tracing directories too:
/debug/tracing/per_cpu/cpu0/...
-- Steve
On Tue, Aug 21, 2012 at 09:41:15AM -0400, Steven Rostedt wrote:
> Definitely the second one. The first one is just ugly. The second is
> more in line to the tracing directories too:
>
> /debug/tracing/per_cpu/cpu0/...
Maybe I can reuse your code and add
/debug/tracing/per_cpu/cpu0/persistent/<event>
?
It is basically debugfs handling, right? Or are you reworking that
too?
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Tue, 2012-08-21 at 15:50 +0200, Borislav Petkov wrote:
> On Tue, Aug 21, 2012 at 09:41:15AM -0400, Steven Rostedt wrote:
> > Definitely the second one. The first one is just ugly. The second is
> > more in line to the tracing directories too:
> >
> > /debug/tracing/per_cpu/cpu0/...
>
> Maybe I can reuse your code and add
>
> /debug/tracing/per_cpu/cpu0/persistent/<event>
>
> ?
>
> It is basically debugfs handling, right? Or are you reworking that
> too?
>
Anything global wont be modified. IOW, tools wont break :-)
-- Steve