2008-03-22 15:33:01

by Pekka Paalanen

[permalink] [raw]
Subject: Re: tracing and mmiotrace

On Sat, 22 Mar 2008 13:07:47 +0100
Ingo Molnar <[email protected]> wrote:

> > > i'd suggest the following: pull x86.git and sched-devel.git into a
> > > single tree [the two will combine without rejects]. Then try to add a
> > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c
> > > plugin might be a good example.
> >
> > I did this and now I have mmiotrace enabled/disabled via the tracing
> > framework (what do we call this, since ftrace is one of the tracers?).
>
> cool! could you send the patches for that? (even if they are not fully
> functional yet)

Patch attached in the end. Nice to see how much code disappeared. I tried
to mark all the features I had to break with XXX-comments.

> > The elaborate testing is because I keep hitting a NULL pointer
> > dereference in tracing_generic_entry_update() suggesting 'entry' is
> > NULL. Is there some special init function I have to call or what
> > gives?
>
> that's weird - _entry_ is NULL?

Yes. See the patch, and my kernel log at
http://jumi.lut.fi/~paalanen/scratch/tracing-dmesg1.txt

First you see my check in __trace_special() trigger, and then NULL
pointer dereference BUG at ffffffff80259332 which translates to
kernel/trace/trace.c:639 in tracing_generic_entry_update(), line:
entry->preempt_count = pc & 0xff;
I don't understand why it tries to dereference NULL even when I
check for it.

---
Oh, and during bootup there is an error about -fstack-protector-all
test failed.

> > btw. currently 'cat /debug/tracing/trace' gives the
> > print_func_help_header() and quits, it never stays waiting for more
> > data.
>
> yeah - try "cat /debug/tracing/trace_pipe" for the waiting variant -
> does that work better?

Not really. IIRC it said "resource temporarily unavailable" until
mmiotrace actually tried to produce data. No header there. So I
cannot make "cat" ready to read and after that start tracing.

> 2.6.26 is still a bit away, i strongly feel that we should do this via
> the ftrace framework. We could try this: we try to bring it up to full
> functionality now, even via hacks like multiple trace_special() calls,
> and then extend/clean-up the interacations and missing callbacks?

Okay, though personally I'm not such a big fan of "hack now, fix later
(if willing)" ;-)
Oops, I forgot the "depends on RELAY" there.


arch/x86/Kconfig.debug | 3 +-
arch/x86/mm/mmio-mod.c | 208 ++++++----------------------------------
include/linux/mmiotrace.h | 6 +
kernel/trace/Makefile | 1 +
kernel/trace/trace.c | 7 ++
kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++++
6 files changed, 130 insertions(+), 179 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 4c4fea0..127f1e9 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,7 +186,8 @@ config MMIOTRACE_HOOKS

config MMIOTRACE
bool "Memory mapped IO tracing"
- depends on DEBUG_KERNEL && RELAY && DEBUG_FS
+ depends on DEBUG_KERNEL && RELAY
+ select TRACING
select MMIOTRACE_HOOKS
default y
help
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 7386440..c7a67d7 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -22,9 +22,8 @@
#define DEBUG 1

#include <linux/module.h>
-#include <linux/relay.h>
#include <linux/debugfs.h>
-#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <linux/version.h>
#include <linux/kallsyms.h>
@@ -63,18 +62,18 @@ static const size_t subbuf_size = 256*1024;
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace);

+#if 0 /* XXX: no way gather this info anymore */
/* Access to this is not per-cpu. */
static DEFINE_PER_CPU(atomic_t, dropped);
+#endif

static struct dentry *dir;
-static struct dentry *enabled_file;
static struct dentry *marker_file;

static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
static LIST_HEAD(trace_list); /* struct remap_trace */
-static struct rchan *chan;

/*
* Locking in this file:
@@ -93,36 +92,24 @@ static unsigned long filter_offset;
static int nommiotrace;
static int ISA_trace;
static int trace_pc;
-static int enable_now;

module_param(n_subbufs, uint, 0);
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
module_param(ISA_trace, bool, 0);
module_param(trace_pc, bool, 0);
-module_param(enable_now, bool, 0);

MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128.");
MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range.");
MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
-MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load.");

static bool is_enabled(void)
{
return atomic_read(&mmiotrace_enabled);
}

-static void record_timestamp(struct mm_io_header *header)
-{
- struct timespec now;
-
- getnstimeofday(&now);
- header->sec = now.tv_sec;
- header->nsec = now.tv_nsec;
-}
-
/*
* Write callback for the debugfs entry:
* Read a marker and write it to the mmio trace log
@@ -141,7 +128,6 @@ static ssize_t write_marker(struct file *file, const char __user *buffer,
headp = (struct mm_io_header *)event;
headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
headp->data_len = len;
- record_timestamp(headp);

if (copy_from_user(event + sizeof(*headp), buffer, len)) {
kfree(event);
@@ -149,9 +135,11 @@ static ssize_t write_marker(struct file *file, const char __user *buffer,
}

spin_lock_irq(&trace_lock);
+#if 0 /* XXX: convert this to use tracing */
if (is_enabled())
relay_write(chan, event, sizeof(*headp) + len);
else
+#endif
len = -EINVAL;
spin_unlock_irq(&trace_lock);
kfree(event);
@@ -242,7 +230,11 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs,
else
my_trace->rw.pc = 0;

- record_timestamp(&my_trace->header);
+ /*
+ * XXX: the timestamp recorded will be *after* the tracing has been
+ * done, not at the time we hit the instruction. SMP implications
+ * on event ordering?
+ */

switch (type) {
case REG_READ:
@@ -295,77 +287,19 @@ static void post(struct kmmio_probe *p, unsigned long condition,
default:
break;
}
- relay_write(chan, my_trace, sizeof(*my_trace));
+
+ /*
+ * XXX: Several required values are ignored:
+ * - mapping id
+ * - program counter
+ * Also the address should be physical, not virtual.
+ */
+ mmio_trace_record(my_trace->header.type, my_trace->rw.address,
+ my_trace->rw.value);
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}

-/*
- * subbuf_start() relay callback.
- *
- * Defined so that we know when events are dropped due to the buffer-full
- * condition.
- */
-static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
-{
- unsigned int cpu = buf->cpu;
- atomic_t *drop = &per_cpu(dropped, cpu);
- int count;
- if (relay_buf_full(buf)) {
- if (atomic_inc_return(drop) == 1)
- pr_err(NAME "cpu %d buffer full!\n", cpu);
- return 0;
- }
- count = atomic_read(drop);
- if (count) {
- pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n",
- cpu, count);
- atomic_sub(count, drop);
- }
-
- return 1;
-}
-
-static struct file_operations mmio_fops = {
- .owner = THIS_MODULE,
-};
-
-/* file_create() callback. Creates relay file in debugfs. */
-static struct dentry *create_buf_file_handler(const char *filename,
- struct dentry *parent,
- int mode,
- struct rchan_buf *buf,
- int *is_global)
-{
- struct dentry *buf_file;
-
- mmio_fops.read = relay_file_operations.read;
- mmio_fops.open = relay_file_operations.open;
- mmio_fops.poll = relay_file_operations.poll;
- mmio_fops.mmap = relay_file_operations.mmap;
- mmio_fops.release = relay_file_operations.release;
- mmio_fops.splice_read = relay_file_operations.splice_read;
-
- buf_file = debugfs_create_file(filename, mode, parent, buf,
- &mmio_fops);
-
- return buf_file;
-}
-
-/* file_remove() default callback. Removes relay file in debugfs. */
-static int remove_buf_file_handler(struct dentry *dentry)
-{
- debugfs_remove(dentry);
- return 0;
-}
-
-static struct rchan_callbacks relay_callbacks = {
- .subbuf_start = subbuf_start_handler,
- .create_buf_file = create_buf_file_handler,
- .remove_buf_file = remove_buf_file_handler,
-};
-
static void ioremap_trace_core(unsigned long offset, unsigned long size,
void __iomem *addr)
{
@@ -387,7 +321,6 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
.pc = 0
}
};
- record_timestamp(&event.header);

if (!trace) {
pr_err(NAME "kmalloc failed in ioremap\n");
@@ -410,7 +343,10 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
if (!is_enabled())
goto not_enabled;

- relay_write(chan, &event, sizeof(event));
+ /*
+ * XXX: Insufficient data recorded!
+ */
+ mmio_trace_record(event.header.type, event.map.addr, event.map.len);
list_add_tail(&trace->list, &trace_list);
if (!nommiotrace)
register_kmmio_probe(&trace->probe);
@@ -454,7 +390,6 @@ static void iounmap_trace_core(volatile void __iomem *addr)
struct remap_trace *found_trace = NULL;

pr_debug(NAME "Unmapping %p.\n", addr);
- record_timestamp(&event.header);

spin_lock_irq(&trace_lock);
if (!is_enabled())
@@ -469,7 +404,8 @@ static void iounmap_trace_core(volatile void __iomem *addr)
break;
}
}
- relay_write(chan, &event, sizeof(event));
+ mmio_trace_record(event.header.type, event.map.addr,
+ found_trace ? found_trace->id : -1);

not_enabled:
spin_unlock_irq(&trace_lock);
@@ -512,77 +448,23 @@ static void clear_trace_list(void)
}
}

-static ssize_t read_enabled_file_bool(struct file *file,
- char __user *user_buf, size_t count, loff_t *ppos)
-{
- char buf[3];
-
- if (is_enabled())
- buf[0] = '1';
- else
- buf[0] = '0';
- buf[1] = '\n';
- buf[2] = '\0';
- return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static void enable_mmiotrace(void);
-static void disable_mmiotrace(void);
-
-static ssize_t write_enabled_file_bool(struct file *file,
- const char __user *user_buf, size_t count, loff_t *ppos)
-{
- char buf[32];
- int buf_size = min(count, (sizeof(buf)-1));
-
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- enable_mmiotrace();
- break;
- case 'n':
- case 'N':
- case '0':
- disable_mmiotrace();
- break;
- }
-
- return count;
-}
-
-/* this ripped from kernel/kprobes.c */
-static struct file_operations fops_enabled = {
- .owner = THIS_MODULE,
- .read = read_enabled_file_bool,
- .write = write_enabled_file_bool
-};
-
static struct file_operations fops_marker = {
.owner = THIS_MODULE,
.write = write_marker
};

-static void enable_mmiotrace(void)
+void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;

- chan = relay_open("cpu", dir, subbuf_size, n_subbufs,
- &relay_callbacks, NULL);
- if (!chan) {
- pr_err(NAME "relay app channel creation failed.\n");
- goto out;
- }
-
reference_kmmio();

+#if 0 /* XXX: tracing does not support text entries */
marker_file = debugfs_create_file("marker", 0660, dir, NULL,
&fops_marker);
+#endif
if (!marker_file)
pr_err(NAME "marker file creation failed.\n");

@@ -598,7 +480,7 @@ out:
mutex_unlock(&mmiotrace_mutex);
}

-static void disable_mmiotrace(void)
+void disable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (!is_enabled())
@@ -615,17 +497,13 @@ static void disable_mmiotrace(void)
debugfs_remove(marker_file);
marker_file = NULL;
}
- if (chan) {
- relay_close(chan);
- chan = NULL;
- }

pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}

-static int __init init(void)
+int __init init_mmiotrace(void)
{
pr_debug(NAME "load...\n");
if (n_subbufs < 2)
@@ -636,31 +514,5 @@ static int __init init(void)
pr_err(NAME "Couldn't create relay app directory.\n");
return -ENOMEM;
}
-
- enabled_file = debugfs_create_file("enabled", 0600, dir, NULL,
- &fops_enabled);
- if (!enabled_file) {
- pr_err(NAME "Couldn't create enabled file.\n");
- debugfs_remove(dir);
- return -ENOMEM;
- }
-
- if (enable_now)
- enable_mmiotrace();
-
return 0;
}
-
-static void __exit cleanup(void)
-{
- pr_debug(NAME "unload...\n");
- if (enabled_file)
- debugfs_remove(enabled_file);
- disable_mmiotrace();
- if (dir)
- debugfs_remove(dir);
-}
-
-module_init(init);
-module_exit(cleanup);
-MODULE_LICENSE("GPL");
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index cb5efd0..579b3b0 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -54,6 +54,12 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr)
}
#endif /* CONFIG_MMIOTRACE_HOOKS */

+/* in kernel/trace/trace_mmiotrace.c */
+extern int __init init_mmiotrace(void);
+extern void enable_mmiotrace(void);
+extern void disable_mmiotrace(void);
+extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg);
+
#endif /* __KERNEL__ */


diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4ec1f52..fd8c5a4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,5 +15,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o

libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64ea829..05444b0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -682,6 +682,13 @@ __trace_special(void *__tr, void *__data,

spin_lock_irqsave(&data->lock, irq_flags);
entry = tracing_get_trace_entry(tr, data);
+ if (!entry) {
+ static unsigned limit;
+ if (limit++ < 12)
+ pr_err("%s: Dereferencing NULL, arrr!\n", __func__);
+ spin_unlock_irqrestore(&data->lock, irq_flags);
+ return;
+ }
tracing_generic_entry_update(entry, 0);
entry->type = TRACE_SPECIAL;
entry->special.arg1 = arg1;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 0000000..e4dd03c
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,84 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <[email protected]>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+
+#include "trace.h"
+
+extern void
+__trace_special(void *__tr, void *__data,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+static struct trace_array *mmio_trace_array;
+
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ mmio_trace_array = tr;
+ if (tr->ctrl)
+ enable_mmiotrace();
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl)
+ disable_mmiotrace();
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl)
+ enable_mmiotrace();
+ else
+ disable_mmiotrace();
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+ .name = "mmiotrace",
+ .init = mmio_trace_init,
+ .reset = mmio_trace_reset,
+ .ctrl_update = mmio_trace_ctrl_update,
+};
+
+__init static int init_mmio_trace(void)
+{
+ int ret = init_mmiotrace();
+ if (ret)
+ return ret;
+ return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data = tr->data[smp_processor_id()];
+
+ if (!current || current->pid == 0) {
+ /*
+ * XXX: This is a problem. We need to able to record, no
+ * matter what. tracing_generic_entry_update() would crash.
+ */
+ static unsigned limit;
+ if (limit++ < 12)
+ pr_err("Error in %s: no current.\n", __func__);
+ return;
+ }
+ if (!tr || !data) {
+ static unsigned limit;
+ if (limit++ < 12)
+ pr_err("%s: no tr or data\n", __func__);
+ return;
+ }
+ __trace_special(tr, data, type, addr, arg);
+}


2008-03-22 20:17:00

by Steven Rostedt

[permalink] [raw]
Subject: Re: tracing and mmiotrace


Hi Pekka,

I'm on vacation at the moment, but will start up helping Ingo with ftrace
on Monday.

On Sat, 22 Mar 2008, Pekka Paalanen wrote:

> >
> > that's weird - _entry_ is NULL?
>
> Yes. See the patch, and my kernel log at
> http://jumi.lut.fi/~paalanen/scratch/tracing-dmesg1.txt
>
> First you see my check in __trace_special() trigger, and then NULL
> pointer dereference BUG at ffffffff80259332 which translates to
> kernel/trace/trace.c:639 in tracing_generic_entry_update(), line:
> entry->preempt_count = pc & 0xff;
> I don't understand why it tries to dereference NULL even when I
> check for it.

Interesting. I'll have to look into that more on Monday.

>
> ---
> Oh, and during bootup there is an error about -fstack-protector-all
> test failed.
>
> > > btw. currently 'cat /debug/tracing/trace' gives the
> > > print_func_help_header() and quits, it never stays waiting for more
> > > data.
> >
> > yeah - try "cat /debug/tracing/trace_pipe" for the waiting variant -
> > does that work better?
>
> Not really. IIRC it said "resource temporarily unavailable" until
> mmiotrace actually tried to produce data. No header there. So I
> cannot make "cat" ready to read and after that start tracing.

I'll have to clean up the trace_pipe code to be a bit more robust.

Thanks for doing this. I'll be getting more involved starting Monday.

-- Steve

2008-03-23 11:01:53

by Pekka Paalanen

[permalink] [raw]
Subject: Re: tracing and mmiotrace

On Sat, 22 Mar 2008 16:16:48 -0400 (EDT)
Steven Rostedt <[email protected]> wrote:

> On Sat, 22 Mar 2008, Pekka Paalanen wrote:
>
> > First you see my check in __trace_special() trigger, and then NULL
> > pointer dereference BUG at ffffffff80259332 which translates to
> > kernel/trace/trace.c:639 in tracing_generic_entry_update(), line:
> > entry->preempt_count = pc & 0xff;
> > I don't understand why it tries to dereference NULL even when I
> > check for it.
>
> Interesting. I'll have to look into that more on Monday.

I found the culprit.

My code never calls tracing_reset(). If I had enabled
FTRACE_STARTUP_TEST, the selftest code would run tracing_reset().
So I copied stack_reset() from trace_sysprof.c and now recording works.
My bad.

btw. the semantics of verbose iter_ctrl seem inverted from user point
of view. I also noticed that 'trace' file does not consume the data and
'trace_pipe' does consume the data. This is important information for the
user.

I think it would be nice, if iter_ctrl listed only the flags the current
tracer supports. This would also imply, that the tracer is not started
automatically when selected. Most of the flags are not useful for
mmiotrace.


Thanks.

--
Pekka Paalanen
http://www.iki.fi/pq/

2008-03-23 17:17:21

by Pekka Paalanen

[permalink] [raw]
Subject: Re: tracing and mmiotrace

Hello,

here is a patch that makes mmiotrace work almost well within the tracing
framework. The patch applies on top of my previous patch. I have my own
output formatting in place now.

Summary of changes:
- fix the NULL dereference that was due to not calling tracing_reset()
- add print_line() callback into struct tracer
- implement print_line() for mmiotrace, producing up-to-spec text
- add my output header, but that is not really called in the right place
- rewrote the main structs in mmiotrace
- added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP
- made some functions in trace.c non-static
- check current==NULL in tracing_generic_entry_update()
- fix(?) comparison in trace_seq_printf()

Things seem to work fine except a few issues. Markers (text lines injected
into mmiotrace log) are missing, I did not feel hacking them in before we
have variable length entries. My output header is printed only for 'trace'
file, but not 'trace_pipe'. For some reason, despite my quick fix,
iter->trace is NULL in print_trace_line() when called from 'trace_pipe'
file, which means I don't get proper output formatting.

I only tried by loading nouveau.ko, which just detects the card, and that
is traced fine. I didn't try further. Map, two reads and unmap. Works
perfectly.

I am missing the information about overflows, I'd prefer to have a
counter for lost events. I didn't try, but I guess currently there is no
way of knowning when it overflows?

So, not too far from being fully operational, it seems :-)
And looking at the diffstat, there also is some 700-900 lines of user space
code that just became obsolete.

Cheers.


arch/x86/Kconfig.debug | 2 +-
arch/x86/mm/mmio-mod.c | 140 +++++++++++--------------------------
include/linux/mmiotrace.h | 85 +++++++----------------
kernel/trace/trace.c | 65 +++++++++++------
kernel/trace/trace.h | 29 ++++++++
kernel/trace/trace_mmiotrace.c | 151 ++++++++++++++++++++++++++++++++--------
6 files changed, 261 insertions(+), 211 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 127f1e9..f412908 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,7 +186,7 @@ config MMIOTRACE_HOOKS

config MMIOTRACE
bool "Memory mapped IO tracing"
- depends on DEBUG_KERNEL && RELAY
+ depends on DEBUG_KERNEL
select TRACING
select MMIOTRACE_HOOKS
default y
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c7a67d7..62abc28 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -37,11 +37,6 @@

#define NAME "mmiotrace: "

-/* This app's relay channel files will appear in /debug/mmio-trace */
-static const char APP_DIR[] = "mmio-trace";
-/* the marker injection file in /debug/APP_DIR */
-static const char MARKER_FILE[] = "mmio-marker";
-
struct trap_reason {
unsigned long addr;
unsigned long ip;
@@ -56,18 +51,15 @@ struct remap_trace {
unsigned long id;
};

-static const size_t subbuf_size = 256*1024;
-
/* Accessed per-cpu. */
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
-static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);

#if 0 /* XXX: no way gather this info anymore */
/* Access to this is not per-cpu. */
static DEFINE_PER_CPU(atomic_t, dropped);
#endif

-static struct dentry *dir;
static struct dentry *marker_file;

static DEFINE_MUTEX(mmiotrace_mutex);
@@ -82,24 +74,21 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that chan is valid.
+ * - is_enabled() guarantees that mmio_trace_record is allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/

/* module parameters */
-static unsigned int n_subbufs = 32*4;
static unsigned long filter_offset;
static int nommiotrace;
static int ISA_trace;
static int trace_pc;

-module_param(n_subbufs, uint, 0);
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
module_param(ISA_trace, bool, 0);
module_param(trace_pc, bool, 0);

-MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128.");
MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range.");
@@ -110,6 +99,7 @@ static bool is_enabled(void)
return atomic_read(&mmiotrace_enabled);
}

+#if 0 /* XXX: needs rewrite */
/*
* Write callback for the debugfs entry:
* Read a marker and write it to the mmio trace log
@@ -145,6 +135,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer,
kfree(event);
return len;
}
+#endif

static void print_pte(unsigned long address)
{
@@ -198,9 +189,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs,
unsigned long addr)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
- struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
const unsigned long instptr = instruction_pointer(regs);
const enum reason_type type = get_ins_type(instptr);
+ struct remap_trace *trace = p->user_data;

/* it doesn't make sense to have more than one active trace per cpu */
if (my_reason->active_traces)
@@ -212,23 +204,17 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs,
my_reason->addr = addr;
my_reason->ip = instptr;

- my_trace->header.type = MMIO_MAGIC;
- my_trace->header.pid = 0;
- my_trace->header.data_len = sizeof(struct mm_io_rw);
- my_trace->rw.address = addr;
- /*
- * struct remap_trace *trace = p->user_data;
- * phys = addr - trace->probe.addr + trace->phys;
- */
+ my_trace->phys = addr - trace->probe.addr + trace->phys;
+ my_trace->map_id = trace->id;

/*
* Only record the program counter when requested.
* It may taint clean-room reverse engineering.
*/
if (trace_pc)
- my_trace->rw.pc = instptr;
+ my_trace->pc = instptr;
else
- my_trace->rw.pc = 0;
+ my_trace->pc = 0;

/*
* XXX: the timestamp recorded will be *after* the tracing has been
@@ -238,28 +224,25 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs,

switch (type) {
case REG_READ:
- my_trace->header.type |=
- (MMIO_READ << MMIO_OPCODE_SHIFT) |
- (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
+ my_trace->opcode = MMIO_READ;
+ my_trace->width = get_ins_mem_width(instptr);
break;
case REG_WRITE:
- my_trace->header.type |=
- (MMIO_WRITE << MMIO_OPCODE_SHIFT) |
- (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
- my_trace->rw.value = get_ins_reg_val(instptr, regs);
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_reg_val(instptr, regs);
break;
case IMM_WRITE:
- my_trace->header.type |=
- (MMIO_WRITE << MMIO_OPCODE_SHIFT) |
- (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
- my_trace->rw.value = get_ins_imm_val(instptr);
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_imm_val(instptr);
break;
default:
{
unsigned char *ip = (unsigned char *)instptr;
- my_trace->header.type |=
- (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT);
- my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 |
+ my_trace->opcode = MMIO_UNKNOWN_OP;
+ my_trace->width = 0;
+ my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
*(ip + 2);
}
}
@@ -271,7 +254,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
struct pt_regs *regs)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
- struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);

/* this should always return the active_trace count to 0 */
my_reason->active_traces--;
@@ -282,20 +265,13 @@ static void post(struct kmmio_probe *p, unsigned long condition,

switch (my_reason->type) {
case REG_READ:
- my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs);
+ my_trace->value = get_ins_reg_val(my_reason->ip, regs);
break;
default:
break;
}

- /*
- * XXX: Several required values are ignored:
- * - mapping id
- * - program counter
- * Also the address should be physical, not virtual.
- */
- mmio_trace_record(my_trace->header.type, my_trace->rw.address,
- my_trace->rw.value);
+ mmio_trace_rw(my_trace);
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}
@@ -305,21 +281,11 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
{
static atomic_t next_id;
struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
- struct mm_io_header_map event = {
- .header = {
- .type = MMIO_MAGIC |
- (MMIO_PROBE << MMIO_OPCODE_SHIFT),
- .sec = 0,
- .nsec = 0,
- .pid = 0,
- .data_len = sizeof(struct mm_io_map)
- },
- .map = {
- .phys = offset,
- .addr = (unsigned long)addr,
- .len = size,
- .pc = 0
- }
+ struct mmiotrace_map map = {
+ .phys = offset,
+ .virt = (unsigned long)addr,
+ .len = size,
+ .opcode = MMIO_PROBE
};

if (!trace) {
@@ -338,15 +304,13 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
.phys = offset,
.id = atomic_inc_return(&next_id)
};
+ map.map_id = trace->id;

spin_lock_irq(&trace_lock);
if (!is_enabled())
goto not_enabled;

- /*
- * XXX: Insufficient data recorded!
- */
- mmio_trace_record(event.header.type, event.map.addr, event.map.len);
+ mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
if (!nommiotrace)
register_kmmio_probe(&trace->probe);
@@ -369,21 +333,11 @@ mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr)

static void iounmap_trace_core(volatile void __iomem *addr)
{
- struct mm_io_header_map event = {
- .header = {
- .type = MMIO_MAGIC |
- (MMIO_UNPROBE << MMIO_OPCODE_SHIFT),
- .sec = 0,
- .nsec = 0,
- .pid = 0,
- .data_len = sizeof(struct mm_io_map)
- },
- .map = {
- .phys = 0,
- .addr = (unsigned long)addr,
- .len = 0,
- .pc = 0
- }
+ struct mmiotrace_map map = {
+ .phys = 0,
+ .virt = (unsigned long)addr,
+ .len = 0,
+ .opcode = MMIO_UNPROBE
};
struct remap_trace *trace;
struct remap_trace *tmp;
@@ -404,8 +358,8 @@ static void iounmap_trace_core(volatile void __iomem *addr)
break;
}
}
- mmio_trace_record(event.header.type, event.map.addr,
- found_trace ? found_trace->id : -1);
+ map.map_id = (found_trace) ? found_trace->id : -1;
+ mmio_trace_mapping(&map);

not_enabled:
spin_unlock_irq(&trace_lock);
@@ -448,10 +402,12 @@ static void clear_trace_list(void)
}
}

+#if 0 /* XXX: out of order */
static struct file_operations fops_marker = {
.owner = THIS_MODULE,
.write = write_marker
};
+#endif

void enable_mmiotrace(void)
{
@@ -464,9 +420,9 @@ void enable_mmiotrace(void)
#if 0 /* XXX: tracing does not support text entries */
marker_file = debugfs_create_file("marker", 0660, dir, NULL,
&fops_marker);
-#endif
if (!marker_file)
pr_err(NAME "marker file creation failed.\n");
+#endif

if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
@@ -502,17 +458,3 @@ void disable_mmiotrace(void)
out:
mutex_unlock(&mmiotrace_mutex);
}
-
-int __init init_mmiotrace(void)
-{
- pr_debug(NAME "load...\n");
- if (n_subbufs < 2)
- return -EINVAL;
-
- dir = debugfs_create_dir(APP_DIR, NULL);
- if (!dir) {
- pr_err(NAME "Couldn't create relay app directory.\n");
- return -ENOMEM;
- }
- return 0;
-}
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 579b3b0..c88a9c1 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -54,73 +54,38 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr)
}
#endif /* CONFIG_MMIOTRACE_HOOKS */

-/* in kernel/trace/trace_mmiotrace.c */
-extern int __init init_mmiotrace(void);
-extern void enable_mmiotrace(void);
-extern void disable_mmiotrace(void);
-extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg);
-
-#endif /* __KERNEL__ */
-
-
-/*
- * If you change anything here, you must bump MMIO_VERSION.
- * This is the relay data format for user space.
- */
-#define MMIO_VERSION 0x04
-
-/* mm_io_header.type */
-#define MMIO_OPCODE_MASK 0xff
-#define MMIO_OPCODE_SHIFT 0
-#define MMIO_WIDTH_MASK 0xff00
-#define MMIO_WIDTH_SHIFT 8
-#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16))
-#define MMIO_MAGIC_MASK 0xffff0000
-
-enum mm_io_opcode { /* payload type: */
- MMIO_READ = 0x1, /* struct mm_io_rw */
- MMIO_WRITE = 0x2, /* struct mm_io_rw */
- MMIO_PROBE = 0x3, /* struct mm_io_map */
- MMIO_UNPROBE = 0x4, /* struct mm_io_map */
+enum mm_io_opcode {
+ MMIO_READ = 0x1, /* struct mmiotrace_rw */
+ MMIO_WRITE = 0x2, /* struct mmiotrace_rw */
+ MMIO_PROBE = 0x3, /* struct mmiotrace_map */
+ MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */
MMIO_MARKER = 0x5, /* raw char data */
- MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */
+ MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
};

-struct mm_io_header {
- __u32 type; /* see MMIO_* macros above */
- __u32 sec; /* timestamp */
- __u32 nsec;
- __u32 pid; /* PID of the process, or 0 for kernel core */
- __u16 data_len; /* length of the following payload */
+struct mmiotrace_rw {
+ unsigned long phys; /* PCI address of register */
+ unsigned long value;
+ unsigned long pc; /* optional program counter */
+ int map_id;
+ unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
+ unsigned char width; /* size of register access in bytes */
};

-struct mm_io_rw {
- __u64 address; /* virtual address of register */
- __u64 value;
- __u64 pc; /* optional program counter */
+struct mmiotrace_map {
+ unsigned long phys; /* base address in PCI space */
+ unsigned long virt; /* base virtual address */
+ unsigned long len; /* mapping size */
+ int map_id;
+ unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */
};

-struct mm_io_map {
- __u64 phys; /* base address in PCI space */
- __u64 addr; /* base virtual address */
- __u64 len; /* mapping size */
- __u64 pc; /* optional program counter */
-};
-
-
-/*
- * These structures are used to allow a single relay_write()
- * call to write a full packet.
- */
-
-struct mm_io_header_rw {
- struct mm_io_header header;
- struct mm_io_rw rw;
-} __attribute__((packed));
+/* in kernel/trace/trace_mmiotrace.c */
+extern void enable_mmiotrace(void);
+extern void disable_mmiotrace(void);
+extern void mmio_trace_rw(struct mmiotrace_rw *rw);
+extern void mmio_trace_mapping(struct mmiotrace_map *map);

-struct mm_io_header_map {
- struct mm_io_header header;
- struct mm_io_map map;
-} __attribute__((packed));
+#endif /* __KERNEL__ */

#endif /* MMIOTRACE_H */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 05444b0..d7547fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -37,7 +37,7 @@ unsigned long __read_mostly tracing_thresh;

static int tracing_disabled = 1;

-static long
+long
ns2usecs(cycle_t nsec)
{
nsec += 500;
@@ -96,18 +96,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
return nsecs / 1000;
}

-enum trace_type {
- __TRACE_FIRST_TYPE = 0,
-
- TRACE_FN,
- TRACE_CTX,
- TRACE_WAKE,
- TRACE_STACK,
- TRACE_SPECIAL,
-
- __TRACE_LAST_TYPE
-};
-
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
TRACE_FLAG_NEED_RESCHED = 0x02,
@@ -189,7 +177,7 @@ void *head_page(struct trace_array_cpu *data)
return page_address(page);
}

-static int
+int
trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
{
int len = (PAGE_SIZE - 1) - s->len;
@@ -204,7 +192,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
va_end(ap);

/* If we can't write it all, don't bother writing anything */
- if (ret > len)
+ if (ret >= len)
return 0;

s->len += ret;
@@ -637,7 +625,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
pc = preempt_count();

entry->preempt_count = pc & 0xff;
- entry->pid = tsk->pid;
+ entry->pid = (tsk) ? tsk->pid : 0;
entry->t = ftrace_now(raw_smp_processor_id());
entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
@@ -682,13 +670,6 @@ __trace_special(void *__tr, void *__data,

spin_lock_irqsave(&data->lock, irq_flags);
entry = tracing_get_trace_entry(tr, data);
- if (!entry) {
- static unsigned limit;
- if (limit++ < 12)
- pr_err("%s: Dereferencing NULL, arrr!\n", __func__);
- spin_unlock_irqrestore(&data->lock, irq_flags);
- return;
- }
tracing_generic_entry_update(entry, 0);
entry->type = TRACE_SPECIAL;
entry->special.arg1 = arg1;
@@ -699,6 +680,40 @@ __trace_special(void *__tr, void *__data,
trace_wake_up();
}

+#ifdef CONFIG_MMIOTRACE
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw)
+{
+ struct trace_entry *entry;
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&data->lock, irq_flags);
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, 0);
+ entry->type = TRACE_MMIO_RW;
+ entry->mmiorw = *rw;
+ spin_unlock_irqrestore(&data->lock, irq_flags);
+
+ trace_wake_up();
+}
+
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+ struct mmiotrace_map *map)
+{
+ struct trace_entry *entry;
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&data->lock, irq_flags);
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, 0);
+ entry->type = TRACE_MMIO_MAP;
+ entry->mmiomap = *map;
+ spin_unlock_irqrestore(&data->lock, irq_flags);
+
+ trace_wake_up();
+}
+#endif
+
void __trace_stack(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long flags,
@@ -1539,6 +1554,9 @@ static int trace_empty(struct trace_iterator *iter)

static int print_trace_line(struct trace_iterator *iter)
{
+ if (iter->trace && iter->trace->print_line)
+ return iter->trace->print_line(iter);
+
if (trace_flags & TRACE_ITER_BIN)
return print_bin_fmt(iter);

@@ -2158,6 +2176,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return -ENOMEM;

iter->tr = &global_trace;
+ iter->trace = current_trace;

filp->private_data = iter;

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd1febd..408d2da 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,6 +5,21 @@
#include <asm/atomic.h>
#include <linux/sched.h>
#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+
+enum trace_type {
+ __TRACE_FIRST_TYPE = 0,
+
+ TRACE_FN,
+ TRACE_CTX,
+ TRACE_WAKE,
+ TRACE_STACK,
+ TRACE_SPECIAL,
+ TRACE_MMIO_RW,
+ TRACE_MMIO_MAP,
+
+ __TRACE_LAST_TYPE
+};

/*
* Function trace entry - function address and parent function addres:
@@ -63,6 +78,8 @@ struct trace_entry {
struct ctx_switch_entry ctx;
struct special_entry special;
struct stack_entry stack;
+ struct mmiotrace_rw mmiorw;
+ struct mmiotrace_map mmiomap;
};
};

@@ -130,6 +147,7 @@ struct tracer {
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
#endif
+ int (*print_line)(struct trace_iterator *iter);
struct tracer *next;
int print_max;
};
@@ -248,6 +266,15 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
extern unsigned long ftrace_update_tot_cnt;
#endif

+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_map *map);
+#endif
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
#ifdef CONFIG_FTRACE
extern int trace_selftest_startup_function(struct tracer *trace,
@@ -280,6 +307,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */

extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern long ns2usecs(cycle_t nsec);

extern unsigned long trace_flags;

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e4dd03c..3a12b1a 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -11,19 +11,26 @@

#include "trace.h"

-extern void
-__trace_special(void *__tr, void *__data,
- unsigned long arg1, unsigned long arg2, unsigned long arg3);
-
static struct trace_array *mmio_trace_array;

+static void mmio_reset_data(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr->data[cpu]);
+}

static void mmio_trace_init(struct trace_array *tr)
{
pr_debug("in %s\n", __func__);
mmio_trace_array = tr;
- if (tr->ctrl)
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
enable_mmiotrace();
+ }
}

static void mmio_trace_reset(struct trace_array *tr)
@@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr)
pr_debug("in %s\n", __func__);
if (tr->ctrl)
disable_mmiotrace();
+ mmio_reset_data(tr);
+ mmio_trace_array = NULL;
}

static void mmio_trace_ctrl_update(struct trace_array *tr)
{
pr_debug("in %s\n", __func__);
- if (tr->ctrl)
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
enable_mmiotrace();
- else
+ } else {
disable_mmiotrace();
+ }
+}
+
+/* XXX: This is not called for trace_pipe file! */
+void mmio_print_header(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ trace_seq_printf(s, "VERSION broken 20070824\n");
+ /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */
+}
+
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct mmiotrace_rw *rw = &entry->mmiorw;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(entry->t);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret = 1;
+
+ switch (entry->mmiorw.opcode) {
+ case MMIO_READ:
+ ret = trace_seq_printf(s,
+ "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id, rw->phys,
+ rw->value, rw->pc, entry->pid);
+ break;
+ case MMIO_WRITE:
+ ret = trace_seq_printf(s,
+ "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id, rw->phys,
+ rw->value, rw->pc, entry->pid);
+ break;
+ case MMIO_UNKNOWN_OP:
+ ret = trace_seq_printf(s,
+ "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n",
+ secs, usec_rem, rw->map_id, rw->phys,
+ (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+ (rw->value >> 0) & 0xff, rw->pc, entry->pid);
+ break;
+ default:
+ ret = trace_seq_printf(s, "rw what?\n");
+ break;
+ }
+ if (ret)
+ return 1;
+ return 0;
+}
+
+static int mmio_print_map(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct mmiotrace_map *m = &entry->mmiomap;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(entry->t);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret = 1;
+
+ switch (entry->mmiorw.opcode) {
+ case MMIO_PROBE:
+ ret = trace_seq_printf(s,
+ "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n",
+ secs, usec_rem, m->map_id, m->phys, m->virt, m->len,
+ 0UL, entry->pid);
+ break;
+ case MMIO_UNPROBE:
+ ret = trace_seq_printf(s,
+ "UNMAP %lu.%06lu %d 0x%lx %d\n",
+ secs, usec_rem, m->map_id, 0UL, entry->pid);
+ break;
+ default:
+ ret = trace_seq_printf(s, "map what?\n");
+ break;
+ }
+ if (ret)
+ return 1;
+ return 0;
+}
+
+/* return 0 to abort printing without consuming current entry in pipe mode */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+ switch (iter->ent->type) {
+ case TRACE_MMIO_RW:
+ return mmio_print_rw(iter);
+ case TRACE_MMIO_MAP:
+ return mmio_print_map(iter);
+ default:
+ return 1; /* ignore unknown entries */
+ }
}

static struct tracer mmio_tracer __read_mostly =
@@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly =
.name = "mmiotrace",
.init = mmio_trace_init,
.reset = mmio_trace_reset,
+ .open = mmio_print_header,
.ctrl_update = mmio_trace_ctrl_update,
+ .print_line = mmio_print_line,
};

__init static int init_mmio_trace(void)
{
- int ret = init_mmiotrace();
- if (ret)
- return ret;
return register_tracer(&mmio_tracer);
}
device_initcall(init_mmio_trace);

-void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg)
+void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
struct trace_array_cpu *data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_rw(tr, data, rw);
+}

- if (!current || current->pid == 0) {
- /*
- * XXX: This is a problem. We need to able to record, no
- * matter what. tracing_generic_entry_update() would crash.
- */
- static unsigned limit;
- if (limit++ < 12)
- pr_err("Error in %s: no current.\n", __func__);
- return;
- }
- if (!tr || !data) {
- static unsigned limit;
- if (limit++ < 12)
- pr_err("%s: no tr or data\n", __func__);
- return;
- }
- __trace_special(tr, data, type, addr, arg);
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data;
+
+ preempt_disable();
+ data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_map(tr, data, map);
+ preempt_enable();
}

2008-03-30 18:12:55

by Pekka Paalanen

[permalink] [raw]
Subject: Re: tracing and mmiotrace

Hello,

another weekend, another patch. This should apply on top of my previous patch
from March 23rd.

Summary of changes:
- Print PCI device list in output header
- work around recursive probe hits on SMP
- refactor dis/arm_kmmio_fault_page() and add check for page levels
- remove un/reference_kmmio(), the die notifier hook is registered
permanently into the list
- explicitly check for single stepping in die notifier callback

I have tested this version on my UP Athlon64 desktop with Nouveau, and
SMP Core 2 Duo laptop with the proprietary nvidia driver. Both systems
are 64-bit. One previously unknown bug crept into daylight: the ftrace
framework's output routines print the first entry last after buffer has
wrapped around.

The most important regressions compared to non-ftrace mmiotrace at this
time are:
- failure of trace_pipe file
- illegal lines in output file
- unaware of losing data due to buffer full

Personally I'd like to see these three solved before submitting to
mainline. Other issues may come up once we know when we lose events.

Cheers.


Signed-off-by: Pekka Paalanen <[email protected]>

---
arch/x86/mm/kmmio.c | 186 ++++++++++++++++-----------------------
arch/x86/mm/mmio-mod.c | 3 -
include/linux/mmiotrace.h | 2 -
kernel/trace/trace_mmiotrace.c | 47 ++++++++++-
4 files changed, 120 insertions(+), 118 deletions(-)

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index efb4679..cd0d95f 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,15 +5,12 @@
* 2008 Pekka Paalanen <[email protected]>
*/

-#include <linux/version.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/slab.h>
#include <linux/kernel.h>
-#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
@@ -22,10 +19,9 @@
#include <linux/mutex.h>
#include <asm/io.h>
#include <asm/cacheflush.h>
-#include <asm/errno.h>
#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-
+#include <asm/errno.h>
+#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
@@ -57,14 +53,9 @@ struct kmmio_context {
int active;
};

-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
- void *args);
-
-static DEFINE_MUTEX(kmmio_init_mutex);
static DEFINE_SPINLOCK(kmmio_lock);

-/* These are protected by kmmio_lock */
-static int kmmio_initialized;
+/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
@@ -79,60 +70,6 @@ static struct list_head *kmmio_page_list(unsigned long page)
/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

-/* protected by kmmio_init_mutex */
-static struct notifier_block nb_die = {
- .notifier_call = kmmio_die_notifier
-};
-
-/**
- * Makes sure kmmio is initialized and usable.
- * This must be called before any other kmmio function defined here.
- * May sleep.
- */
-void reference_kmmio(void)
-{
- mutex_lock(&kmmio_init_mutex);
- spin_lock_irq(&kmmio_lock);
- if (!kmmio_initialized) {
- int i;
- for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
- INIT_LIST_HEAD(&kmmio_page_table[i]);
- if (register_die_notifier(&nb_die))
- BUG();
- }
- kmmio_initialized++;
- spin_unlock_irq(&kmmio_lock);
- mutex_unlock(&kmmio_init_mutex);
-}
-EXPORT_SYMBOL_GPL(reference_kmmio);
-
-/**
- * Clean up kmmio after use. This must be called for every call to
- * reference_kmmio(). All probes registered after the corresponding
- * reference_kmmio() must have been unregistered when calling this.
- * May sleep.
- */
-void unreference_kmmio(void)
-{
- bool unreg = false;
-
- mutex_lock(&kmmio_init_mutex);
- spin_lock_irq(&kmmio_lock);
-
- if (kmmio_initialized == 1) {
- BUG_ON(is_kmmio_active());
- unreg = true;
- }
- kmmio_initialized--;
- BUG_ON(kmmio_initialized < 0);
- spin_unlock_irq(&kmmio_lock);
-
- if (unreg)
- unregister_die_notifier(&nb_die); /* calls sync_rcu() */
- mutex_unlock(&kmmio_init_mutex);
-}
-EXPORT_SYMBOL(unreference_kmmio);
-
/*
* this is basically a dynamic stabbing problem:
* Could use the existing prio tree code or
@@ -167,58 +104,56 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
return NULL;
}

-/** Mark the given page as not present. Access to it will trigger a fault. */
-static void arm_kmmio_fault_page(unsigned long page, int *page_level)
+static void set_page_present(unsigned long addr, bool present, int *pglevel)
{
- unsigned long address = page & PAGE_MASK;
+ pteval_t pteval;
+ pmdval_t pmdval;
int level;
- pte_t *pte = lookup_address(address, &level);
+ pmd_t *pmd;
+ pte_t *pte = lookup_address(addr, &level);

if (!pte) {
- pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
- __func__, page);
+ pr_err("kmmio: no pte for page 0x%08lx\n", addr);
return;
}

- if (level == PG_LEVEL_2M) {
- pmd_t *pmd = (pmd_t *)pte;
- set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT));
- } else {
- /* PG_LEVEL_4K */
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ if (pglevel)
+ *pglevel = level;
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ pmd = (pmd_t *)pte;
+ pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
+ if (present)
+ pmdval |= _PAGE_PRESENT;
+ set_pmd(pmd, __pmd(pmdval));
+ break;
+
+ case PG_LEVEL_4K:
+ pteval = pte_val(*pte) & ~_PAGE_PRESENT;
+ if (present)
+ pteval |= _PAGE_PRESENT;
+ set_pte_atomic(pte, __pte(pteval));
+ break;
+
+ default:
+ pr_err("kmmio: unexpected page level 0x%x.\n", level);
+ return;
}

- if (page_level)
- *page_level = level;
+ __flush_tlb_one(addr);
+}

- __flush_tlb_one(page);
+/** Mark the given page as not present. Access to it will trigger a fault. */
+static void arm_kmmio_fault_page(unsigned long page, int *page_level)
+{
+ set_page_present(page & PAGE_MASK, false, page_level);
}

/** Mark the given page as present. */
static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
{
- unsigned long address = page & PAGE_MASK;
- int level;
- pte_t *pte = lookup_address(address, &level);
-
- if (!pte) {
- pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
- __func__, page);
- return;
- }
-
- if (level == PG_LEVEL_2M) {
- pmd_t *pmd = (pmd_t *)pte;
- set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT));
- } else {
- /* PG_LEVEL_4K */
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- }
-
- if (page_level)
- *page_level = level;
-
- __flush_tlb_one(page);
+ set_page_present(page & PAGE_MASK, true, page_level);
}

/*
@@ -240,6 +175,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
+ int ret = 0; /* default to fault not handled */

/*
* Preemption is now disabled to prevent process switch during
@@ -257,21 +193,35 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
/*
* Either this page fault is not caused by kmmio, or
* another CPU just pulled the kmmio probe from under
- * our feet. In the latter case all hell breaks loose.
+ * our feet. The latter case should not be possible.
*/
goto no_kmmio;
}

ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
+ disarm_kmmio_fault_page(faultpage->page, NULL);
+ if (addr == ctx->addr) {
+ /*
+ * On SMP we sometimes get recursive probe hits on the
+ * same address. Context is already saved, fall out.
+ */
+ pr_debug("kmmio: duplicate probe hit on CPU %d, for "
+ "address 0x%08lx.\n",
+ smp_processor_id(), addr);
+ ret = 1;
+ goto no_kmmio_ctx;
+ }
/*
* Prevent overwriting already in-flight context.
- * If this page fault really was due to kmmio trap,
- * all hell breaks loose.
+ * This should not happen, let's hope disarming at least
+ * prevents a panic.
*/
pr_emerg("kmmio: recursive probe hit on CPU %d, "
"for address 0x%08lx. Ignoring.\n",
smp_processor_id(), addr);
+ pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+ ctx->addr);
goto no_kmmio_ctx;
}
ctx->active++;
@@ -302,14 +252,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
*/

put_cpu_var(kmmio_ctx);
- return 1;
+ return 1; /* fault handled */

no_kmmio_ctx:
put_cpu_var(kmmio_ctx);
no_kmmio:
rcu_read_unlock();
preempt_enable_no_resched();
- return 0; /* page fault not handled by kmmio */
+ return ret;
}

/*
@@ -322,8 +272,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
int ret = 0;
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);

- if (!ctx->active)
+ if (!ctx->active) {
+ pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+ smp_processor_id());
goto out;
+ }

if (ctx->probe && ctx->probe->post_handler)
ctx->probe->post_handler(ctx->probe, condition, regs);
@@ -525,9 +478,22 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
{
struct die_args *arg = args;

- if (val == DIE_DEBUG)
+ if (val == DIE_DEBUG && (arg->err & DR_STEP))
if (post_kmmio_handler(arg->err, arg->regs) == 1)
return NOTIFY_STOP;

return NOTIFY_DONE;
}
+
+static struct notifier_block nb_die = {
+ .notifier_call = kmmio_die_notifier
+};
+
+static int __init init_kmmio(void)
+{
+ int i;
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kmmio_page_table[i]);
+ return register_die_notifier(&nb_die);
+}
+fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 62abc28..8256546 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -415,8 +415,6 @@ void enable_mmiotrace(void)
if (is_enabled())
goto out;

- reference_kmmio();
-
#if 0 /* XXX: tracing does not support text entries */
marker_file = debugfs_create_file("marker", 0660, dir, NULL,
&fops_marker);
@@ -448,7 +446,6 @@ void disable_mmiotrace(void)
spin_unlock_irq(&trace_lock);

clear_trace_list(); /* guarantees: no more kmmio callbacks */
- unreference_kmmio();
if (marker_file) {
debugfs_remove(marker_file);
marker_file = NULL;
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index c88a9c1..dd6b64b 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -31,8 +31,6 @@ static inline int is_kmmio_active(void)
return kmmio_count;
}

-extern void reference_kmmio(void);
-extern void unreference_kmmio(void);
extern int register_kmmio_probe(struct kmmio_probe *p);
extern void unregister_kmmio_probe(struct kmmio_probe *p);

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 3a12b1a..361472b 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -8,6 +8,7 @@

#include <linux/kernel.h>
#include <linux/mmiotrace.h>
+#include <linux/pci.h>

#include "trace.h"

@@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr)
}
}

+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+ int ret = 0;
+ int i;
+ resource_size_t start, end;
+ const struct pci_driver *drv = pci_dev_driver(dev);
+
+ /* XXX: incomplete checks for trace_seq_printf() return value */
+ ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
+ /*
+ * XXX: is pci_resource_to_user() appropriate, since we are
+ * supposed to interpret the __ioremap() phys_addr argument based on
+ * these printed values?
+ */
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ (unsigned long long)(start |
+ (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+ }
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ dev->resource[i].start < dev->resource[i].end ?
+ (unsigned long long)(end - start) + 1 : 0);
+ }
+ if (drv)
+ ret += trace_seq_printf(s, " %s\n", drv->name);
+ else
+ ret += trace_seq_printf(s, " \n");
+ return ret;
+}
+
/* XXX: This is not called for trace_pipe file! */
-void mmio_print_header(struct trace_iterator *iter)
+static void mmio_print_header(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
- trace_seq_printf(s, "VERSION broken 20070824\n");
- /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */
+ struct pci_dev *dev = NULL;
+
+ trace_seq_printf(s, "VERSION 20070824\n");
+
+ for_each_pci_dev(dev)
+ mmio_print_pcidev(s, dev);
+ /* XXX: return value? What if header is very long? */
}

static int mmio_print_rw(struct trace_iterator *iter)