2009-03-07 05:09:46

by K.Prasad

Subject: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

This patch adds an ftrace plugin to detect and profile memory access over
kernel variables. It uses HW Breakpoint interfaces to 'watch' memory
addresses.
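
For example (a usage sketch; it assumes debugfs is mounted at
/sys/kernel/debug and that CONFIG_KALLSYMS_ALL is enabled so that a data
symbol such as 'pid_max' appears in /proc/kallsyms):

  # select the tracer, then watch reads and writes on pid_max
  echo ksym_tracer > /sys/kernel/debug/tracing/current_tracer
  echo "pid_max:rw-" > /sys/kernel/debug/tracing/ksym_trace_filter

  # look at the recorded accesses, then clear the breakpoint again
  cat /sys/kernel/debug/tracing/trace
  echo "pid_max:---" > /sys/kernel/debug/tracing/ksym_trace_filter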

Signed-off-by: K.Prasad <[email protected]>
---
kernel/trace/Kconfig | 6
kernel/trace/Makefile | 1
kernel/trace/trace.h | 16 +
kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
kernel/trace/trace_selftest.c | 36 +++
5 files changed, 507 insertions(+)

Index: linux-2.6-tip/kernel/trace/Kconfig
===================================================================
--- linux-2.6-tip.orig/kernel/trace/Kconfig
+++ linux-2.6-tip/kernel/trace/Kconfig
@@ -249,6 +249,12 @@ config POWER_TRACER
power management decisions, specifically the C-state and P-state
behavior.

+config KSYM_TRACER
+ bool "Trace read and write access on kernel memory locations"
+ select TRACING
+ help
+ This tracer helps find read and write operations on any given kernel
+ symbol, i.e. any symbol listed in /proc/kallsyms.

config STACK_TRACER
bool "Trace max stack"
Index: linux-2.6-tip/kernel/trace/Makefile
===================================================================
--- linux-2.6-tip.orig/kernel/trace/Makefile
+++ linux-2.6-tip/kernel/trace/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_EVENT_TRACER) += trace_events.o
obj-$(CONFIG_EVENT_TRACER) += events.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o

libftrace-y := ftrace.o
Index: linux-2.6-tip/kernel/trace/trace.h
===================================================================
--- linux-2.6-tip.orig/kernel/trace/trace.h
+++ linux-2.6-tip/kernel/trace/trace.h
@@ -12,6 +12,8 @@
#include <trace/kmemtrace.h>
#include <trace/power.h>

+#include <asm/hw_breakpoint.h>
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,

@@ -34,6 +36,7 @@ enum trace_type {
TRACE_KMEM_FREE,
TRACE_POWER,
TRACE_BLK,
+ TRACE_KSYM,

__TRACE_LAST_TYPE,
};
@@ -191,6 +194,16 @@ struct kmemtrace_free_entry {
const void *ptr;
};

+struct trace_ksym {
+ struct trace_entry ent;
+ struct hw_breakpoint *ksym_hbkpt;
+ unsigned long ksym_addr;
+ unsigned long ip;
+ struct hlist_node ksym_hlist;
+ char ksym_name[KSYM_NAME_LEN];
+ char p_name[TASK_COMM_LEN];
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -303,6 +316,7 @@ extern void __ftrace_bad_type(void);
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \
+ IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
__ftrace_bad_type(); \
} while (0)

@@ -564,6 +578,8 @@ extern int trace_selftest_startup_syspro
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+ struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */

extern void *head_page(struct trace_array_cpu *data);
Index: linux-2.6-tip/kernel/trace/trace_ksym.c
===================================================================
--- /dev/null
+++ linux-2.6-tip/kernel/trace/trace_ksym.c
@@ -0,0 +1,448 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/* For now, let us restrict the no. of symbols traced simultaneously to number
+ * of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HB_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1)
+
+#ifdef CONFIG_FTRACE_SELFTEST
+
+static int ksym_selftest_dummy;
+#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
+
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+static struct trace_array *ksym_trace_array;
+
+DEFINE_MUTEX(ksym_tracer_mutex);
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+/* HW Breakpoint related callback functions */
+void ksym_hbkpt_installed(struct hw_breakpoint *temp, struct pt_regs
+ *temp_regs)
+{
+}
+
+void ksym_hbkpt_uninstalled(struct hw_breakpoint *temp, struct
+ pt_regs * temp_regs)
+{
+}
+
+void ksym_hbkpt_handler(struct hw_breakpoint *hbkpt, struct pt_regs *regs)
+{
+ struct ring_buffer_event *event;
+ struct trace_array *tr;
+ struct trace_ksym *entry;
+ int pc;
+
+ if (!ksym_tracing_enabled)
+ return;
+
+ tr = ksym_trace_array;
+ pc = preempt_count();
+
+ event = trace_buffer_lock_reserve(tr, TRACE_KSYM,
+ sizeof(*entry), 0, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ strlcpy(entry->ksym_name, hbkpt->info.name, KSYM_NAME_LEN);
+ entry->ksym_hbkpt = hbkpt;
+ entry->ip = instruction_pointer(regs);
+ strlcpy(entry->p_name, current->comm, TASK_COMM_LEN);
+
+ trace_buffer_unlock_commit(tr, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *access_str)
+{
+ int pos, access = 0;
+
+ for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) {
+ switch (access_str[pos]) {
+ case 'r':
+ access += (pos == 0) ? 4 : -1;
+ break;
+ case 'w':
+ access += (pos == 1) ? 2 : -1;
+ break;
+ case '-':
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (access) {
+ case 6:
+ access = HW_BREAKPOINT_RW;
+ break;
+ case 2:
+ access = HW_BREAKPOINT_WRITE;
+ break;
+ case 0:
+ access = 0;
+ }
+
+ return access;
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ * i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ * <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+ unsigned long *addr)
+{
+ char *delimiter = ":";
+ int ret;
+
+ ret = -EINVAL;
+ *ksymname = strsep(&input_string, delimiter);
+ *addr = kallsyms_lookup_name(*ksymname);
+
+ /* Check for malformed request: (2), (1) and (5) */
+ if ((!input_string) ||
+ (strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) ||
+ (*addr == 0))
+ goto return_code;
+ ret = ksym_trace_get_access_type(input_string);
+
+return_code:
+ return ret;
+}
+
+static int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+ struct trace_ksym *entry;
+ int ret;
+
+ if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+ printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+ " new requests for tracing can be accepted now.\n",
+ KSYM_TRACER_MAX);
+ return -ENOSPC;
+ }
+
+ entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->ksym_hbkpt = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
+ if (!entry->ksym_hbkpt) {
+ kfree(entry);
+ return -ENOMEM;
+ }
+
+ entry->ksym_hbkpt->info.name = ksymname;
+ entry->ksym_hbkpt->info.type = op;
+ entry->ksym_addr = entry->ksym_hbkpt->info.address = addr;
+ entry->ksym_hbkpt->info.len = HW_BREAKPOINT_LEN_4;
+ entry->ksym_hbkpt->priority = HW_BREAKPOINT_PRIO_NORMAL;
+
+ entry->ksym_hbkpt->installed = (void *)ksym_hbkpt_installed;
+ entry->ksym_hbkpt->uninstalled = (void *)ksym_hbkpt_uninstalled;
+ entry->ksym_hbkpt->triggered = (void *)ksym_hbkpt_handler;
+
+ ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+ if (ret <= 0) {
+ printk(KERN_INFO "ksym_tracer request failed. Try again"
+ " later!!\n");
+ if (ret == 0)
+ unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+ kfree(entry->ksym_hbkpt);
+ kfree(entry);
+ return -EAGAIN;
+ }
+ hlist_add_head(&(entry->ksym_hlist), &ksym_filter_head);
+ ksym_filter_entry_count++;
+
+ return 0;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node;
+ char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX];
+ ssize_t ret, cnt = 0;
+
+ mutex_lock(&ksym_tracer_mutex);
+
+ hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+ cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:",
+ entry->ksym_hbkpt->info.name);
+ if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_WRITE)
+ cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+ "-w-\n");
+ else if (entry->ksym_hbkpt->info.type == HW_BREAKPOINT_RW)
+ cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt,
+ "rw-\n");
+ }
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf));
+ mutex_unlock(&ksym_tracer_mutex);
+
+ return ret;
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node;
+ char *input_string, *ksymname = NULL;
+ unsigned long ksym_addr = 0;
+ int ret, op, changed = 0;
+
+ /* Ignore echo "" > ksym_trace_filter */
+ if (count == 0)
+ return 0;
+
+ input_string = kzalloc(count, GFP_KERNEL);
+ if (!input_string)
+ return -ENOMEM;
+
+ if (copy_from_user(input_string, buffer, count)) {
+ kfree(input_string);
+ return -EFAULT;
+ }
+
+ ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+ if (ret < 0) {
+ kfree(input_string);
+ return ret;
+ }
+
+ mutex_lock(&ksym_tracer_mutex);
+
+ ret = -EINVAL;
+ hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+ if (entry->ksym_addr == ksym_addr) {
+ /* Check for malformed request: (6) */
+ if (entry->ksym_hbkpt->info.type != op)
+ changed = 1;
+ else
+ goto err_ret;
+ break;
+ }
+ }
+ if (changed) {
+ unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+ entry->ksym_hbkpt->info.type = op;
+ if (op > 0) {
+ ret = register_kernel_hw_breakpoint(entry->ksym_hbkpt);
+ if (ret > 0) {
+ ret = count;
+ goto unlock_ret_path;
+ }
+ if (ret == 0) {
+ ret = -ENOSPC;
+ unregister_kernel_hw_breakpoint(entry->\
+ ksym_hbkpt);
+ }
+ }
+ ksym_filter_entry_count--;
+ hlist_del(&(entry->ksym_hlist));
+ kfree(entry->ksym_hbkpt);
+ kfree(entry);
+ ret = count;
+ goto err_ret;
+ } else {
+ /* Check for malformed request: (4) */
+ if (op == 0)
+ goto err_ret;
+ ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+ if (ret)
+ goto err_ret;
+ }
+ ret = count;
+ goto unlock_ret_path;
+
+err_ret:
+ kfree(input_string);
+
+unlock_ret_path:
+ mutex_unlock(&ksym_tracer_mutex);
+ return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+ .open = tracing_open_generic,
+ .read = ksym_trace_filter_read,
+ .write = ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+ struct trace_ksym *entry;
+ struct hlist_node *node, *node1;
+
+ ksym_tracing_enabled = 0;
+
+ mutex_lock(&ksym_tracer_mutex);
+ hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+ ksym_hlist) {
+ unregister_kernel_hw_breakpoint(entry->ksym_hbkpt);
+ ksym_filter_entry_count--;
+ hlist_del(&(entry->ksym_hlist));
+/* Free the 'input_string' only if reset after startup self-test */
+#ifdef CONFIG_FTRACE_SELFTEST
+ if (strncmp(entry->ksym_hbkpt->info.name, KSYM_SELFTEST_ENTRY,
+ strlen(KSYM_SELFTEST_ENTRY)) != 0)
+#endif /* CONFIG_FTRACE_SELFTEST*/
+ kfree(entry->ksym_hbkpt->info.name);
+ kfree(entry->ksym_hbkpt);
+ kfree(entry);
+ }
+ mutex_unlock(&ksym_tracer_mutex);
+
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+ int cpu, ret = 0;
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+
+ ksym_tracing_enabled = 1;
+ ksym_trace_array = tr;
+
+#ifdef CONFIG_FTRACE_SELFTEST
+ /* Check if we are re-entering self-test code during initialisation */
+ if (ksym_selftest_dummy)
+ goto ret_path;
+
+ ksym_selftest_dummy = 0;
+
+ /* Register the read-write tracing request */
+ ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+ (unsigned long)(&ksym_selftest_dummy));
+
+ if (ret < 0) {
+ printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+ goto ret_path;
+ }
+ /* Perform a read and a write operation over the dummy variable to
+ * trigger the tracer
+ */
+ if (ksym_selftest_dummy == 0)
+ ksym_selftest_dummy++;
+
+ret_path:
+#endif /* CONFIG_FTRACE_SELFTEST */
+
+ return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+
+ seq_puts(m,
+ "# TASK-PID CPU# Symbol Type "
+ "Function \n");
+ seq_puts(m,
+ "# | | | | "
+ "| \n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct trace_ksym *field;
+ char str[KSYM_SYMBOL_LEN];
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name,
+ entry->pid, iter->cpu, field->ksym_name);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ switch (field->ksym_hbkpt->info.type) {
+ case HW_BREAKPOINT_WRITE:
+ ret = trace_seq_printf(s, " W ");
+ break;
+ case HW_BREAKPOINT_RW:
+ ret = trace_seq_printf(s, " RW ");
+ break;
+ default:
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ sprint_symbol(str, field->ip);
+ ret = trace_seq_printf(s, "%-20s\n", str);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+ .name = "ksym_tracer",
+ .init = ksym_trace_init,
+ .reset = ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_ksym,
+#endif
+ .print_header = ksym_trace_print_header,
+ .print_line = ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ ksym_filter_entry_count = 0;
+
+ entry = debugfs_create_file("ksym_trace_filter", 0666, d_tracer,
+ NULL, &ksym_tracing_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'ksym_trace_filter' file\n");
+
+ return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
Index: linux-2.6-tip/kernel/trace/trace_selftest.c
===================================================================
--- linux-2.6-tip.orig/kernel/trace/trace_selftest.c
+++ linux-2.6-tip/kernel/trace/trace_selftest.c
@@ -15,6 +15,7 @@ static inline int trace_valid_entry(stru
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
+ case TRACE_KSYM:
return 1;
}
return 0;
@@ -693,3 +694,38 @@ trace_selftest_startup_branch(struct tra
return ret;
}
#endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ /* read & write operations - one each is performed on the dummy variable
+ * triggering two entries in the trace buffer
+ */
+ if (!ret && count != 2) {
+ printk(KERN_CONT "Ksym tracer startup test failed");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */


2009-03-07 14:54:47

by KOSAKI Motohiro

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

Hi

> This patch adds an ftrace plugin to detect and profile memory access over
> kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> addresses.
>
> Signed-off-by: K.Prasad <[email protected]>
> ---
> kernel/trace/Kconfig | 6
> kernel/trace/Makefile | 1
> kernel/trace/trace.h | 16 +
> kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/trace_selftest.c | 36 +++
> 5 files changed, 507 insertions(+)

Could you please update Documentation/ftrace.txt?
I guess many users will be interested in this patch. :)



2009-03-07 18:21:27

by K.Prasad

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

On Sat, Mar 07, 2009 at 11:53:46PM +0900, KOSAKI Motohiro wrote:
> Hi
>
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> >
> > Signed-off-by: K.Prasad <[email protected]>
> > ---
> > kernel/trace/Kconfig | 6
> > kernel/trace/Makefile | 1
> > kernel/trace/trace.h | 16 +
> > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > kernel/trace/trace_selftest.c | 36 +++
> > 5 files changed, 507 insertions(+)
>
> Could you please update Documentation/ftrace.txt?
> I guess many user interesting this patch. :)
>

Sure. We plan to add a documentation file for the hardware breakpoint
interfaces (something like Documentation/hw_breakpoint.txt). I shall
update Documentation/ftrace.txt with usage details for the kernel
symbol tracing plugin along with it.

Thanks,
K.Prasad

2009-03-08 10:10:24

by Ingo Molnar

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2


* KOSAKI Motohiro <[email protected]> wrote:

> Hi
>
> > This patch adds an ftrace plugin to detect and profile memory access over
> > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > addresses.
> >
> > Signed-off-by: K.Prasad <[email protected]>
> > ---
> > kernel/trace/Kconfig | 6
> > kernel/trace/Makefile | 1
> > kernel/trace/trace.h | 16 +
> > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > kernel/trace/trace_selftest.c | 36 +++
> > 5 files changed, 507 insertions(+)
>
> Could you please update Documentation/ftrace.txt?
> I guess many user interesting this patch. :)

Yeah, it has become a really nice feature this way. As I told
K.Prasad before: we need this tracer because the data tracer
will likely become the most common use case of this facility. We
will get the hw breakpoints facility tested and used.

And in fact we can go one step further: it would also be nice to
wire it up with the ftrace histogram code, so that we can get
usage histograms of kernel symbol read/write activity without
the overhead of tracing. (The branch tracer already has this.)

Especially frequently used variables generate a _lot_ of events.

Ingo

2009-03-08 11:00:56

by Frederic Weisbecker

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

On Sun, Mar 08, 2009 at 11:09:29AM +0100, Ingo Molnar wrote:
>
> * KOSAKI Motohiro <[email protected]> wrote:
>
> > Hi
> >
> > > This patch adds an ftrace plugin to detect and profile memory access over
> > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > addresses.
> > >
> > > Signed-off-by: K.Prasad <[email protected]>
> > > ---
> > > kernel/trace/Kconfig | 6
> > > kernel/trace/Makefile | 1
> > > kernel/trace/trace.h | 16 +
> > > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > > kernel/trace/trace_selftest.c | 36 +++
> > > 5 files changed, 507 insertions(+)
> >
> > Could you please update Documentation/ftrace.txt?
> > I guess many user interesting this patch. :)
>
> Yeah, it has become a really nice feature this way. As i told it
> to K.Prasad before: we need this tracer because the data tracer
> will likely become the most common usecase of this facility. We
> will get the hw breakpoints facility tested and used.
>
> And in fact we can go one step further: it would also be nice to
> wire it up with the ftrace histogram code: so that we can get
> usage histograms of kernel symbol read/write activities without
> the overhead of tracing. (The branch tracer already has this.)
>
> Especially frequently used variables generate a _lot_ of events.
>
> Ingo

Right, it will even be an occasion to improve and further test
the histogram tracing.
K.Prasad, if you need some help on how to use it, don't hesitate to ask.

Frederic.

2009-03-09 21:37:05

by K.Prasad

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

On Sun, Mar 08, 2009 at 11:09:29AM +0100, Ingo Molnar wrote:
>
> * KOSAKI Motohiro <[email protected]> wrote:
>
> > Hi
> >
> > > This patch adds an ftrace plugin to detect and profile memory access over
> > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > addresses.
> > >
> > > Signed-off-by: K.Prasad <[email protected]>
> > > ---
> > > kernel/trace/Kconfig | 6
> > > kernel/trace/Makefile | 1
> > > kernel/trace/trace.h | 16 +
> > > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > > kernel/trace/trace_selftest.c | 36 +++
> > > 5 files changed, 507 insertions(+)
> >
> > Could you please update Documentation/ftrace.txt?
> > I guess many user interesting this patch. :)
>
> Yeah, it has become a really nice feature this way. As i told it
> to K.Prasad before: we need this tracer because the data tracer
> will likely become the most common usecase of this facility. We
> will get the hw breakpoints facility tested and used.
>
> And in fact we can go one step further: it would also be nice to
> wire it up with the ftrace histogram code: so that we can get
> usage histograms of kernel symbol read/write activities without
> the overhead of tracing. (The branch tracer already has this.)
>
> Especially frequently used variables generate a _lot_ of events.
>
> Ingo

I am looking at the histogram infrastructure and its implementation in
the branch tracer in order to do the same in the ksym tracer. Meanwhile I'm
wondering whether the patchset can be included in the -tip tree for a good
amount of testing and feedback from the community before it makes its way
into mainline (2.6.30?).

I would be glad if you could pick up the patches for the -tip tree.

Thanks,
K.Prasad

2009-03-10 12:21:21

by K.Prasad

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

On Sun, Mar 08, 2009 at 12:00:40PM +0100, Frederic Weisbecker wrote:
> On Sun, Mar 08, 2009 at 11:09:29AM +0100, Ingo Molnar wrote:
> >
> > * KOSAKI Motohiro <[email protected]> wrote:
> >
> > > Hi
> > >
> > > > This patch adds an ftrace plugin to detect and profile memory access over
> > > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > > addresses.
> > > >
> > > > Signed-off-by: K.Prasad <[email protected]>
> > > > ---
> > > > kernel/trace/Kconfig | 6
> > > > kernel/trace/Makefile | 1
> > > > kernel/trace/trace.h | 16 +
> > > > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > > > kernel/trace/trace_selftest.c | 36 +++
> > > > 5 files changed, 507 insertions(+)
> > >
> > > Could you please update Documentation/ftrace.txt?
> > > I guess many user interesting this patch. :)
> >
> > Yeah, it has become a really nice feature this way. As i told it
> > to K.Prasad before: we need this tracer because the data tracer
> > will likely become the most common usecase of this facility. We
> > will get the hw breakpoints facility tested and used.
> >
> > And in fact we can go one step further: it would also be nice to
> > wire it up with the ftrace histogram code: so that we can get
> > usage histograms of kernel symbol read/write activities without
> > the overhead of tracing. (The branch tracer already has this.)
> >
> > Especially frequently used variables generate a _lot_ of events.
> >
> > Ingo
>
> Right, it will even be an occasion to improve and further test
> the histogram tracing.
> K. Prasad if you need some help on how to use it, don't hesitate to tell.
>
> Frederic.
>

Hi Frederic,
Thanks for the offer of help.

As I try to make the ksym tracer generate histogram information, I see a few
challenges and would like to know your thoughts on them.

- Unlike the branch tracer, which stores the branch stats in statically
declared data structures ('struct ftrace_branch_data'), the ksym tracer
generates data at runtime (during every memory access of interest on
the target variable) and would require dynamic memory allocation. Let's
consider the case where we want the information shown below in the
histogram:

Access_Type Symbol_name Function Counter
------------ ----------- -------- -------
W Sym_A Fn_A 10
W Sym_A Fn_B 15
RW Sym_C Fn_C 20

We need a data structure to store the above information, and a new
instance of it for every new Fn_X that accesses Sym_X, while all this
information is captured in the context of the hardware breakpoint
exception. I am not sure that dynamically allocating GFP_ATOMIC memory for
such a large requirement is a good idea.

- Alternatively, we could statically declare a data section (as done by
the branch tracer). It would have to be accompanied by code that checks
whether we have reached the end of the section and wraps the pointer
around. In effect, it would become a ring buffer containing only
statistics about the 'snapshot' in the buffer, and not historically
aggregated data.

- Removal of the 'Function' column to display only aggregate 'hit'
statistics would help reduce the complexity to a large extent, as the
counter can be embedded in the data structures containing
'ksym_trace_filter' information. But indeed, we are trading useful
information for simplicity.

- Perhaps your proposed enhancements to the 'trace_stat' infrastructure,
say a generic buffering mechanism to store histogram-related
information (or a framework to read through data in the ring buffer),
would help solve many of the issues. Or is the 'histogram' best done in
user space?

Thanks,
K.Prasad
P.S.: You can refer to me as 'Prasad', although I sign as above, which is
patronymic nomenclature
(http://en.wikipedia.org/wiki/Patronymic_name#Indian_subcontinent).
Here's an illustration from another IBMer:
http://www.almaden.ibm.com/u/mohan/#name :-)

2009-03-10 19:56:19

by Frederic Weisbecker

Subject: Re: [Patch 11/11] ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces - v2

On Tue, Mar 10, 2009 at 05:51:02PM +0530, K.Prasad wrote:
> On Sun, Mar 08, 2009 at 12:00:40PM +0100, Frederic Weisbecker wrote:
> > On Sun, Mar 08, 2009 at 11:09:29AM +0100, Ingo Molnar wrote:
> > >
> > > * KOSAKI Motohiro <[email protected]> wrote:
> > >
> > > > Hi
> > > >
> > > > > This patch adds an ftrace plugin to detect and profile memory access over
> > > > > kernel variables. It uses HW Breakpoint interfaces to 'watch memory
> > > > > addresses.
> > > > >
> > > > > Signed-off-by: K.Prasad <[email protected]>
> > > > > ---
> > > > > kernel/trace/Kconfig | 6
> > > > > kernel/trace/Makefile | 1
> > > > > kernel/trace/trace.h | 16 +
> > > > > kernel/trace/trace_ksym.c | 448 ++++++++++++++++++++++++++++++++++++++++++
> > > > > kernel/trace/trace_selftest.c | 36 +++
> > > > > 5 files changed, 507 insertions(+)
> > > >
> > > > Could you please update Documentation/ftrace.txt?
> > > > I guess many user interesting this patch. :)
> > >
> > > Yeah, it has become a really nice feature this way. As i told it
> > > to K.Prasad before: we need this tracer because the data tracer
> > > will likely become the most common usecase of this facility. We
> > > will get the hw breakpoints facility tested and used.
> > >
> > > And in fact we can go one step further: it would also be nice to
> > > wire it up with the ftrace histogram code: so that we can get
> > > usage histograms of kernel symbol read/write activities without
> > > the overhead of tracing. (The branch tracer already has this.)
> > >
> > > Especially frequently used variables generate a _lot_ of events.
> > >
> > > Ingo
> >
> > Right, it will even be an occasion to improve and further test
> > the histogram tracing.
> > K. Prasad if you need some help on how to use it, don't hesitate to tell.
> >
> > Frederic.
> >
>
> Hi Frederic,
> Thanks for the offer of help.
>
> As I try to get ksym tracer generate histogram information, I see a few
> challenges and would like to know your thoughts about them.
>
> - Unlike branch tracer which stores the branch stats in statically
> declared data structures ('struct ftrace_branch_data'), ksym
> generates data at runtime (during every memory access of interest on
> the target variable) and would require dynamic memory allocation. Let's
> consider the case where we want the information shown below in the
> histogram:
>
> Access_Type Symbol_name Function Counter
> ------------ ----------- -------- -------
> W Sym_A Fn_A 10
> W Sym_A Fn_B 15
> RW Sym_C Fn_C 20
>
> We need a data structure to store the above information and a new
> instance of it for every new Fn_X that accesses Sym_X, while all this
> information captured in the context of the hardware breakpoint
> exception. I am not sure if dynamically allocating GFP_ATOMIC memory for
> such a huge requirement is a good idea.


Ah, I see... And the number of functions that will dereference it is not
predictable...


> - Alternatively if we choose to statically declare a data section (as
> done by the branch tracer). It should be accompanied by code to check
> if we reached the end of section and wrap the pointer around. In effect, it
> would become a ring-buffer containing only statistics about the
> 'snapshot' in the buffer, and not historically aggregated data.


That's possible, but we could lose interesting cases of functions that didn't
dereference a variable for a while but did so often, because they could be
overridden by others. Anyway, it seems a good idea.


> - Removal of the 'Function' column to display only aggregate 'hit'
> statistics would help reduce the complexity to a large extent, as the
> counter can be embedded in the data structures containing
> 'ksym_trace_filter' information. But indeed, we are trading useful
> information for simplicity.


As you said, it would be too much of a loss of useful information.


> - Perhaps your proposed enhancements to the 'trace_stat' infrastructure,
> say - a generic buffering mechanism to store histogram related
> information (or a framework to read through data in the ring-buffer)
> would help solve many of the issues. Or is 'histogram' best done in
> user-space?


For now it's just something that provides some abstraction over the seq_file
and sorting facilities.
If it can be enhanced in any way that helps, that would be great. But
I don't know how I could write something generic enough to support any kind
of problem that matches yours in its pattern.

In such a case, the stat tracing looks a bit like regular event tracing: we
don't want to allocate memory for all entries because of the different context
sources and because of the allocation overhead, but we still want to store
all of the events.
Such a thing could rely on the ring buffer that we are using, but we would need
to create a sort of private instance of the ring buffer, because we would store
our hits and then read them all back from the ring buffer directly to get the
stat sums.
I'm not sure we can create such private instances yet.

For now I would suggest pre-allocating a set of entries for each breakpoint,
using a predefined number (I call it n here) of entries, and counting the hits
for the first n functions that trap on the breakpoint.
If you miss some functions because n is too small, then increment an overrun
variable for the current breakpoint that you can display with the stats,
so that the user knows that some hits were missed.
I guess it could be accompanied by a file to change the value of n.
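
Something along these lines would do it, as a minimal sketch (the structure
and function names, and the value of n, are made up for illustration; they
are not taken from the patch):

/*
 * Hypothetical per-breakpoint hit table; KSYM_HIST_ENTRIES plays the
 * role of "n" above and could be made tunable through a debugfs file.
 */
#define KSYM_HIST_ENTRIES 16

struct ksym_hist_entry {
	unsigned long	ip;	/* caller that touched the symbol */
	unsigned long	hits;
};

struct ksym_hist {
	struct ksym_hist_entry	entries[KSYM_HIST_ENTRIES];
	unsigned int		used;		/* slots taken so far */
	unsigned long		overrun;	/* hits we could not attribute */
};

/* called from the breakpoint handler; no allocation needed there */
static void ksym_hist_hit(struct ksym_hist *hist, unsigned long ip)
{
	unsigned int i;

	for (i = 0; i < hist->used; i++) {
		if (hist->entries[i].ip == ip) {
			hist->entries[i].hits++;
			return;
		}
	}
	if (hist->used < KSYM_HIST_ENTRIES) {
		hist->entries[hist->used].ip = ip;
		hist->entries[hist->used].hits = 1;
		hist->used++;
		return;
	}
	hist->overrun++;
}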

It's just an opinion anyway.


> Thanks,
> K.Prasad
> P.S.: You can refer me as 'Prasad' although I sign as above, which is a
> patronymic nomenclature
> (http://en.wikipedia.org/wiki/Patronymic_name#Indian_subcontinent).
> Here's an illustration from another IBMer:
> http://www.almaden.ibm.com/u/mohan/#name :-)


Ok :-)

Thanks.