2008-12-03 00:58:24

by Jiaying Zhang

[permalink] [raw]
Subject: [RFC PATCH 3/3] kernel tracing prototype

Refer to the previous email
"[RFC PATCH 0/3] A couple of feature requests to the unified trace buffer".

A kernel tracing prototype that uses the unified trace buffer to keep
the collected trace data.

Index: linux-2.6.26.2/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-2.6.26.2.orig/include/asm-generic/vmlinux.lds.h
2008-11-25 11:32:26.000000000 -0800
+++ linux-2.6.26.2/include/asm-generic/vmlinux.lds.h 2008-11-25
11:32:37.000000000 -0800
@@ -50,6 +50,9 @@
MEM_KEEP(init.data) \
MEM_KEEP(exit.data) \
. = ALIGN(8); \
+ VMLINUX_SYMBOL(__start___ktraces) = .; \
+ *(__ktrace) \
+ VMLINUX_SYMBOL(__stop___ktraces) = .; \
VMLINUX_SYMBOL(__start___markers) = .; \
*(__markers) \
VMLINUX_SYMBOL(__stop___markers) = .;
@@ -60,6 +63,7 @@
VMLINUX_SYMBOL(__start_rodata) = .; \
*(.rodata) *(.rodata.*) \
*(__vermagic) /* Kernel version magic */ \
+ *(__ktrace_strings) /* Ktrace: strings */ \
*(__markers_strings) /* Markers: strings */ \
} \
\
Index: linux-2.6.26.2/include/linux/ktrace.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.26.2/include/linux/ktrace.h 2008-11-25
11:32:37.000000000 -0800
@@ -0,0 +1,73 @@
+#ifndef _LINUX_KTRACE_H
+#define _LINUX_KTRACE_H
+
+#include <linux/types.h>
+
+struct kernel_trace;
+
+typedef void ktrace_probe_func(struct kernel_trace *, void *, size_t);
+
+struct kernel_trace {
+ const char *name;
+ const char *format; /* format string describing variable list */
+ u32 enabled:1, event_id:31;
+ ktrace_probe_func *func; /* probe function */
+ struct list_head list; /* list head linked to the hash table entry */
+} __attribute__((aligned(8)));
+
+extern int ktrace_enabled;
+
+/*
+ * Make sure the alignment of the structure in the __ktrace section will
+ * not add unwanted padding between the beginning of the section and the
+ * structure. Force alignment to the same alignment as the section start.
+ */
+
+#define DEFINE_KTRACE_STRUCT(name) struct ktrace_struct_##name
+
+#ifdef CONFIG_KTRACE
+#define DO_TRACE(name, format, args...)
\
+ do { \
+ static const char __kstrtab_##name[] \
+ __attribute__((section("__ktrace_strings"))) \
+ = #name "\0" format; \
+ static struct kernel_trace __ktrace_##name \
+ __attribute__((section("__ktrace"), aligned(8))) = \
+ { __kstrtab_##name, &__kstrtab_##name[sizeof(#name)], \
+ 0, 0, NULL, LIST_HEAD_INIT(__ktrace_##name.list) }; \
+ __ktrace_check_format(format, ## args); \
+ if (unlikely(ktrace_enabled) && \
+ unlikely(__ktrace_##name.enabled)) { \
+ struct ktrace_struct_##name karg = { args }; \
+ (*__ktrace_##name.func) \
+ (&__ktrace_##name, &karg, sizeof(karg)); \
+ } \
+ } while (0)
+
+#else /* !CONFIG_KTRACE */
+#define DO_TRACE(name, format, args...)
+#endif
+
+/* To be used for string format validity checking with gcc */
+static inline void __printf(1, 2) ___ktrace_check_format(const char *fmt, ...)
+{
+}
+
+#define __ktrace_check_format(format, args...) \
+ do { \
+ if (0) \
+ ___ktrace_check_format(format, ## args); \
+ } while (0)
+
+
+int init_ktrace(void);
+void exit_ktrace(void);
+
+/* get the trace buffer information */
+#define KTRACE_BUF_GET_SIZE _IOR(0xF5, 0x00,__u32)
+#define KTRACE_BUF_GET_PRODUCED _IOR(0xF5, 0x01,__u32)
+#define KTRACE_BUF_GET_CONSUMED _IOR(0xF5, 0x02,__u32)
+#define KTRACE_BUF_PUT_PRODUCED _IOW(0xF5, 0x03,__u32)
+#define KTRACE_BUF_PUT_CONSUMED _IOW(0xF5, 0x04,__u32)
+
+#endif
Index: linux-2.6.26.2/kernel/module.c
===================================================================
--- linux-2.6.26.2.orig/kernel/module.c 2008-11-25 11:32:26.000000000 -0800
+++ linux-2.6.26.2/kernel/module.c 2008-11-25 11:32:37.000000000 -0800
@@ -1770,6 +1770,8 @@
unsigned int unusedgplcrcindex;
unsigned int markersindex;
unsigned int markersstringsindex;
+ unsigned int ktraceindex;
+ unsigned int ktracestringsindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2050,6 +2052,10 @@
markersstringsindex = find_sec(hdr, sechdrs, secstrings,
"__markers_strings");

+ ktraceindex = find_sec(hdr, sechdrs, secstrings, "__ktrace");
+ ktracestringsindex = find_sec(hdr, sechdrs, secstrings,
+ "__ktrace_strings");
+
/* Now do relocations. */
for (i = 1; i < hdr->e_shnum; i++) {
const char *strtab = (char *)sechdrs[strindex].sh_addr;
@@ -2077,6 +2083,11 @@
sechdrs[markersindex].sh_size / sizeof(*mod->markers);
#endif

+#ifdef CONFIG_KTRACE
+ mod->ktrace = (void *)sechdrs[ktraceindex].sh_addr;
+ mod->num_ktrace =
+ sechdrs[ktraceindex].sh_size / sizeof(*mod->ktrace);
+#endif
/* Find duplicate symbols */
err = verify_export_symbols(mod);

@@ -2099,6 +2110,7 @@
marker_update_probe_range(mod->markers,
mod->markers + mod->num_markers);
#endif
+
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
Index: linux-2.6.26.2/include/linux/module.h
===================================================================
--- linux-2.6.26.2.orig/include/linux/module.h 2008-11-25
11:32:26.000000000 -0800
+++ linux-2.6.26.2/include/linux/module.h 2008-11-25
11:32:37.000000000 -0800
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/marker.h>
+#include <linux/ktrace.h>
#include <asm/local.h>

#include <asm/module.h>
@@ -342,6 +343,11 @@
struct marker *markers;
unsigned int num_markers;
#endif
+#ifdef CONFIG_KTRACE
+ struct kernel_trace *ktrace;
+ unsigned int num_ktrace;
+#endif
+
};
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
Index: linux-2.6.26.2/kernel/ktrace/ktrace-mod.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.26.2/kernel/ktrace/ktrace-mod.c 2008-11-25
11:32:37.000000000 -0800
@@ -0,0 +1,41 @@
+/*
+ * kernel/ktrace-mod.c
+ *
+ * the kernel tracing module wrapper
+ *
+ * Copyright 2008- Google Inc.
+ * Original Author:
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitmap.h>
+#include <linux/rcupdate.h>
+#include <linux/relay.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/linkage.h>
+#include <linux/debugfs.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/jhash.h>
+#include <linux/ktrace.h>
+#include <linux/ring_buffer.h>
+
+static int __init init_ktrace_module(void)
+{
+ return init_ktrace();
+}
+
+static void __exit exit_ktrace_module(void)
+{
+ exit_ktrace();
+}
+
+module_init(init_ktrace_module);
+module_exit(exit_ktrace_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jiaying Zhang <[email protected]>");
+MODULE_DESCRIPTION("Kernel Tracing Module");
+
Index: linux-2.6.26.2/kernel/ktrace/ktrace.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.26.2/kernel/ktrace/ktrace.c 2008-11-25
16:34:31.000000000 -0800
@@ -0,0 +1,658 @@
+/*
+ * kernel/ktrace.c
+ *
+ * Implementation of the kernel tracing for linux kernel 2.6.
+ *
+ * Copyright 2008- Google Inc.
+ * Original Author: Jiaying Zhang
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitmap.h>
+#include <linux/poll.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/linkage.h>
+#include <linux/debugfs.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/jhash.h>
+#include <linux/ktrace.h>
+#include <linux/ring_buffer.h>
+
+int ktrace_enabled = 0;
+EXPORT_SYMBOL(ktrace_enabled);
+static uint16_t ktrace_next_id = 0;
+
+extern struct kernel_trace __start___ktraces[];
+extern struct kernel_trace __stop___ktraces[];
+
+static DEFINE_MUTEX(ktrace_mutex);
+
+/* ktrace hash table entry structure and variables */
+struct ktrace_entry {
+ struct hlist_node hlist;
+ char *name;
+ char *format;
+ ktrace_probe_func *func;
+ u32 enabled:1, event_id:31;
+ struct list_head klist; /* list of loaded ktraces */
+};
+
+#define KTRACE_HASH_BITS 6
+#define KTRACE_HASH_SIZE (1 << KTRACE_HASH_BITS)
+static struct hlist_head ktrace_table[KTRACE_HASH_SIZE];
+
+/* debugfs directory variables */
+static struct dentry *tracedir = NULL;
+static struct dentry *eventdir = NULL;
+
+/* ring buffer code */
+static unsigned long trace_buf_size = 16384UL;
+struct ring_buffer *trace_buffer = NULL;
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* waitqueue for trace_data_poll */
+static struct timer_list trace_timer; /* reader wake-up timer */
+#define TRACEREAD_WAKEUP_INTERVAL 1000
+static struct kref ktrace_kref;
+
+/*
+static void
+ring_buffer_probe(struct kernel_trace *kt, void *data, size_t event_size)
+{
+ struct ring_buffer_event *event;
+ void *trace_event;
+ unsigned long flag;
+
+
+ event = __ring_buffer_lock_reserve(trace_buffer,
+ sizeof(uint16_t) + event_size);
+ trace_event = ring_buffer_event_data(event);
+ *(uint16_t *) trace_event = kt->event_id;
+ memcpy(trace_event + sizeof(uint16_t), data, event_size);
+ __ring_buffer_unlock_commit(trace_buffer, event);
+}
+*/
+
+static void
+ring_buffer_probe(struct kernel_trace *kt, void *data, size_t event_size)
+{
+ struct ring_buffer_event *event;
+ void *trace_event;
+ unsigned long flag;
+
+
+ event = ring_buffer_lock_reserve(trace_buffer,
+ sizeof(uint16_t) + event_size, &flag);
+ trace_event = ring_buffer_event_data(event);
+ *(uint16_t *) trace_event = kt->event_id;
+ memcpy(trace_event + sizeof(uint16_t), data, event_size);
+ ring_buffer_unlock_commit(trace_buffer, event, flag);
+}
+
+/* timer function used to defer ring buffer reader waking */
+static void wakeup_readers(unsigned long data)
+{
+ wake_up_interruptible(&trace_wait);
+ mod_timer(&trace_timer, jiffies + TRACEREAD_WAKEUP_INTERVAL);
+}
+
+/* function for reading ktrace metadata info */
+static void *tracing_info_start(struct seq_file *seq, loff_t *pos)
+{
+ return (*pos >= KTRACE_HASH_SIZE) ? NULL : &ktrace_table[*pos];
+}
+
+static void *tracing_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return (*pos >= KTRACE_HASH_SIZE) ? NULL : &ktrace_table[*pos];
+}
+
+static int tracing_info_show(struct seq_file *seq, void *v)
+{
+ struct hlist_head *head = v;
+ struct hlist_node *node;
+ struct ktrace_entry *entry;
+ hlist_for_each_entry(entry, node, head, hlist)
+ seq_printf(seq, "name '%s' format '%s' id %u %s\n",
+ entry->name, entry->format, entry->event_id,
+ entry->enabled ? "enabled" : "disabled");
+ return 0;
+}
+
+static void tracing_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations tracing_info_seq_ops = {
+ .start = tracing_info_start,
+ .next = tracing_info_next,
+ .show = tracing_info_show,
+ .stop = tracing_info_stop,
+};
+
+static int tracing_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &tracing_info_seq_ops);
+}
+
+static struct file_operations traceinfo_file_operations = {
+ .open = tracing_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* functions for reading/writing the global 'enabled' control file */
+static ssize_t tracing_control_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int ret;
+ char buf[64];
+ ret = sprintf(buf, "%u\n", ktrace_enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, ret);
+}
+
+static ssize_t tracing_control_write(struct file *filp, const char
__user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int ret;
+ char buf[64];
+ unsigned long val;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ if (val ^ ktrace_enabled) {
+ if (val) {
+ set_kernel_trace_flag_all_tasks();
+ trace_timer.expires =
+ jiffies + TRACEREAD_WAKEUP_INTERVAL;
+ add_timer(&trace_timer);
+ ktrace_enabled = 1;
+ } else {
+ clear_kernel_trace_flag_all_tasks();
+ ktrace_enabled = 0;
+ del_timer_sync(&trace_timer);
+ }
+ }
+
+ filp->f_pos += cnt;
+ return cnt;
+}
+
+static struct file_operations tracecontrol_file_operations = {
+ .read = tracing_control_read,
+ .write = tracing_control_write,
+};
+
+/* functions for reading/writing a trace event file */
+static int update_ktrace(struct ktrace_entry *entry,
+ ktrace_probe_func *func, int enabled)
+{
+ struct kernel_trace *iter;
+ /* no need to update the list if the tracing is not initialized */
+ if (!tracedir)
+ return 0;
+ entry->enabled = enabled;
+ entry->func = func;
+ list_for_each_entry(iter, &entry->klist, list) {
+ iter->func = func;
+ iter->enabled = enabled;
+ }
+ return 0;
+}
+
+static int ktrace_probe_register(struct ktrace_entry *entry,
+ ktrace_probe_func *probefunc)
+{
+ return update_ktrace(entry, probefunc, 1);
+}
+
+static int ktrace_probe_unregister(struct ktrace_entry *entry)
+{
+ return update_ktrace(entry, NULL, 0);
+}
+
+static int ktrace_control_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+static ssize_t ktrace_control_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int ret;
+ char buf[64];
+ struct ktrace_entry *entry = filp->private_data;
+
+ ret = sprintf(buf, "%u\n", entry->enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, ret);
+}
+
+static ssize_t ktrace_control_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int ret;
+ char buf[64];
+ unsigned long val;
+ struct ktrace_entry *entry = filp->private_data;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+ val = !!val;
+
+ mutex_lock(&ktrace_mutex);
+ if (val ^ entry->enabled) {
+ if (val)
+ ktrace_probe_register(entry, ring_buffer_probe);
+ else
+ ktrace_probe_unregister(entry);
+ }
+ mutex_unlock(&ktrace_mutex);
+ return cnt;
+}
+
+static struct file_operations ktracecontrol_file_operations = {
+ .open = ktrace_control_open,
+ .read = ktrace_control_read,
+ .write = ktrace_control_write,
+};
+
+/* functions for adding/removing a trace event. Protected by mutex lock.
+ * Called during initialization or after loading a module */
+static struct ktrace_entry * add_ktrace(struct kernel_trace *ktrace)
+{
+ struct ktrace_entry *entry;
+ size_t name_len = strlen(ktrace->name) + 1;
+ u32 hash = jhash(ktrace->name, name_len-1, 0);
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct dentry *dentry;
+
+ if (!tracedir)
+ return 0;
+ head = &ktrace_table[hash & ((1 << KTRACE_HASH_BITS)-1)];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (!strcmp(ktrace->name, entry->name)) {
+ if (strcmp(ktrace->format, entry->format)) {
+ printk(KERN_WARNING "the format of tracepoint "
+ "\'%s\' changes from \'%s\' to \'%s\'."
+ "Please re-load ktrace module first!\n",
+ ktrace->name, entry->format,
+ ktrace->format);
+ return ERR_PTR(-EINVAL);
+ }
+ ktrace->enabled = entry->enabled;
+ ktrace->func = entry->func;
+ if (list_empty(&entry->klist))
+ goto add_head;
+ list_add_tail(&ktrace->list, &entry->klist);
+ return entry;
+ }
+ }
+ entry = kmalloc(sizeof(struct ktrace_entry), GFP_KERNEL);
+ if (!entry)
+ return ERR_PTR(-ENOMEM);
+ entry->name = kmalloc(name_len, GFP_KERNEL);
+ if (!entry->name)
+ return ERR_PTR(-ENOMEM);
+ memcpy(entry->name, ktrace->name, name_len);
+ if (ktrace->format) {
+ size_t format_len = strlen(ktrace->format) + 1;
+ entry->format = kmalloc(format_len, GFP_KERNEL);
+ if (!entry->format)
+ return ERR_PTR(-ENOMEM);
+ memcpy(entry->format, ktrace->format, format_len);
+ } else
+ entry->format = NULL;
+ entry->func = ktrace->func;
+ entry->enabled = 0;
+ entry->event_id = ktrace_next_id ++;
+ INIT_LIST_HEAD(&entry->klist);
+ hlist_add_head(&entry->hlist, head);
+add_head:
+ list_add(&ktrace->list, &entry->klist);
+ dentry = debugfs_create_file(entry->name, 0444, eventdir,
+ entry, &ktracecontrol_file_operations);
+ if (!dentry)
+ printk("Couldn't create debugfs '%s' entry\n", entry->name);
+ return entry;
+}
+
+static int remove_ktrace(struct kernel_trace *ktrace, int free)
+{
+ struct ktrace_entry *entry;
+ size_t name_len = strlen(ktrace->name) + 1;
+ u32 hash = jhash(ktrace->name, name_len-1, 0);
+ struct hlist_head *head;
+ struct hlist_node *node, *temp;
+ struct dentry *dentry;
+
+ if (!tracedir)
+ return 0;
+ list_del(&ktrace->list);
+ head = &ktrace_table[hash & ((1 << KTRACE_HASH_BITS)-1)];
+ hlist_for_each_entry_safe(entry, node, temp, head, hlist)
+ if (!strcmp(ktrace->name, entry->name)) {
+ if (list_empty(&entry->klist)) {
+ dentry = lookup_one_len(entry->name,
+ eventdir, strlen(entry->name));
+ if (dentry && !IS_ERR(dentry)) {
+ debugfs_remove(dentry);
+ dput(dentry);
+ }
+ entry->enabled = 0;
+ if (free) {
+ hlist_del(&entry->hlist);
+ kfree(entry->name);
+ kfree(entry->format);
+ kfree(entry);
+ }
+ }
+ return 0;
+ }
+ return -1;
+}
+
+/* Add/remove tracepoints contained in a module to the ktrace hash table.
+ * Called at the end of module load/unload. */
+static int ktrace_module_notify(struct notifier_block * self,
+ unsigned long val, void * data)
+{
+#ifdef CONFIG_MODULES
+ struct kernel_trace *iter;
+ struct module *mod = data;
+
+ if (val != MODULE_STATE_COMING && val != MODULE_STATE_GOING)
+ return 0;
+ mutex_lock(&ktrace_mutex);
+ for (iter = mod->ktrace; iter < mod->ktrace + mod->num_ktrace; iter ++)
+ if (val == MODULE_STATE_COMING)
+ add_ktrace(iter);
+ else
+ remove_ktrace(iter, 0);
+ mutex_unlock(&ktrace_mutex);
+#endif
+ return 0;
+}
+
+static struct notifier_block ktrace_module_nb = {
+ .notifier_call = ktrace_module_notify,
+};
+
+/* functions for user-space programs to read data from tracing buffer */
+static int trace_data_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+ kref_get(&ktrace_kref);
+ return 0;
+}
+
+static void release_trace_buffer(struct kref *kref)
+{
+ ring_buffer_free(trace_buffer);
+ trace_buffer = NULL;
+}
+
+static int trace_data_release(struct inode *inode, struct file *filp)
+{
+ kref_put(&ktrace_kref, release_trace_buffer);
+ return 0;
+}
+
+static unsigned int trace_data_poll(struct file *filp, poll_table *poll_table)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = filp->private_data;
+ if (!ktrace_enabled)
+ return POLLERR;
+ if (filp->f_mode & FMODE_READ) {
+ while (1) {
+ poll_wait(filp, &trace_wait, poll_table);
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return POLLIN | POLLRDNORM;
+ }
+ }
+ return 0;
+}
+
+static int trace_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = vma->vm_private_data;
+ pgoff_t pgoff = vmf->pgoff;
+ struct page *page;
+
+ if (!trace_buffer)
+ return VM_FAULT_OOM;
+ if ((page = ring_buffer_get_page(cpu_buffer, pgoff)) == NULL)
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+}
+
+static struct vm_operations_struct trace_data_mmap_ops = {
+ .fault = trace_buf_fault,
+};
+
+static int trace_data_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ unsigned long length = vma->vm_end - vma->vm_start;
+ struct ring_buffer_per_cpu *cpu_buffer = filp->private_data;
+
+ if (length != ring_buffer_size(trace_buffer))
+ return -EINVAL;
+ vma->vm_ops = &trace_data_mmap_ops;
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_private_data = cpu_buffer;
+ return 0;
+}
+
+static int trace_data_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ u32 __user *argp = (u32 __user *)arg;
+ struct ring_buffer_per_cpu *cpu_buffer = filp->private_data;
+
+ if (!trace_buffer || !tracedir)
+ return -EINVAL;
+ switch (cmd) {
+ case KTRACE_BUF_GET_SIZE:
+ {
+ unsigned long bufsize;
+ bufsize = ring_buffer_size(trace_buffer);
+ return put_user((u32)bufsize, argp);
+ }
+ case KTRACE_BUF_GET_PRODUCED:
+ return
put_user((u32)ring_buffer_get_produced(cpu_buffer), argp);
+ case KTRACE_BUF_GET_CONSUMED:
+ return
put_user((u32)ring_buffer_get_consumed(cpu_buffer), argp);
+ case KTRACE_BUF_PUT_CONSUMED:
+ {
+ u32 consumed, consumed_old;
+ unsigned long bufsize = ring_buffer_size(trace_buffer);
+ int ret;
+
+ ret = get_user(consumed, argp);
+ if (ret) {
+ printk("error getting consumed value: %d\n", ret);
+ return ret;
+ }
+ consumed_old = ring_buffer_get_consumed(cpu_buffer);
+ if (consumed == consumed_old)
+ return 0;
+ ring_buffer_advance_reader(cpu_buffer, consumed - consumed_old);
+ break;
+ }
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return 0;
+}
+
+static struct file_operations tracedata_file_operations = {
+ .open = trace_data_open,
+ .poll = trace_data_poll,
+ .mmap = trace_data_mmap,
+ .ioctl = trace_data_ioctl,
+ .release = trace_data_release,
+};
+
+static int create_debugfs(void)
+{
+ struct dentry *entry, *bufdir;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ char *tmpname;
+ int cpu;
+
+ tracedir = debugfs_create_dir("ktrace", NULL);
+ if (!tracedir) {
+ printk("Couldn't create debugfs directory\n");
+ return -ENOMEM;
+ }
+
+ bufdir = debugfs_create_dir("buffers", tracedir);
+ if (!bufdir) {
+ printk("Couldn't create 'buffers' directory\n");
+ return -ENOMEM;
+ }
+
+ tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ snprintf(tmpname, NAME_MAX, "%s%d", "cpu", cpu);
+ cpu_buffer = ring_buffer_cpu(trace_buffer, cpu);
+ entry = debugfs_create_file(tmpname, 0444, bufdir, cpu_buffer,
+ &tracedata_file_operations);
+ if (!entry) {
+ printk("Couldn't create debugfs \'%s\' entry\n",
+ tmpname);
+ return -ENOMEM;
+ }
+ }
+ kfree(tmpname);
+
+ entry = debugfs_create_file("enabled", 0444, tracedir, NULL,
+ &tracecontrol_file_operations);
+ if (!entry) {
+ printk("Couldn't create debugfs 'enabled' entry\n");
+ return -ENOMEM;
+ }
+
+ entry = debugfs_create_file("info", 0222, tracedir, NULL,
+ &traceinfo_file_operations);
+ if (!entry) {
+ printk("Couldn't create debugfs 'info' entry\n");
+ return -ENOMEM;
+ }
+
+ eventdir = debugfs_create_dir("events", tracedir);
+ if (!eventdir) {
+ printk("Couldn't create 'events' directory\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static int remove_debugfs(void)
+{
+ struct dentry *dentry, *bufdir;
+ char *tmpname;
+ int cpu;
+
+ bufdir = lookup_one_len("buffers", tracedir, strlen("buffers"));
+ if (!bufdir || IS_ERR(bufdir))
+ return -EIO;
+ tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ snprintf(tmpname, NAME_MAX, "%s%d", "cpu", cpu);
+ dentry = lookup_one_len(tmpname, bufdir, strlen(tmpname));
+ if (dentry && !IS_ERR(dentry)) {
+ debugfs_remove(dentry);
+ dput(dentry);
+ }
+ }
+ kfree(tmpname);
+ debugfs_remove(bufdir);
+
+ dentry = lookup_one_len("enabled", tracedir, strlen("enabled"));
+ if (dentry && !IS_ERR(dentry)) {
+ debugfs_remove(dentry);
+ dput(dentry);
+ }
+ dentry = lookup_one_len("info", tracedir, strlen("info"));
+ if (dentry && !IS_ERR(dentry)) {
+ debugfs_remove(dentry);
+ dput(dentry);
+ }
+ debugfs_remove(eventdir);
+ debugfs_remove(tracedir);
+ eventdir = NULL;
+ tracedir = NULL;
+ return 0;
+}
+
+int init_ktrace(void)
+{
+ struct kernel_trace *iter;
+ int i, err;
+
+ ktrace_next_id = 0;
+ kref_set(&ktrace_kref, 1);
+ trace_buffer = ring_buffer_alloc(trace_buf_size, RB_FL_OVERWRITE);
+ if (trace_buffer == NULL)
+ return -ENOMEM;
+ setup_timer(&trace_timer, wakeup_readers, 0);
+ for (i = 0; i < KTRACE_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&ktrace_table[i]);
+
+ mutex_lock(&ktrace_mutex);
+ if ((err = create_debugfs()) != 0)
+ goto out;
+ for (iter = __start___ktraces; iter < __stop___ktraces; iter++)
+ add_ktrace(iter);
+ err = register_module_notifier(&ktrace_module_nb);
+out:
+ mutex_unlock(&ktrace_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(init_ktrace);
+
+void exit_ktrace(void)
+{
+ struct kernel_trace *iter;
+
+ if (ktrace_enabled) {
+ clear_kernel_trace_flag_all_tasks();
+ ktrace_enabled = 0;
+ del_timer_sync(&trace_timer);
+ }
+ mutex_lock(&ktrace_mutex);
+ for (iter = __start___ktraces; iter < __stop___ktraces; iter++)
+ remove_ktrace(iter, 1);
+ unregister_module_notifier(&ktrace_module_nb);
+ remove_debugfs();
+ mutex_unlock(&ktrace_mutex);
+
+ kref_put(&ktrace_kref, release_trace_buffer);
+}
+EXPORT_SYMBOL_GPL(exit_ktrace);


2008-12-03 04:35:32

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype


On Tue, 2008-12-02 at 16:26 -0800, Jiaying Zhang wrote:
> Refer to the previous email
> "[RFC PATCH 0/3] A couple of feature requests to the unified trace
> buffer".
>
> A kernel tracing prototype that uses the unified trace buffer to keep
> the collected trace data.
>

> Index: linux-2.6.26.2/include/linux/ktrace.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6.26.2/include/linux/ktrace.h 2008-11-25
> 11:32:37.000000000 -0800
> @@ -0,0 +1,73 @@
> +#ifndef _LINUX_KTRACE_H
> +#define _LINUX_KTRACE_H
> +
> +#include <linux/types.h>
> +
> +struct kernel_trace;
> +
> +typedef void ktrace_probe_func(struct kernel_trace *, void *,
> size_t);
> +
> +struct kernel_trace {
> + const char *name;
> + const char *format; /* format string describing variable
> list */
> + u32 enabled:1, event_id:31;
> + ktrace_probe_func *func; /* probe function */
> + struct list_head list; /* list head linked to the hash table
> entry */
> +} __attribute__((aligned(8)));
> +
> +extern int ktrace_enabled;
> +
> +/*
> + * Make sure the alignment of the structure in the __ktrace section
> will
> + * not add unwanted padding between the beginning of the section and
> the
> + * structure. Force alignment to the same alignment as the section
> start.
> + */
> +
> +#define DEFINE_KTRACE_STRUCT(name) struct ktrace_struct_##name
> +
> +#ifdef CONFIG_KTRACE
> +#define DO_TRACE(name, format, args...)
> \
> + do {
> \
> + static const char __kstrtab_##name[]
> \
> + __attribute__((section("__ktrace_strings")))
> \
> + = #name "\0" format;
> \
> + static struct kernel_trace __ktrace_##name
> \
> + __attribute__((section("__ktrace"), aligned(8))) =
> \
> + { __kstrtab_##name, &__kstrtab_##name[sizeof(#name)],
> \
> + 0, 0, NULL, LIST_HEAD_INIT(__ktrace_##name.list) };
> \
> + __ktrace_check_format(format, ## args);
> \
> + if (unlikely(ktrace_enabled) &&
> \
> + unlikely(__ktrace_##name.enabled))
> { \
> + struct ktrace_struct_##name karg = { args };
> \
> + (*__ktrace_##name.func)
> \
> + (&__ktrace_##name, &karg, sizeof(karg));
> \
> + }
> \

This looks like another form of markers/trace-points. Why not use them?

> + } while (0)
> +
> +#else /* !CONFIG_KTRACE */
> +#define DO_TRACE(name, format, args...)
> +#endif
> +

-- Steve

2008-12-03 05:30:25

by Jiaying Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype

On Tue, Dec 2, 2008 at 8:35 PM, Steven Rostedt <[email protected]> wrote:
>
> On Tue, 2008-12-02 at 16:26 -0800, Jiaying Zhang wrote:
>> Refer to the previous email
>> "[RFC PATCH 0/3] A couple of feature requests to the unified trace
>> buffer".
>>
>> A kernel tracing prototype that uses the unified trace buffer to keep
>> the collected trace data.
>>
>
>> Index: linux-2.6.26.2/include/linux/ktrace.h
>> ===================================================================
>> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
>> +++ linux-2.6.26.2/include/linux/ktrace.h 2008-11-25
>> 11:32:37.000000000 -0800
>> @@ -0,0 +1,73 @@
>> +#ifndef _LINUX_KTRACE_H
>> +#define _LINUX_KTRACE_H
>> +
>> +#include <linux/types.h>
>> +
>> +struct kernel_trace;
>> +
>> +typedef void ktrace_probe_func(struct kernel_trace *, void *,
>> size_t);
>> +
>> +struct kernel_trace {
>> + const char *name;
>> + const char *format; /* format string describing variable
>> list */
>> + u32 enabled:1, event_id:31;
>> + ktrace_probe_func *func; /* probe function */
>> + struct list_head list; /* list head linked to the hash table
>> entry */
>> +} __attribute__((aligned(8)));
>> +
>> +extern int ktrace_enabled;
>> +
>> +/*
>> + * Make sure the alignment of the structure in the __ktrace section
>> will
>> + * not add unwanted padding between the beginning of the section and
>> the
>> + * structure. Force alignment to the same alignment as the section
>> start.
>> + */
>> +
>> +#define DEFINE_KTRACE_STRUCT(name) struct ktrace_struct_##name
>> +
>> +#ifdef CONFIG_KTRACE
>> +#define DO_TRACE(name, format, args...)
>> \
>> + do {
>> \
>> + static const char __kstrtab_##name[]
>> \
>> + __attribute__((section("__ktrace_strings")))
>> \
>> + = #name "\0" format;
>> \
>> + static struct kernel_trace __ktrace_##name
>> \
>> + __attribute__((section("__ktrace"), aligned(8))) =
>> \
>> + { __kstrtab_##name, &__kstrtab_##name[sizeof(#name)],
>> \
>> + 0, 0, NULL, LIST_HEAD_INIT(__ktrace_##name.list) };
>> \
>> + __ktrace_check_format(format, ## args);
>> \
>> + if (unlikely(ktrace_enabled) &&
>> \
>> + unlikely(__ktrace_##name.enabled))
>> { \
>> + struct ktrace_struct_##name karg = { args };
>> \
>> + (*__ktrace_##name.func)
>> \
>> + (&__ktrace_##name, &karg, sizeof(karg));
>> \
>> + }
>> \
>
> This looks like another form of markers/trace-points. Why not use them?
Yes. It borrows many ideas from the marker code. But we don't intend
to be as general as markers and lttng, which allows us to make many
simplifications that we think will give us performance benefits.

Jiaying
>
>> + } while (0)
>> +
>> +#else /* !CONFIG_KTRACE */
>> +#define DO_TRACE(name, format, args...)
>> +#endif
>> +
>
> -- Steve
>
>
>

2008-12-05 08:05:10

by Jiaying Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype

To better answer the question why we want to implement a new kernel
tracing prototype, here are some performance results we collected before
with the tbench benchmark. We were considering to use the existing
marker code then and had built a simple prototype that allows us to measure
the performance overhead from different layers.

- vanilla 2.6.26 kernel, CONFIG_MARKERS=n
Throughput 759.352 MB/sec 4
- markers compiled in, tracing disabled
Throughput 754.18 MB/sec 4
- tracing syscall entry/exit, use markers, not logging data to ring_buffer
Throughput 715.68 MB/sec 4
- tracing syscall entry/exit, use markers, logging data to ring_buffer
Throughput 654.056 MB/sec 4
- tracing syscall entry/exit, bypass markers, not logging data to ring_buffer
Throughput 735.575 MB/sec 4
- tracing syscall entry/exit, bypass markers, logging data to ring_buffer
Throughput 667.68 MB/sec 4

We found that the marker layer alone adds about 3% overhead while
tracing is enabled. This overhead is mostly from the extra code and
function calls added by the markers code. In our use case, we don't
need several features in the current marker code, like the support for
multiple probe functions and the use of va_arg to allow arbitrary trace
format, but we have very strict performance requirements. So we
implemented our own trace instrumentation code that has many
similarities to the markers but is simpler and thus, we think,
easier to make more efficient.

We also found that logging data to the unified trace buffer adds about
10% overhead. After taking a close look, we found that there are two
extra overheads. One is the extra function calls introduced by the
unified trace buffer. That is why we are considering adding inline
trace write functions to the unified trace buffer. The other is the overhead
on obtaining the timestamps. I saw that the unified trace buffer code
uses sched_clock to obtain timestamps. For some reason, this function
alone adds about 2~3% overhead on my testing machine.

The above numbers were measured on one multi-core system.
I am going to run more performance tests on more systems and will
keep you updated.

Jiaying

On Tue, Dec 2, 2008 at 9:30 PM, Jiaying Zhang <[email protected]> wrote:
>
> On Tue, Dec 2, 2008 at 8:35 PM, Steven Rostedt <[email protected]> wrote:
> >
> > On Tue, 2008-12-02 at 16:26 -0800, Jiaying Zhang wrote:
> >> Refer to the previous email
> >> "[RFC PATCH 0/3] A couple of feature requests to the unified trace
> >> buffer".
> >>
> >> A kernel tracing prototype that uses the unified trace buffer to keep
> >> the collected trace data.
> >>
> >
> >> Index: linux-2.6.26.2/include/linux/ktrace.h
> >> ===================================================================
> >> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> >> +++ linux-2.6.26.2/include/linux/ktrace.h 2008-11-25
> >> 11:32:37.000000000 -0800
> >> @@ -0,0 +1,73 @@
> >> +#ifndef _LINUX_KTRACE_H
> >> +#define _LINUX_KTRACE_H
> >> +
> >> +#include <linux/types.h>
> >> +
> >> +struct kernel_trace;
> >> +
> >> +typedef void ktrace_probe_func(struct kernel_trace *, void *,
> >> size_t);
> >> +
> >> +struct kernel_trace {
> >> + const char *name;
> >> + const char *format; /* format string describing variable
> >> list */
> >> + u32 enabled:1, event_id:31;
> >> + ktrace_probe_func *func; /* probe function */
> >> + struct list_head list; /* list head linked to the hash table
> >> entry */
> >> +} __attribute__((aligned(8)));
> >> +
> >> +extern int ktrace_enabled;
> >> +
> >> +/*
> >> + * Make sure the alignment of the structure in the __ktrace section
> >> will
> >> + * not add unwanted padding between the beginning of the section and
> >> the
> >> + * structure. Force alignment to the same alignment as the section
> >> start.
> >> + */
> >> +
> >> +#define DEFINE_KTRACE_STRUCT(name) struct ktrace_struct_##name
> >> +
> >> +#ifdef CONFIG_KTRACE
> >> +#define DO_TRACE(name, format, args...)
> >> \
> >> + do {
> >> \
> >> + static const char __kstrtab_##name[]
> >> \
> >> + __attribute__((section("__ktrace_strings")))
> >> \
> >> + = #name "\0" format;
> >> \
> >> + static struct kernel_trace __ktrace_##name
> >> \
> >> + __attribute__((section("__ktrace"), aligned(8))) =
> >> \
> >> + { __kstrtab_##name, &__kstrtab_##name[sizeof(#name)],
> >> \
> >> + 0, 0, NULL, LIST_HEAD_INIT(__ktrace_##name.list) };
> >> \
> >> + __ktrace_check_format(format, ## args);
> >> \
> >> + if (unlikely(ktrace_enabled) &&
> >> \
> >> + unlikely(__ktrace_##name.enabled))
> >> { \
> >> + struct ktrace_struct_##name karg = { args };
> >> \
> >> + (*__ktrace_##name.func)
> >> \
> >> + (&__ktrace_##name, &karg, sizeof(karg));
> >> \
> >> + }
> >> \
> >
> > This looks like another form of markers/trace-points. Why not use them?
> Yes. It borrows many ideas from the marker code. But we don't intend
> to be as general as markers and lttng, which allows us to make many
> simplifications and we think will give us performance benefits.
>
> Jiaying
> >
> >> + } while (0)
> >> +
> >> +#else /* !CONFIG_KTRACE */
> >> +#define DO_TRACE(name, format, args...)
> >> +#endif
> >> +
> >
> > -- Steve
> >
> >
> >

2008-12-05 15:35:37

by Frank Ch. Eigler

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype

Jiaying Zhang <[email protected]> writes:

> To better answer the question why we want to implement a new kernel
> tracing prototype, here are some performance results we collected before
> with the tbench benchmark.

Thanks.

> - vanilla 2.6.26 kernel, CONFIG_MARKERS=n
> Throughput 759.352 MB/sec 4
> - markers compiled in, tracing disabled
> Throughput 754.18 MB/sec 4

Is your kernel built with -freorder-blocks? This option dramatically
reduces the cost of inactive markers/tracepoints.

> - tracing syscall entry/exit, use markers, not logging data to ring_buffer
> Throughput 715.68 MB/sec 4
> - tracing syscall entry/exit, use markers, logging data to ring_buffer
> Throughput 654.056 MB/sec 4

(By the way, how are you doing syscall entry/exit tracing?)


- FChE

2008-12-05 15:38:01

by Martin Bligh

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype

>> - vanilla 2.6.26 kernel, CONFIG_MARKERS=n
>> Throughput 759.352 MB/sec 4
>> - markers compiled in, tracing disabled
>> Throughput 754.18 MB/sec 4
>
> Is your kernel built with -freorder-blocks? This option dramatically
> reduces the cost of inactive markers/tracepoints.

I'm curious why we wouldn't turn that on by default in the makefile?
At least if CONFIG_MARKERS is on.

2008-12-05 18:36:48

by Jiaying Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 3/3] kernel tracing prototype

On Fri, Dec 5, 2008 at 7:34 AM, Frank Ch. Eigler <[email protected]> wrote:
> Jiaying Zhang <[email protected]> writes:
>
>> To better answer the question why we want to implement a new kernel
>> tracing prototype, here are some performance results we collected before
>> with the tbench benchmark.
>
> Thanks.
>
>> - vanilla 2.6.26 kernel, CONFIG_MARKERS=n
>> Throughput 759.352 MB/sec 4
>> - markers compiled in, tracing disabled
>> Throughput 754.18 MB/sec 4
>
> Is your kernel built with -freorder-blocks? This option dramatically
> reduces the cost of inactive markers/tracepoints.
No. I will try this flag and let you know if I see any
performance difference.
Thanks for your suggestion!

>
>> - tracing syscall entry/exit, use markers, not logging data to ring_buffer
>> Throughput 715.68 MB/sec 4
>> - tracing syscall entry/exit, use markers, logging data to ring_buffer
>> Throughput 654.056 MB/sec 4
>
> (By the way, how are you doing syscall entry/exit tracing?)
The way we trace syscall entry/exit is similar to the approach
used in LTTng. When tracing is enabled, a special flag,
TIF_KERNEL_TRACE, is set so that syscall entry and exit call the
special functions syscall_trace_enter and syscall_trace_exit, which
were patched to call trace_mark in our old prototype.

Jiaying

>
>
> - FChE
>