2008-10-28 04:13:01

by Steven Rostedt

Subject: [PATCH][RFC] trace: profile likely and unlikely annotations


Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hits and misses of that condition
are recorded. These numbers can later be retrieved by:

/debugfs/tracing/profile_likely - All likely markers
/debugfs/tracing/profile_unlikely - All unlikely markers.

# cat /debug/tracing/profile_unlikely | \
awk '{ if ((($1 + 1.0)/ ($2 + 1.0)) > 0.5) { print $0; }}' |head -15
Hit Missed Function
--- ------ --------
36278 44749 __switch_to+0x129/0x55d
70064 10964 __switch_to+0xcd/0x55d
70060 10964 __switch_to+0x63/0x55d
2628 0 syscall_trace_leave+0x0/0x154
1 0 native_smp_prepare_cpus+0x3e9/0x485
287349 98809 place_entity+0xac/0x11c
141308 236748 calc_delta_mine+0x8b/0x109
55091 0 enqueue_task+0x0/0x9b
71724 52117 pick_next_task_fair+0x0/0xc1
11 5 yield_task_fair+0x0/0x13b
1269 0 pre_schedule_rt+0x0/0x98
46392 1265 pick_next_task_rt+0x0/0x12f
51523 0 dequeue_task+0x57/0x114

# cat /debug/tracing/profile_likely | \
awk '{ if ((($2 + 1.0)/ ($1 + 1.0)) > 0.5) { print $0; }}'
Hit Missed Function
--- ------ --------
43316 37700 schedule+0x6e7/0xdb2
23032 12997 schedule+0xa9b/0xdb2
0 7 __cancel_work_timer+0x4b/0x292
885 1683 update_wall_time+0x27b/0x4fd
0 369948 audit_syscall_exit+0x131/0x403
19 370851 audit_free+0x0/0x27e
0 368981 audit_syscall_entry+0x1a4/0x226
19 900 audit_free+0xf0/0x27e
0 973 audit_alloc+0xa6/0x13c
0 6 move_masked_irq+0x2b/0x174
12878 223023 page_evictable+0x9a/0x1f1
9 7 flush_cpu_slab+0x0/0x143
634634 324186 kmem_cache_free+0x79/0x139
3142 61979 dentry_lru_del_init+0x0/0x89
948 1166 load_elf_binary+0xa00/0x18ad
102 82 disk_part_iter_next+0x0/0xf1
0 11 tcp_mss_split_point+0x0/0xbd


As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 60 entries
where the hit ratio was more than 50%.
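
For quick triage, the worst offenders can also be ranked from userspace.
Here is a rough sketch against the two-column output above (for
profile_unlikely, column 1 over the total is the fraction of times the
hint was wrong):

# cat /debug/tracing/profile_unlikely | \
awk 'NR > 2 { t = $1 + $2; if (t) printf "%3d%% %s\n", $1 * 100 / t, $3 }' | \
sort -rn | head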

(*) Not every unlikely is recorded; those that are used by vsyscalls
(a few of them) had to be marked as unlikely_notrace().

Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86/kernel/vsyscall_64.c | 5 -
include/asm-generic/vmlinux.lds.h | 14 +++
include/linux/compiler.h | 38 ++++++++
include/linux/seqlock.h | 3
kernel/trace/Kconfig | 16 +++
kernel/trace/Makefile | 2
kernel/trace/trace_unlikely.c | 163 ++++++++++++++++++++++++++++++++++++++
7 files changed, 237 insertions(+), 4 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile 2008-10-27 23:12:08.000000000 -0400
@@ -26,4 +26,6 @@ obj-$(CONFIG_FUNCTION_PROFILER) += trace
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-27 23:12:44.000000000 -0400
@@ -124,7 +124,8 @@ static __always_inline void do_vgettimeo
seq = read_seqbegin(&__vsyscall_gtod_data.lock);

vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled ||
+ !vread)) {
gettimeofday(tv,NULL);
return;
}
@@ -165,7 +166,7 @@ time_t __vsyscall(1) vtime(time_t *t)
{
struct timeval tv;
time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
+ if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);

vgettimeofday(&tv, NULL);
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-27 23:12:08.000000000 -0400
@@ -45,6 +45,17 @@
#define MCOUNT_REC()
#endif

+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
+ *(_ftrace_likely) \
+ VMLINUX_SYMBOL(__stop_likely_profile) = .; \
+ VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+ *(_ftrace_unlikely) \
+ VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
/* .data section */
#define DATA_DATA \
*(.data) \
@@ -62,7 +73,8 @@
VMLINUX_SYMBOL(__stop___markers) = .; \
VMLINUX_SYMBOL(__start___tracepoints) = .; \
*(__tracepoints) \
- VMLINUX_SYMBOL(__stop___tracepoints) = .;
+ VMLINUX_SYMBOL(__stop___tracepoints) = .; \
+ LIKELY_PROFILE()

#define RO_DATA(align) \
. = ALIGN((align)); \
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h 2008-10-28 00:00:50.000000000 -0400
@@ -59,8 +59,46 @@ extern void __chk_io_ptr(const volatile
* specific implementations come from the above header files
*/

+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+ unsigned long ip;
+ unsigned long hit;
+ unsigned long missed;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val);
+
+#define likely_notrace(x) __builtin_expect(!!(x), 1)
+#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
+
+/* Can't use _THIS_IP_ because it screws up header dependencies */
+#define __THIS_IP__ ({ __label__ __here; __here: (unsigned long)&&__here; })
+
+#define likely(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data ______f \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_likely"))); \
+ if (unlikely_notrace(!______f.ip)) \
+ ______f.ip = __THIS_IP__; \
+ ______r = likely_notrace(x); \
+ ftrace_likely_update(&______f, ______r); \
+ ______r; \
+ })
+#define unlikely(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data ______f \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_unlikely"))); \
+ if (unlikely_notrace(!______f.ip)) \
+ ______f.ip = __THIS_IP__; \
+ ______r = unlikely_notrace(x); \
+ ftrace_likely_update(&______f, ______r); \
+ ______r; \
+ })
+#else
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif

/* Optimization barrier */
#ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig 2008-10-27 23:12:08.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )

+config TRACE_UNLIKELY_PROFILE
+ bool "Trace likely/unlikely profiler"
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer profiles all the likely and unlikely macros
+ in the kernel. It will display the results in:
+
+ /debugfs/tracing/profile_likely
+ /debugfs/tracing/profile_unlikely
+
+ Note: this will add a significant overhead, only turn this
+ on if you need to profile the system's use of these macros.
+
+ Say N if unsure.
+
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-27 23:13:53.000000000 -0400
@@ -0,0 +1,163 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <[email protected]>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val)
+{
+ /* FIXME: Make this atomic! */
+ if (val)
+ f->hit++;
+ else
+ f->missed++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+ void *start;
+ void *stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_pointer *f = m->private;
+ struct ftrace_likely_data *p = v;
+
+ (*pos)++;
+
+ if (v == (void *)1)
+ return f->start;
+
+ ++p;
+
+ if ((void *)p >= (void *)f->stop)
+ return NULL;
+
+ return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ void *t = (void *)1;
+ loff_t l = 0;
+
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int trace_lookup_sym(struct seq_file *m, unsigned long addr)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(str, addr);
+
+ return seq_printf(m, "%s\n", str);
+#else
+ return seq_printf(m, "%p\n", (void *)addr);
+#endif
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+
+ if (v == (void *)1) {
+ seq_printf(m, " Hit Missed Function\n"
+ " --- ------ --------\n");
+ return 0;
+ }
+
+ /* skip null records */
+ if (!p->ip)
+ return 0;
+
+ seq_printf(m, " %8lu %8lu ", p->hit, p->missed);
+ trace_lookup_sym(m, p->ip);
+ return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &tracing_likely_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = (void *)inode->i_private;
+ }
+
+ return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+ .open = tracing_likely_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+ .start = __start_likely_profile,
+ .stop = __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+ .start = __start_unlikely_profile,
+ .stop = __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+ &ftrace_likely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+ entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+ &ftrace_unlikely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs"
+ " 'profile_unlikely' entry\n");
+
+ return 0;
+}
+
+device_initcall(ftrace_unlikely_init);
Index: linux-tip.git/include/linux/seqlock.h
===================================================================
--- linux-tip.git.orig/include/linux/seqlock.h 2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/linux/seqlock.h 2008-10-27 23:12:08.000000000 -0400
@@ -90,7 +90,8 @@ static __always_inline unsigned read_seq
repeat:
ret = sl->sequence;
smp_rmb();
- if (unlikely(ret & 1)) {
+ /* Used in vsyscall, need notrace version */
+ if (unlikely_notrace(ret & 1)) {
cpu_relax();
goto repeat;
}


2008-10-28 04:23:38

by Arjan van de Ven

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations

On Tue, 28 Oct 2008 00:12:48 -0400 (EDT)
Steven Rostedt <[email protected]> wrote:

>
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.


alternatively we remove likely/unlikely ;)

or maybe only ban it from drivers or something.


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-10-28 04:41:00

by Andrew Morton

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations

On Tue, 28 Oct 2008 00:12:48 -0400 (EDT) Steven Rostedt <[email protected]> wrote:

> (*) Not every unlikely is recorded; those that are used by vsyscalls
> (a few of them) had to be marked as unlikely_notrace().

Yes, it took continuous maintenance to weed this stuff out of
profile-likely-unlikely-macros.patch.

I'd suggest that you take a peek at this patch and make sure that
you've addressed all the sites which had to be fixed up while
maintaining this one.

(I've been maintaining this for 2-1/2 years. Sniff)


From: Daniel Walker <[email protected]>

-ENOCHANGELOG!

Creates /proc/likely_prof.

[[email protected]: profile_likely: export do_check_likely]
[[email protected]: fixes and general maintenance]
[[email protected]: likely_prof changed to use proc_create]
[[email protected]: likeliness accounting change and cleanup]
[[email protected]: proc: remove proc_root from drivers]
[[email protected]: update to test_and_set_bit_lock / clear_bit_unlock]
[[email protected]: likely-profiling: disable ftrace]
Signed-off-by: Daniel Walker <[email protected]>
Signed-off-by: Hua Zhong <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Roel Kluin <[email protected]>
Signed-off-by: Alexey Dobriyan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86/kernel/vsyscall_64.c | 2
arch/x86/vdso/vclock_gettime.c | 2
include/linux/compiler.h | 36 +++++++
lib/Kconfig.debug | 8 +
lib/Makefile | 5 +
lib/likely_prof.c | 150 +++++++++++++++++++++++++++++++
6 files changed, 203 insertions(+)

diff -puN arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros arch/x86/kernel/vsyscall_64.c
--- a/arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros
+++ a/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,8 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/

+#define SUPPRESS_LIKELY_PROFILING
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff -puN arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros arch/x86/vdso/vclock_gettime.c
--- a/arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros
+++ a/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,8 @@
* Also alternative() doesn't work.
*/

+#define SUPPRESS_LIKELY_PROFILING
+
#include <linux/kernel.h>
#include <linux/posix-timers.h>
#include <linux/time.h>
diff -puN include/linux/compiler.h~profile-likely-unlikely-macros include/linux/compiler.h
--- a/include/linux/compiler.h~profile-likely-unlikely-macros
+++ a/include/linux/compiler.h
@@ -53,6 +53,41 @@ extern void __chk_io_ptr(const volatile
# include <linux/compiler-intel.h>
#endif

+#if defined(CONFIG_PROFILE_LIKELY) && !defined(SUPPRESS_LIKELY_PROFILING) && \
+ !(defined(CONFIG_MODULE_UNLOAD) && defined(MODULE))
+struct likeliness {
+ char *const file;
+ unsigned long caller;
+ unsigned int count[2];
+ struct likeliness *next;
+ unsigned int label;
+};
+
+extern int do_check_likely(struct likeliness *likeliness, unsigned int exp);
+
+#define LP_IS_EXPECTED 1
+#define LP_UNSEEN 4
+#define LP_LINE_SHIFT 3
+
+#define __check_likely(exp, is_likely) \
+ ({ \
+ static struct likeliness likeliness = { \
+ .file = __FILE__, \
+ .label = __LINE__ << LP_LINE_SHIFT | \
+ LP_UNSEEN | is_likely, \
+ }; \
+ do_check_likely(&likeliness, !!(exp)); \
+ })
+
+/*
+ * We check for constant values with __builtin_constant_p() since
+ * it's not interesting to profile them, and there is a compiler
+ * bug in gcc 3.x which blows up during constant evaluation when
+ * CONFIG_PROFILE_LIKELY is turned on.
+ */
+#define likely(x) (__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 1))
+#define unlikely(x) (__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 0))
+#else
/*
* Generic compiler-dependent macros required for kernel
* build go below this comment. Actual compiler/compiler version
@@ -61,6 +96,7 @@ extern void __chk_io_ptr(const volatile

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif

/* Optimization barrier */
#ifndef barrier
diff -puN lib/Kconfig.debug~profile-likely-unlikely-macros lib/Kconfig.debug
--- a/lib/Kconfig.debug~profile-likely-unlikely-macros
+++ a/lib/Kconfig.debug
@@ -568,6 +568,14 @@ config DEBUG_SYNCHRO_TEST

See Documentation/synchro-test.txt.

+config PROFILE_LIKELY
+ bool "Record return values from likely/unlikely macros"
+ default n
+ help
+ Adds profiling on likely/unlikely macros. To see the
+ results of the profiling you can view the following,
+ /proc/likely_prof
+
config BOOT_PRINTK_DELAY
bool "Delay each boot printk message by N milliseconds"
depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
diff -puN lib/Makefile~profile-likely-unlikely-macros lib/Makefile
--- a/lib/Makefile~profile-likely-unlikely-macros
+++ a/lib/Makefile
@@ -22,6 +22,9 @@ obj-y += bcd.o div64.o sort.o parser.o h
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
string_helpers.o

+# likely profiling can cause recursion in ftrace, so don't trace it.
+CFLAGS_REMOVE_likely_prof.o = -pg
+
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
CFLAGS_kobject_uevent.o += -DDEBUG
@@ -82,6 +85,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += sys

obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o

+obj-$(CONFIG_PROFILE_LIKELY) += likely_prof.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h

diff -puN /dev/null lib/likely_prof.c
--- /dev/null
+++ a/lib/likely_prof.c
@@ -0,0 +1,150 @@
+/*
+ * This code should enable profiling the likely and unlikely macros.
+ *
+ * Output goes in /proc/likely_prof
+ *
+ * Authors:
+ * Daniel Walker <[email protected]>
+ * Hua Zhong <[email protected]>
+ * Andrew Morton <[email protected]>
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/kallsyms.h>
+
+#include <asm/bug.h>
+#include <asm/atomic.h>
+
+static struct likeliness *likeliness_head;
+
+int do_check_likely(struct likeliness *likeliness, unsigned int ret)
+{
+ static unsigned long likely_lock;
+
+ likeliness->count[ret]++;
+
+ if (likeliness->label & LP_UNSEEN) {
+ /*
+ * We don't simply use a spinlock because internally to the
+ * spinlock there is a call to unlikely which causes recursion.
+ * We opted for this method because we didn't need a preempt/irq
+ * disable and it was a bit cleaner than using internal __raw
+ * spinlock calls.
+ */
+ if (!test_and_set_bit_lock(0, &likely_lock)) {
+ if (likeliness->label & LP_UNSEEN) {
+ likeliness->label &= (~LP_UNSEEN);
+ likeliness->next = likeliness_head;
+ likeliness_head = likeliness;
+ likeliness->caller = (unsigned long)
+ __builtin_return_address(0);
+ }
+ clear_bit_unlock(0, &likely_lock);
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(do_check_likely);
+
+static void * lp_seq_start(struct seq_file *out, loff_t *pos)
+{
+
+ if (!*pos) {
+
+ seq_printf(out, "Likely Profiling Results\n");
+ seq_printf(out, " --------------------------------------------"
+ "------------------------\n");
+ seq_printf(out, "[+- ]Type | # True | # False | Function@"
+ "Filename:Line\n");
+
+ out->private = likeliness_head;
+ }
+
+ return out->private;
+}
+
+static void *lp_seq_next(struct seq_file *out, void *p, loff_t *pos)
+{
+ struct likeliness *entry = p;
+
+ if (entry->next) {
+ ++(*pos);
+ out->private = entry->next;
+ } else
+ out->private = NULL;
+
+ return out->private;
+}
+
+static int lp_seq_show(struct seq_file *out, void *p)
+{
+ struct likeliness *entry = p;
+ unsigned int pos = entry->count[1];
+ unsigned int neg = entry->count[0];
+ char function[KSYM_SYMBOL_LEN];
+
+ /*
+ * Balanced if the suggestion was false in less than 5% of the tests
+ */
+ if (!(entry->label & LP_IS_EXPECTED)) {
+ if (pos + neg < 20 * pos)
+ seq_printf(out, "+");
+ else
+ seq_printf(out, " ");
+
+ seq_printf(out, "unlikely ");
+ } else {
+ if (pos + neg < 20 * neg)
+ seq_printf(out, "-");
+ else
+ seq_printf(out, " ");
+
+ seq_printf(out, "likely ");
+ }
+
+ sprint_symbol(function, entry->caller);
+ seq_printf(out, "|%9u|%9u|\t%s@%s:%u\n", pos, neg, function,
+ entry->file, entry->label >> LP_LINE_SHIFT);
+
+ return 0;
+}
+
+static void lp_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static struct seq_operations likely_profiling_ops = {
+ .start = lp_seq_start,
+ .next = lp_seq_next,
+ .stop = lp_seq_stop,
+ .show = lp_seq_show
+};
+
+static int lp_results_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &likely_profiling_ops);
+}
+
+static struct file_operations proc_likely_operations = {
+ .open = lp_results_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init init_likely(void)
+{
+ struct proc_dir_entry *entry =
+ proc_create("likely_prof", 0, NULL, &proc_likely_operations);
+ if (!entry)
+ return 1;
+
+ return 0;
+}
+__initcall(init_likely);
_

2008-10-28 14:37:34

by Theodore Ts'o

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations

On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
>
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.

Maybe I'm confused, but when I read through the patch, it looks like
that 'hit' is incremented whenever the condition is true, and 'missed'
is incremented whenever the condition is false, correct?

Is that what you intended? So for profile_unlikely, "missed" is good,
and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
is bad. That seems horribly confusing.

If that wasn't what you intended, the meaning of "hit" and "missed"
seems to be highly confusing, either way. Can we perhaps use some
other terminology? Simply using "True" and "False" would be better,
since there's no possible confusion what the labels mean.

> +#define unlikely(x) ({ \
> + int ______r; \
> + static struct ftrace_likely_data ______f \
> + __attribute__((__aligned__(4))) \
> + __attribute__((section("_ftrace_unlikely"))); \
> + if (unlikely_notrace(!______f.ip)) \
> + ______f.ip = __THIS_IP__; \
> + ______r = unlikely_notrace(x); \
> + ftrace_likely_update(&______f, ______r); \
> + ______r; \
> + })

Note that unlikely(x) calls ftrace_likely_update(), which does this:

> +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
> +{
> + /* FIXME: Make this atomic! */
> + if (val)
> + f->hit++;
> + else
> + f->missed++;
> +}
> +EXPORT_SYMBOL(ftrace_likely_update);


So that seems to mean that if unlikely(x) is false, then _____r is 0,
which means we increment f->missed. Or am I missing something?

I would have thought that if unlikely(x) is false, that's *good*,
since it means the unlikely label was correct. And normally, when
people think about cache hits vs cache misses, hits are good and
misses are bad. Which is why I think the terminology is highly
confusing...

- Ted

2008-10-28 14:48:18

by Arjan van de Ven

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations

On Tue, 28 Oct 2008 10:37:20 -0400
Theodore Tso <[email protected]> wrote:

> On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> >
> > Andrew Morton recently suggested having an in-kernel way to profile
> > likely and unlikely macros. This patch achieves that goal.
>
> Maybe I'm confused, but when I read through the patch, it looks like
> that 'hit' is incremented whenever the condition is true, and 'missed'
> is incremented whenever the condition is false, correct?
>
> Is that what you intended? So for profile_unlikely, "missed" is good,
> and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> is bad. That seems horribly confusing.
>
> If that wasn't what you intended, the meaning of "hit" and "missed"
> seems to be highly confusing, either way. Can we perhaps use some
> other terminology? Simply using "True" and "False" would be better,
> since there's no possible confusion what the labels mean.

or "correct" and "incorrect"



--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-10-28 14:56:48

by Steven Rostedt

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations


On Tue, 28 Oct 2008, Theodore Tso wrote:

> On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> >
> > Andrew Morton recently suggested having an in-kernel way to profile
> > likely and unlikely macros. This patch achieves that goal.
>
> Maybe I'm confused, but when I read through the patch, it looks like
> that 'hit' is incremented whenever the condition is true, and 'missed'
> is incremented whenever the condition is false, correct?

Correct.

>
> Is that what you intended? So for profile_unlikely, "missed" is good,
> and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> is bad. That seems horribly confusing.

Correct. Yeah, I figured I'd get complaints about this (hence the RFC).
If you look at my awk example, you will also notice that I switched the
$1 and $2 around when reading the other file.

This can be confusing either way. I did this to reuse the code for both
outputs.

>
> If that wasn't what you intended, the meaning of "hit" and "missed"
> seems to be highly confusing, either way. Can we perhaps use some
> other terminology? Simply using "True" and "False" would be better,
> since there's no possible confusion what the labels mean.

So renaming 'hit' and 'miss' to 'True' and 'False' would be good enough?
That is, it will still mean that a 'True' is bad for unlikely but good for
a likely?

>
> > +#define unlikely(x) ({ \
> > + int ______r; \
> > + static struct ftrace_likely_data ______f \
> > + __attribute__((__aligned__(4))) \
> > + __attribute__((section("_ftrace_unlikely"))); \
> > + if (unlikely_notrace(!______f.ip)) \
> > + ______f.ip = __THIS_IP__; \
> > + ______r = unlikely_notrace(x); \
> > + ftrace_likely_update(&______f, ______r); \
> > + ______r; \
> > + })
>
> Note that unlikely(x) calls ftrace_likely_update(), which does this:
>
> > +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
> > +{
> > + /* FIXME: Make this atomic! */
> > + if (val)
> > + f->hit++;
> > + else
> > + f->missed++;
> > +}
> > +EXPORT_SYMBOL(ftrace_likely_update);
>
>
> So that seems to mean that if unlikely(x) is false, then _____r is 0,
> which means we increment f->missed. Or am I missing something?
>
> I would have thought that if unlikely(x) is false, that's *good*,
> since it means the unlikely label was correct. And normally, when
> people think about cache hits vs cache misses, hits are good and
> misses are bad. Which is why I think the terminology is highly
> confusing...

OK, I'm fine with changing the terminology. v2 will do:

s/hit/True/
s/missed/False/

Fine with you?

-- Steve

2008-10-28 18:30:10

by Theodore Ts'o

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations

On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
>
> OK, I'm fine with changing the terminology. v2 will do:
>
> s/hit/True/
> s/missed/False/
>
> Fine with you?

I'm OK with either that, or with Arjan's suggestion of "Correct" and
"Incorrect" --- although that would changing a line in the definition
of #define unlikely(x):

ftrace_likely_update(&______f, !______r);

Either "True" / "False" or "Correct" / "Incorrect" has the advantage
of being unambiguous. "Correct" / "Incorrect" has the advantage that
people don't have to think about the fact that for
/proc/profile_unlikely, high numbers of "False" is a good thing, whereas
for /proc/profile_likely, high numbers of "True" is a good thing.
With "Correct" / "Incorrect" it's easier to understand that high
numbers of "Correct" is good.

So I can see why Arjan suggested Correct/Incorrect, although I can
live with either.

- Ted

2008-10-28 18:41:32

by Steven Rostedt

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations


On Tue, 28 Oct 2008, Theodore Tso wrote:

> On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
> >
> > OK, I'm fine with changing the terminology. v2 will do:
> >
> > s/hit/True/
> > s/missed/False/
> >
> > Fine with you?
>
> I'm OK with either that, or with Arjan's suggestion of "Correct" and
> "Incorrect" --- although that would changing a line in the definition
> of #define unlikely(x):
>
> ftrace_likely_update(&______f, !______r);
>
> Either "True" / "False" or "Correct" / "Incorrect" has the advantage
> of being unambiguous. "Correct" / "Incorrect" has the advantage that
> people don't have to think about the fact that for
> /proc/profile_unlikely, high numbers of "False" is a good thing, whereas
> for /proc/profile_likely, high numbers of "True" is a good thing.
> With "Correct" / "Incorrect" it's easier to understand that high
> numbers of "Correct" is good.
>
> So I can see why Arjan suggested Correct/Incorrect, although I can
> live with either.

I'm almost done with v2. Using Arjan's version (Correct/Incorrect) was
actually a trivial change.

-- Steve

2008-10-28 19:23:28

by Steven Rostedt

Subject: Re: [PATCH][RFC] trace: profile likely and unlikely annotations


On Tue, 28 Oct 2008, Arjan van de Ven wrote:

> On Tue, 28 Oct 2008 10:37:20 -0400
> Theodore Tso <[email protected]> wrote:
>
> > On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> > >
> > > Andrew Morton recently suggested having an in-kernel way to profile
> > > likely and unlikely macros. This patch achieves that goal.
> >
> > Maybe I'm confused, but when I read through the patch, it looks like
> > that 'hit' is incremented whenever the condition is true, and 'missed'
> > is incremented whenever the condition is false, correct?
> >
> > Is that what you intended? So for profile_unlikely, "missed" is good,
> > and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> > is bad. That seems horribly confusing.
> >
> > If that wasn't what you intended, the meaning of "hit" and "missed"
> > seems to be highly confusing, either way. Can we perhaps use some
> > other terminology? Simply using "True" and "False" would be better,
> > since there's no possible confusion what the labels mean.
>
> or "correct" and "incorrect"

This means that the code will need to be different for both. Or at least a
way to differentiate between the two. Not that hard, but I wanted to make
the code as trivial as possible.

-- Steve

2008-10-29 16:35:58

by Steven Rostedt

Subject: [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations


Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hits and misses of that condition
are recorded. These numbers can later be retrieved by:

/debugfs/tracing/profile_likely - All likely markers
/debugfs/tracing/profile_unlikely - All unlikely markers.

# cat /debug/tracing/profile_unlikely | \
awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}' | \
head -20
correct incorrect Function File Line
------- --------- -------- ---- ----
46894 38216 __switch_to process_64.c 624
11951 73138 __switch_to process_64.c 594
11951 73097 __switch_to process_64.c 590
117873 35049 need_resched sched.h 2103
1345 711 syscall_get_error syscall.h 51
0 2510 syscall_trace_leave ptrace.c 1568
0 1 native_smp_prepare_cpus smpboot.c 1231
91635 276125 calc_delta_fair sched_fair.c 411
219028 113449 calc_delta_mine sched.c 1279
0 58048 sched_info_queued sched_stats.h 222
55449 71591 pick_next_task_fair sched_fair.c 1394
10 5 yield_task_fair sched_fair.c 983
7117 2553 rt_policy sched.c 144
0 1220 pre_schedule_rt sched_rt.c 1260
1216 52402 pick_next_task_rt sched_rt.c 884
0 48009 sched_info_dequeued sched_stats.h 177
0 14 sched_move_task sched.c 8707
0 14 sched_move_task sched.c 8697


# cat /debug/tracing/profile_likely | \
awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}'
correct incorrect Function File Line
------- --------- -------- ---- ----
42987 40240 pick_next_task sched.c 4407
21419 18103 switch_mm mmu_context_64.h 18
0 7 __cancel_work_timer workqueue.c 560
861 3960 clocksource_adjust timekeeping.c 456
390 1913 clocksource_adjust timekeeping.c 450
0 363683 audit_syscall_exit auditsc.c 1570
30 364573 audit_get_context auditsc.c 732
0 362744 audit_syscall_entry auditsc.c 1541
30 879 audit_free auditsc.c 1446
0 952 audit_alloc auditsc.c 862
2606 952 audit_alloc auditsc.c 858
0 6 move_masked_irq migration.c 20
4465 2091 __grab_cache_page filemap.c 2212
9495 2787 iov_iter_advance filemap.c 1884
3121 899 __set_page_dirty page-writeback.c 1134
12874 221371 mapping_unevictable pagemap.h 50
4 10 __flush_cpu_slab slub.c 1466
635239 320760 slab_free slub.c 1731
3105 61055 dentry_lru_del_init dcache.c 153
942 1156 load_elf_binary binfmt_elf.c 904
102 82 disk_put_part genhd.h 206
0 1 tcp_mss_split_point tcp_output.c 1126



As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 76 entries
where the incorrect ratio was more than 25%.

Note: After submitting my first version of this patch, Andrew Morton
showed me a version written by Daniel Walker, from which I picked up
the following ideas:

1) Using __builtin_constant_p to avoid profiling fixed values.
2) Using __FILE__ instead of instruction pointers.
3) Using the preprocessor to stop all profiling of likely
annotations from vsyscall_64.c.

Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
for their feedback on this patch.

(*) Not every unlikely is recorded; those that are used by vsyscalls
(a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86/kernel/vsyscall_64.c | 8 +
include/asm-generic/vmlinux.lds.h | 14 +++
include/linux/compiler.h | 61 ++++++++++++++
kernel/trace/Kconfig | 16 +++
kernel/trace/Makefile | 2
kernel/trace/trace_unlikely.c | 157 ++++++++++++++++++++++++++++++++++++++
6 files changed, 255 insertions(+), 3 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 11:30:05.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile 2008-10-29 12:32:40.000000000 -0400
@@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-29 11:30:05.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-29 11:30:12.000000000 -0400
@@ -17,6 +17,14 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/

+/* Protect userspace from profiling */
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+# undef likely
+# undef unlikely
+# define likely(x) likely_notrace(x)
+# define unlikely(x) unlikely_notrace(x)
+#endif
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-29 11:30:06.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-29 11:30:12.000000000 -0400
@@ -45,6 +45,17 @@
#define MCOUNT_REC()
#endif

+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
+ *(_ftrace_likely) \
+ VMLINUX_SYMBOL(__stop_likely_profile) = .; \
+ VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+ *(_ftrace_unlikely) \
+ VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
/* .data section */
#define DATA_DATA \
*(.data) \
@@ -62,7 +73,8 @@
VMLINUX_SYMBOL(__stop___markers) = .; \
VMLINUX_SYMBOL(__start___tracepoints) = .; \
*(__tracepoints) \
- VMLINUX_SYMBOL(__stop___tracepoints) = .;
+ VMLINUX_SYMBOL(__stop___tracepoints) = .; \
+ LIKELY_PROFILE()

#define RO_DATA(align) \
. = ALIGN((align)); \
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h 2008-10-29 11:30:06.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h 2008-10-29 11:32:05.000000000 -0400
@@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile
* specific implementations come from the above header files
*/

-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+ const char *func;
+ const char *file;
+ unsigned line;
+ unsigned long correct;
+ unsigned long incorrect;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+#define likely_notrace(x) __builtin_expect(!!(x), 1)
+#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
+
+#define likely_check(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_likely"))) \
+ ______f = { \
+ .func = __func__, \
+ .file = __FILE__, \
+ .line = __LINE__, \
+ }; \
+ ______f.line = __LINE__; \
+ ______r = likely_notrace(x); \
+ ftrace_likely_update(&______f, ______r, 1); \
+ ______r; \
+ })
+#define unlikely_check(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_unlikely"))) \
+ ______f = { \
+ .func = __func__, \
+ .file = __FILE__, \
+ .line = __LINE__, \
+ }; \
+ ______f.line = __LINE__; \
+ ______r = unlikely_notrace(x); \
+ ftrace_likely_update(&______f, ______r, 0); \
+ ______r; \
+ })
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same. This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x))
+# endif
+# ifndef unlikely
+# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+# endif
+#else
+# define likely(x) __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#endif

/* Optimization barrier */
#ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 11:30:10.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 12:32:40.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )

+config TRACE_UNLIKELY_PROFILE
+ bool "Trace likely/unlikely profiler"
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer profiles all the likely and unlikely macros
+ in the kernel. It will display the results in:
+
+ /debugfs/tracing/profile_likely
+ /debugfs/tracing/profile_unlikely
+
+ Note: this will add a significant overhead, only turn this
+ on if you need to profile the system's use of these macros.
+
+ Say N if unsure.
+
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 12:32:40.000000000 -0400
@@ -0,0 +1,157 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <[email protected]>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+ /* FIXME: Make this atomic! */
+ if (val == expect)
+ f->correct++;
+ else
+ f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+ void *start;
+ void *stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_pointer *f = m->private;
+ struct ftrace_likely_data *p = v;
+
+ (*pos)++;
+
+ if (v == (void *)1)
+ return f->start;
+
+ ++p;
+
+ if ((void *)p >= (void *)f->stop)
+ return NULL;
+
+ return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ void *t = (void *)1;
+ loff_t l = 0;
+
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+ const char *f;
+
+ if (v == (void *)1) {
+ seq_printf(m, " correct incorrect"
+ " Function "
+ " File Line\n"
+ " ------- ---------"
+ " -------- "
+ " ---- ----\n");
+ return 0;
+ }
+
+ /* Only print the file, not the path */
+ f = p->file + strlen(p->file);
+ while (f >= p->file && *f != '/')
+ f--;
+ f++;
+
+ seq_printf(m, " %8lu %8lu ", p->correct, p->incorrect);
+ seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+ return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &tracing_likely_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = (void *)inode->i_private;
+ }
+
+ return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+ .open = tracing_likely_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+ .start = __start_likely_profile,
+ .stop = __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+ .start = __start_unlikely_profile,
+ .stop = __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+ &ftrace_likely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+ entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+ &ftrace_unlikely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs"
+ " 'profile_unlikely' entry\n");
+
+ return 0;
+}
+
+device_initcall(ftrace_unlikely_init);

2008-10-29 16:38:32

by Steven Rostedt

Subject: [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer


This patch adds a way to record the instances of the likely and unlikely
condition annotations.

When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
will be added to any of the ftrace tracers.

For example:

irqbalance-2662 [002] 41127.102858: [correct] debug_mutex_free_waiter 51
<idle>-0 [001] 41127.102858: [correct] add_preempt_count 4307
irqbalance-2662 [002] 41127.102859: [INCORRECT] debug_smp_processor_id 15
<idle>-0 [001] 41127.102860: [correct] add_preempt_count 4316
irqbalance-2662 [002] 41127.102861: [correct] slab_alloc 1608

This shows the normal tracer heading, as well as whether the condition was
correct ("[correct]") or mistaken ("[INCORRECT]"), followed by the
function and line number.
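
A minimal way to exercise this (just a sketch, assuming debugfs is mounted
at /debug and a tracer such as sched_switch is available):

# echo unlikely > /debug/tracing/iter_ctrl
# echo sched_switch > /debug/tracing/current_tracer
# echo 1 > /debug/tracing/tracing_enabled
# cat /debug/tracing/trace | head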


Signed-off-by: Steven Rostedt <[email protected]>
---
kernel/Makefile | 5 +
kernel/trace/Kconfig | 22 +++++++
kernel/trace/Makefile | 5 +
kernel/trace/ring_buffer.c | 11 +++
kernel/trace/trace.c | 37 +++++++++++
kernel/trace/trace.h | 43 +++++++++++++
kernel/trace/trace_unlikely.c | 130 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 253 insertions(+)

Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 12:32:35.000000000 -0400
@@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE

Say N if unsure.

+config TRACING_UNLIKELY
+ bool
+ help
+ Selected by tracers that will trace the likely and unlikely
+ conditions. This prevents the tracers themselves from being
+ profiled. Profiling the tracing infrastructure can only happen
+ when the likelys and unlikelys are not being traced.
+
+config UNLIKELY_TRACER
+ bool "Trace likely/unlikely instances"
+ depends on TRACE_UNLIKELY_PROFILE
+ select TRACING_UNLIKELY
+ help
+ This traces the events of likely and unlikely condition
+ calls in the kernel. The difference between this and the
+ "Trace likely/unlikely profiler" is that this is not a
+ histogram of the callers, but actually places the calling
+ events into a running buffer to see when and where the
+ events happened, as well as their results.
+
+ Say N if unsure.
+
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/Makefile
===================================================================
--- linux-tip.git.orig/kernel/Makefile 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/Makefile 2008-10-29 12:32:35.000000000 -0400
@@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg
endif

+ifdef CONFIG_TRACING_UNLIKELY
+# Do not profile tracepoint if we are tracing the unlikely conditions
+CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile 2008-10-29 12:32:35.000000000 -0400
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
obj-y += trace_selftest_dynamic.o
endif

+ifdef CONFIG_TRACING_UNLIKELY
+KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
+KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace_unlikely.c 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 12:32:35.000000000 -0400
@@ -15,8 +15,138 @@
#include <asm/local.h>
#include "trace.h"

+#ifdef CONFIG_UNLIKELY_TRACER
+
+static int unlikely_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(unlikely_tracing_mutex);
+static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
+static struct trace_array *unlikely_tracer;
+
+void trace_unlikely_stop(void)
+{
+ /* preemption should be disabled */
+ int cpu = smp_processor_id();
+
+ atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+void trace_unlikely_start(void)
+{
+ /* preemption should be disabled */
+ int cpu = smp_processor_id();
+
+ atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+static void
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+ struct trace_array *tr = unlikely_tracer;
+ struct ring_buffer_event *event;
+ struct trace_unlikely *entry;
+ unsigned long flags, irq_flags;
+ int cpu, resched, pc;
+
+ if (unlikely(!tr))
+ return;
+
+ pc = preempt_count();
+ /* Can be called within schedule */
+ resched = need_resched();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
+ goto out;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+
+ local_save_flags(flags);
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_UNLIKELY;
+
+ /*
+ * I would love to save just the ftrace_likely_data pointer, but
+ * this code can also be used by modules. Ugly things can happen
+ * if the module is unloaded, and then we go and read the
+ * pointer. This is slower, but much safer.
+ */
+
+ strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->file, f->file, TRACE_FILE_SIZE);
+ entry->func[TRACE_FUNC_SIZE] = 0;
+ entry->file[TRACE_FILE_SIZE] = 0;
+ entry->line = f->line;
+ entry->correct = val == expect;
+
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+ atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+ if (!unlikely_tracing_enabled)
+ return;
+
+ probe_likely_condition(f, val, expect);
+}
+
+int enable_unlikely_tracing(struct trace_array *tr)
+{
+ int ret = 0;
+
+ mutex_lock(&unlikely_tracing_mutex);
+ unlikely_tracer = tr;
+ /*
+ * Must be seen before enabling. The reader is a condition
+ * where we do not need a matching rmb()
+ */
+ smp_wmb();
+ unlikely_tracing_enabled++;
+ mutex_unlock(&unlikely_tracing_mutex);
+
+ return ret;
+}
+
+void disable_unlikely_tracing(void)
+{
+ mutex_lock(&unlikely_tracing_mutex);
+
+ if (!unlikely_tracing_enabled)
+ goto out_unlock;
+
+ unlikely_tracing_enabled--;
+
+ out_unlock:
+ mutex_unlock(&unlikely_tracing_mutex);
+}
+#else
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
{
+ /*
+ * I would love to have a trace point here instead, but the
+ * trace point code is so inundated with unlikely and likely
+ * conditions that the recursive nightmare that exists is too
+ * much to try to get working. At least for now.
+ */
+ trace_likely_condition(f, val, expect);
+
/* FIXME: Make this atomic! */
if (val == expect)
f->correct++;
Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c 2008-10-29 12:32:35.000000000 -0400
@@ -235,6 +235,9 @@ static const char *trace_options[] = {
"stacktrace",
"sched-tree",
"ftrace_printk",
+#ifdef CONFIG_UNLIKELY_TRACER
+ "unlikely",
+#endif
NULL
};

@@ -1457,6 +1460,17 @@ print_lat_fmt(struct trace_iterator *ite
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ struct trace_unlikely *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %-30.30s %d\n",
+ field->correct ? "correct" : "INCORRECT",
+ field->func,
+ field->line);
+ break;
+ }
default:
trace_seq_printf(s, "Unknown type %d\n", entry->type);
}
@@ -1593,6 +1607,17 @@ static enum print_line_t print_trace_fmt
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ struct trace_unlikely *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %-30.30s %d\n",
+ field->correct ? "correct" : "INCORRECT",
+ field->func,
+ field->line);
+ break;
+ }
}
return TRACE_TYPE_HANDLED;
}
@@ -1675,6 +1700,10 @@ static enum print_line_t print_raw_fmt(s
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ /* IGNORE */
+ break;
+ };
}
return TRACE_TYPE_HANDLED;
}
@@ -2362,8 +2391,14 @@ tracing_ctrl_write(struct file *filp, co

tr->ctrl = val;

+ if (!val)
+ trace_unlikely_disable();
+
if (current_trace && current_trace->ctrl_update)
current_trace->ctrl_update(tr);
+
+ if (val)
+ trace_unlikely_enable(tr);
}
mutex_unlock(&trace_types_lock);

@@ -2425,6 +2460,7 @@ tracing_set_trace_write(struct file *fil
if (t == current_trace)
goto out;

+ trace_unlikely_disable();
if (current_trace && current_trace->reset)
current_trace->reset(tr);

@@ -2432,6 +2468,7 @@ tracing_set_trace_write(struct file *fil
if (t->init)
t->init(tr);

+ trace_unlikely_enable(tr);
out:
mutex_unlock(&trace_types_lock);

Index: linux-tip.git/kernel/trace/trace.h
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.h 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.h 2008-10-29 12:32:35.000000000 -0400
@@ -21,6 +21,7 @@ enum trace_type {
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
+ TRACE_UNLIKELY,
TRACE_BOOT,

__TRACE_LAST_TYPE
@@ -117,6 +118,16 @@ struct trace_boot {
struct boot_trace initcall;
};

+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_unlikely {
+ struct trace_entry ent;
+ unsigned line;
+ char func[TRACE_FUNC_SIZE+1];
+ char file[TRACE_FILE_SIZE+1];
+ char correct;
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
__ftrace_bad_type(); \
} while (0)

@@ -413,8 +425,39 @@ enum trace_iterator_flags {
TRACE_ITER_STACKTRACE = 0x100,
TRACE_ITER_SCHED_TREE = 0x200,
TRACE_ITER_PRINTK = 0x400,
+#ifdef CONFIG_UNLIKELY_TRACER
+ TRACE_ITER_UNLIKELY = 0x800,
+#endif
};

extern struct tracer nop_trace;

+#ifdef CONFIG_UNLIKELY_TRACER
+extern int enable_unlikely_tracing(struct trace_array *tr);
+extern void disable_unlikely_tracing(void);
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+ if (trace_flags & TRACE_ITER_UNLIKELY)
+ return enable_unlikely_tracing(tr);
+ return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+ /* due to races, always disable */
+ disable_unlikely_tracing();
+}
+extern void trace_unlikely_stop(void);
+extern void trace_unlikely_start(void);
+#else
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+}
+static inline void trace_unlikely_stop(void) { }
+static inline void trace_unlikely_start(void) { }
+#endif /* CONFIG_UNLIKELY_TRACER */
+
#endif /* _LINUX_KERNEL_TRACE_H */
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-29 12:32:35.000000000 -0400
@@ -16,6 +16,8 @@
#include <linux/list.h>
#include <linux/fs.h>

+#include "trace.h"
+
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0

@@ -1139,7 +1141,14 @@ ring_buffer_lock_reserve(struct ring_buf
if (length > BUF_PAGE_SIZE)
goto out;

+ /*
+ * Ugly, but the unlikely tracing will enter back into
+ * the ring buffer code. Namely through the timestamp, causing
+ * a triggering of the "again" loop, that will be infinite.
+ */
+ trace_unlikely_stop();
event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ trace_unlikely_start();
if (!event)
goto out;

@@ -1251,8 +1260,10 @@ int ring_buffer_write(struct ring_buffer
goto out;

event_length = rb_calculate_event_length(length);
+ trace_unlikely_stop();
event = rb_reserve_next_event(cpu_buffer,
RINGBUF_TYPE_DATA, event_length);
+ trace_unlikely_start();
if (!event)
goto out;


2008-10-29 16:40:46

by Arjan van de Ven

Subject: Re: [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations

On Wed, 29 Oct 2008 12:35:44 -0400 (EDT)
Steven Rostedt <[email protected]> wrote:

>
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.
>
> When configured, every(*) likely and unlikely macro gets a counter
> attached to it. When the condition is hit, the hits and misses of that
> condition are recorded. These numbers can later be retrieved by:
>
> /debugfs/tracing/profile_likely - All likely markers
> /debugfs/tracing/profile_unlikely - All unlikely markers.
>

I like the general approach.
It'd be very convenient if the debugfs files also had a
percentage-incorrect (or percentage-correct, either way) listed,
so that we humans can focus straight on which ones are outliers rather
than having to do the math in our heads first...


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-10-29 22:40:15

by Steven Rostedt

[permalink] [raw]
Subject: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations


[
changes since v2:

Added Arjan's recommendation about calculating a percentage of
failures in the output.

Question: Do people prefer the "func file line" notation or the
kallsyms "func+0x<offset>" notation?
]

Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hit and misses of that condition
are recorded. These numbers can later be retrieved by:

/debugfs/tracing/profile_likely - All likely markers
/debugfs/tracing/profile_unlikely - All unlikely markers.

# cat /debug/tracing/profile_unlikely | head
correct incorrect % Function File Line
------- --------- - -------- ---- ----
2494 0 0 do_arch_prctl process_64.c 832
0 0 0 do_arch_prctl process_64.c 804
2988 0 0 IS_ERR err.h 34
124668 6133 4 __switch_to process_64.c 673
130811 0 0 __switch_to process_64.c 639
69768 61064 87 __switch_to process_64.c 624
15557 115251 100 __switch_to process_64.c 594
15555 115227 100 __switch_to process_64.c 590

# cat /debug/tracing/profile_unlikely | \
awk '{ if ($3 > 25) print $0; }' |head -20
59851 49675 82 __switch_to process_64.c 624
12688 96800 100 __switch_to process_64.c 594
12688 96778 100 __switch_to process_64.c 590
152290 49467 32 need_resched sched.h 2103
1385 1034 74 syscall_get_error syscall.h 51
0 2967 100 syscall_trace_leave ptrace.c 1568
0 1 100 native_smp_prepare_cpus smpboot.c 1231
304770 92199 30 update_curr sched_fair.c 489
113195 297065 100 calc_delta_fair sched_fair.c 411
227018 123309 54 calc_delta_mine sched.c 1279
0 70299 100 sched_info_queued sched_stats.h 222
67434 100268 100 pick_next_task_fair sched_fair.c 1394
7 9 100 yield_task_fair sched_fair.c 983
7215 3321 46 rt_policy sched.c 144
0 1611 100 pre_schedule_rt sched_rt.c 1260
1607 66013 100 pick_next_task_rt sched_rt.c 884
0 60198 100 sched_info_dequeued sched_stats.h 177
0 15 100 sched_move_task sched.c 8707
0 15 100 sched_move_task sched.c 8697
68373 50822 74 schedule sched.c 4467


# cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }'
54665 53649 98 pick_next_task sched.c 4407
25344 23614 93 switch_mm mmu_context_64.h 18
0 7 100 __cancel_work_timer workqueue.c 560
1202 1354 100 clocksource_adjust timekeeping.c 456
0 441888 100 audit_syscall_exit auditsc.c 1570
22 442776 100 audit_get_context auditsc.c 732
0 440930 100 audit_syscall_entry auditsc.c 1541
22 889 100 audit_free auditsc.c 1446
0 962 100 audit_alloc auditsc.c 862
2614 962 36 audit_alloc auditsc.c 858
0 6 100 move_masked_irq migration.c 20
4482 2110 47 __grab_cache_page filemap.c 2212
9575 2787 29 iov_iter_advance filemap.c 1884
3121 905 28 __set_page_dirty page-writeback.c 1134
12882 222991 100 mapping_unevictable pagemap.h 50
4 11 100 __flush_cpu_slab slub.c 1466
664688 320446 48 slab_free slub.c 1731
2962 61920 100 dentry_lru_del_init dcache.c 153
950 1157 100 load_elf_binary binfmt_elf.c 904
102 82 80 disk_put_part genhd.h 206
0 4 100 tcp_mss_split_point tcp_output.c 1126


As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 77
locations that were incorrect more than 25% of the time.

Note: After submitting my first version of this patch, Andrew Morton
showed me a version written by Daniel Walker, from which I picked up
the following ideas:

1) Using __builtin_constant_p to avoid profiling fixed values
(see the sketch after this list).
2) Using __FILE__ instead of instruction pointers.
3) Using the preprocessor to stop all profiling of likely
annotations from vsyscall_64.c.

Thanks to Andrew Morton, Arjan van de Ven, Theodore Ts'o and Ingo Molnar
for their feedback on this patch.

(*) Not every unlikely is recorded; those that are used by vsyscalls
(a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86/kernel/vsyscall_64.c | 8 +
include/asm-generic/vmlinux.lds.h | 14 ++-
include/linux/compiler.h | 61 +++++++++++++
kernel/trace/Kconfig | 16 +++
kernel/trace/Makefile | 2
kernel/trace/trace_unlikely.c | 167 ++++++++++++++++++++++++++++++++++++++
6 files changed, 265 insertions(+), 3 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile 2008-10-29 18:31:26.000000000 -0400
@@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
obj-$(CONFIG_BOOT_TRACER) += trace_boot.o

+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c 2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c 2008-10-29 16:13:00.000000000 -0400
@@ -17,6 +17,14 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/

+/* Protect userspace from profiling */
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+# undef likely
+# undef unlikely
+# define likely(x) likely_notrace(x)
+# define unlikely(x) unlikely_notrace(x)
+#endif
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h 2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h 2008-10-29 16:13:00.000000000 -0400
@@ -45,6 +45,17 @@
#define MCOUNT_REC()
#endif

+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \
+ *(_ftrace_likely) \
+ VMLINUX_SYMBOL(__stop_likely_profile) = .; \
+ VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+ *(_ftrace_unlikely) \
+ VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
/* .data section */
#define DATA_DATA \
*(.data) \
@@ -62,7 +73,8 @@
VMLINUX_SYMBOL(__stop___markers) = .; \
VMLINUX_SYMBOL(__start___tracepoints) = .; \
*(__tracepoints) \
- VMLINUX_SYMBOL(__stop___tracepoints) = .;
+ VMLINUX_SYMBOL(__stop___tracepoints) = .; \
+ LIKELY_PROFILE()

#define RO_DATA(align) \
. = ALIGN((align)); \
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h 2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h 2008-10-29 16:13:00.000000000 -0400
@@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile
* specific implementations come from the above header files
*/

-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+ const char *func;
+ const char *file;
+ unsigned line;
+ unsigned long correct;
+ unsigned long incorrect;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+#define likely_notrace(x) __builtin_expect(!!(x), 1)
+#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
+
+#define likely_check(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_likely"))) \
+ ______f = { \
+ .func = __func__, \
+ .file = __FILE__, \
+ .line = __LINE__, \
+ }; \
+ ______f.line = __LINE__; \
+ ______r = likely_notrace(x); \
+ ftrace_likely_update(&______f, ______r, 1); \
+ ______r; \
+ })
+#define unlikely_check(x) ({ \
+ int ______r; \
+ static struct ftrace_likely_data \
+ __attribute__((__aligned__(4))) \
+ __attribute__((section("_ftrace_unlikely"))) \
+ ______f = { \
+ .func = __func__, \
+ .file = __FILE__, \
+ .line = __LINE__, \
+ }; \
+ ______f.line = __LINE__; \
+ ______r = unlikely_notrace(x); \
+ ftrace_likely_update(&______f, ______r, 0); \
+ ______r; \
+ })
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same. This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x))
+# endif
+# ifndef unlikely
+# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+# endif
+#else
+# define likely(x) __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#endif

/* Optimization barrier */
#ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 16:12:53.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 18:31:26.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )

+config TRACE_UNLIKELY_PROFILE
+ bool "Trace likely/unlikely profiler"
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer profiles all the likely and unlikely macros
+ in the kernel. It will display the results in:
+
+ /debugfs/tracing/profile_likely
+ /debugfs/tracing/profile_unlikely
+
+ Note: this will add a significant overhead, only turn this
+ on if you need to profile the system's use of these macros.
+
+ Say N if unsure.
+
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 18:31:26.000000000 -0400
@@ -0,0 +1,167 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <[email protected]>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+ /* FIXME: Make this atomic! */
+ if (val == expect)
+ f->correct++;
+ else
+ f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+ void *start;
+ void *stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_pointer *f = m->private;
+ struct ftrace_likely_data *p = v;
+
+ (*pos)++;
+
+ if (v == (void *)1)
+ return f->start;
+
+ ++p;
+
+ if ((void *)p >= (void *)f->stop)
+ return NULL;
+
+ return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ void *t = (void *)1;
+ loff_t l = 0;
+
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+ const char *f;
+ unsigned long percent;
+
+ if (v == (void *)1) {
+ seq_printf(m, " correct incorrect %% "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+ }
+
+ /* Only print the file, not the path */
+ f = p->file + strlen(p->file);
+ while (f >= p->file && *f != '/')
+ f--;
+ f++;
+
+ if (p->correct) {
+ percent = p->incorrect * 100;
+ percent /= p->correct;
+ /* No need to see huge numbers */
+ if (percent > 100)
+ percent = 100;
+ } else
+ percent = p->incorrect ? 100 : 0;
+
+ seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+ seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+ return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open(file, &tracing_likely_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = (void *)inode->i_private;
+ }
+
+ return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+ .open = tracing_likely_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+ .start = __start_likely_profile,
+ .stop = __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+ .start = __start_unlikely_profile,
+ .stop = __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+ &ftrace_likely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+ entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+ &ftrace_unlikely_pos,
+ &tracing_likely_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs"
+ " 'profile_unlikely' entry\n");
+
+ return 0;
+}
+
+device_initcall(ftrace_unlikely_init);

2008-10-29 22:41:01

by Steven Rostedt

[permalink] [raw]
Subject: [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer


[
Changes since v2:

Changed to a "func:file:line-num" format.
]

This patch adds a way to record the instances of the likely and unlikely
condition annotations.

When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
will be added to any of the ftrace tracers.

For example:

bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177
bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489
bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411
bash-3471 [003] 357.014759: [correct] account_group_exec_runtime:sched_stats.h:356
bash-3471 [003] 357.014761: [correct] update_curr:sched_fair.c:489
bash-3471 [003] 357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411
bash-3471 [003] 357.014765: [correct] calc_delta_mine:sched.c:1279


The output shows the normal tracer heading, as well as whether the condition
was correct ("[correct]") or mistaken ("[INCORRECT]"), followed by the
function, file name and line number.


Signed-off-by: Steven Rostedt <[email protected]>
---
kernel/Makefile | 5 +
kernel/trace/Kconfig | 22 ++++++
kernel/trace/Makefile | 5 +
kernel/trace/ring_buffer.c | 11 +++
kernel/trace/trace.c | 39 +++++++++++
kernel/trace/trace.h | 43 +++++++++++++
kernel/trace/trace_unlikely.c | 137 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 262 insertions(+)

Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig 2008-10-29 17:46:37.000000000 -0400
@@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE

Say N if unsure.

+config TRACING_UNLIKELY
+ bool
+ help
+ Selected by tracers that will trace the likely and unlikely
+ conditions. This prevents the tracers themselves from being
+ profiled. Profiling the tracing infrastructure can only happen
+ when the likelys and unlikelys are not being traced.
+
+config UNLIKELY_TRACER
+ bool "Trace likely/unlikely instances"
+ depends on TRACE_UNLIKELY_PROFILE
+ select TRACING_UNLIKELY
+ help
+ This traces the events of likely and unlikely condition
+ calls in the kernel. The difference between this and the
+ "Trace likely/unlikely profiler" is that this is not a
+ histogram of the callers, but actually places the calling
+ events into a running buffer to see when and where the
+ events happened, as well as their results.
+
+ Say N if unsure.
+
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/Makefile
===================================================================
--- linux-tip.git.orig/kernel/Makefile 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/Makefile 2008-10-29 17:46:37.000000000 -0400
@@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg
endif

+ifdef CONFIG_TRACING_UNLIKELY
+# Do not profile tracepoint if we are tracing the unlikely conditions
+CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile 2008-10-29 17:46:37.000000000 -0400
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
obj-y += trace_selftest_dynamic.o
endif

+ifdef CONFIG_TRACING_UNLIKELY
+KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
+KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace_unlikely.c 2008-10-29 17:25:31.000000000 -0400
+++ linux-tip.git/kernel/trace/trace_unlikely.c 2008-10-29 18:03:30.000000000 -0400
@@ -15,8 +15,145 @@
#include <asm/local.h>
#include "trace.h"

+#ifdef CONFIG_UNLIKELY_TRACER
+
+static int unlikely_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(unlikely_tracing_mutex);
+static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
+static struct trace_array *unlikely_tracer;
+
+void trace_unlikely_stop(void)
+{
+ /* preemption should be disabled */
+ int cpu = smp_processor_id();
+
+ atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+void trace_unlikely_start(void)
+{
+ /* preemption should be disabled */
+ int cpu = smp_processor_id();
+
+ atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+static void
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+ struct trace_array *tr = unlikely_tracer;
+ struct ring_buffer_event *event;
+ struct trace_unlikely *entry;
+ unsigned long flags, irq_flags;
+ int cpu, resched, pc;
+ const char *p;
+
+ /*
+ * I would love to save just the ftrace_likely_data pointer, but
+ * this code can also be used by modules. Ugly things can happen
+ * if the module is unloaded, and then we go and read the
+ * pointer. This is slower, but much safer.
+ */
+
+ if (unlikely(!tr))
+ return;
+
+ pc = preempt_count();
+ /* Can be called within schedule */
+ resched = need_resched();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
+ goto out;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+ &irq_flags);
+ if (!event)
+ goto out;
+
+ local_save_flags(flags);
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, flags, pc);
+ entry->ent.type = TRACE_UNLIKELY;
+
+ /* Strip off the path, only save the file */
+ p = f->file + strlen(f->file);
+ while (p >= f->file && *p != '/')
+ p--;
+ p++;
+
+ strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->file, p, TRACE_FILE_SIZE);
+ entry->func[TRACE_FUNC_SIZE] = 0;
+ entry->file[TRACE_FILE_SIZE] = 0;
+ entry->line = f->line;
+ entry->correct = val == expect;
+
+ ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+ atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+ if (!unlikely_tracing_enabled)
+ return;
+
+ probe_likely_condition(f, val, expect);
+}
+
+int enable_unlikely_tracing(struct trace_array *tr)
+{
+ int ret = 0;
+
+ mutex_lock(&unlikely_tracing_mutex);
+ unlikely_tracer = tr;
+ /*
+ * Must be seen before enabling. The reader is a condition
+ * where we do not need a matching rmb()
+ */
+ smp_wmb();
+ unlikely_tracing_enabled++;
+ mutex_unlock(&unlikely_tracing_mutex);
+
+ return ret;
+}
+
+void disable_unlikely_tracing(void)
+{
+ mutex_lock(&unlikely_tracing_mutex);
+
+ if (!unlikely_tracing_enabled)
+ goto out_unlock;
+
+ unlikely_tracing_enabled--;
+
+ out_unlock:
+ mutex_unlock(&unlikely_tracing_mutex);
+}
+#else
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
{
+ /*
+ * I would love to have a trace point here instead, but the
+ * trace point code is so inundated with unlikely and likely
+ * conditions that the recursive nightmare that exists is too
+ * much to try to get working. At least for now.
+ */
+ trace_likely_condition(f, val, expect);
+
/* FIXME: Make this atomic! */
if (val == expect)
f->correct++;
Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c 2008-10-29 18:04:04.000000000 -0400
@@ -235,6 +235,9 @@ static const char *trace_options[] = {
"stacktrace",
"sched-tree",
"ftrace_printk",
+#ifdef CONFIG_UNLIKELY_TRACER
+ "unlikely",
+#endif
NULL
};

@@ -1457,6 +1460,18 @@ print_lat_fmt(struct trace_iterator *ite
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ struct trace_unlikely *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %s:%s:%d\n",
+ field->correct ? "correct" : "INCORRECT",
+ field->func,
+ field->file,
+ field->line);
+ break;
+ }
default:
trace_seq_printf(s, "Unknown type %d\n", entry->type);
}
@@ -1593,6 +1608,18 @@ static enum print_line_t print_trace_fmt
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ struct trace_unlikely *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_printf(s, "[%s] %s:%s:%d\n",
+ field->correct ? "correct" : "INCORRECT",
+ field->func,
+ field->file,
+ field->line);
+ break;
+ }
}
return TRACE_TYPE_HANDLED;
}
@@ -1675,6 +1702,10 @@ static enum print_line_t print_raw_fmt(s
trace_seq_print_cont(s, iter);
break;
}
+ case TRACE_UNLIKELY: {
+ /* IGNORE */
+ break;
+ };
}
return TRACE_TYPE_HANDLED;
}
@@ -2362,8 +2393,14 @@ tracing_ctrl_write(struct file *filp, co

tr->ctrl = val;

+ if (!val)
+ trace_unlikely_disable();
+
if (current_trace && current_trace->ctrl_update)
current_trace->ctrl_update(tr);
+
+ if (val)
+ trace_unlikely_enable(tr);
}
mutex_unlock(&trace_types_lock);

@@ -2425,6 +2462,7 @@ tracing_set_trace_write(struct file *fil
if (t == current_trace)
goto out;

+ trace_unlikely_disable();
if (current_trace && current_trace->reset)
current_trace->reset(tr);

@@ -2432,6 +2470,7 @@ tracing_set_trace_write(struct file *fil
if (t->init)
t->init(tr);

+ trace_unlikely_enable(tr);
out:
mutex_unlock(&trace_types_lock);

Index: linux-tip.git/kernel/trace/trace.h
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.h 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.h 2008-10-29 17:46:37.000000000 -0400
@@ -21,6 +21,7 @@ enum trace_type {
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
+ TRACE_UNLIKELY,
TRACE_BOOT,

__TRACE_LAST_TYPE
@@ -117,6 +118,16 @@ struct trace_boot {
struct boot_trace initcall;
};

+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_unlikely {
+ struct trace_entry ent;
+ unsigned line;
+ char func[TRACE_FUNC_SIZE+1];
+ char file[TRACE_FILE_SIZE+1];
+ char correct;
+};
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
@@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
+ IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
__ftrace_bad_type(); \
} while (0)

@@ -413,8 +425,39 @@ enum trace_iterator_flags {
TRACE_ITER_STACKTRACE = 0x100,
TRACE_ITER_SCHED_TREE = 0x200,
TRACE_ITER_PRINTK = 0x400,
+#ifdef CONFIG_UNLIKELY_TRACER
+ TRACE_ITER_UNLIKELY = 0x800,
+#endif
};

extern struct tracer nop_trace;

+#ifdef CONFIG_UNLIKELY_TRACER
+extern int enable_unlikely_tracing(struct trace_array *tr);
+extern void disable_unlikely_tracing(void);
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+ if (trace_flags & TRACE_ITER_UNLIKELY)
+ return enable_unlikely_tracing(tr);
+ return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+ /* due to races, always disable */
+ disable_unlikely_tracing();
+}
+extern void trace_unlikely_stop(void);
+extern void trace_unlikely_start(void);
+#else
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+}
+static inline void trace_unlikely_stop(void) { }
+static inline void trace_unlikely_start(void) { }
+#endif /* CONFIG_UNLIKELY_TRACER */
+
#endif /* _LINUX_KERNEL_TRACE_H */
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-29 17:46:37.000000000 -0400
@@ -16,6 +16,8 @@
#include <linux/list.h>
#include <linux/fs.h>

+#include "trace.h"
+
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0

@@ -1151,7 +1153,14 @@ ring_buffer_lock_reserve(struct ring_buf
if (length > BUF_PAGE_SIZE)
goto out;

+ /*
+ * Ugly, but the unlikely tracing will enter back into
+ * the ring buffer code. Namely through the timestamp, causing
+ * a triggering of the "again" loop, that will be infinite.
+ */
+ trace_unlikely_stop();
event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ trace_unlikely_start();
if (!event)
goto out;

@@ -1263,8 +1272,10 @@ int ring_buffer_write(struct ring_buffer
goto out;

event_length = rb_calculate_event_length(length);
+ trace_unlikely_stop();
event = rb_reserve_next_event(cpu_buffer,
RINGBUF_TYPE_DATA, event_length);
+ trace_unlikely_start();
if (!event)
goto out;


2008-10-30 14:33:11

by Jörn Engel

[permalink] [raw]
Subject: Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations

On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
>
> 69768 61064 87 __switch_to process_64.c 624
> 15557 115251 100 __switch_to process_64.c 594
> 15555 115227 100 __switch_to process_64.c 590

I may be out of school for a while, but that math sure looks odd.

> + if (p->correct) {
> + percent = p->incorrect * 100;
> + percent /= p->correct;
percent /= p->correct + p->incorrect;
> + /* No need to see huge numbers */
> + if (percent > 100)
> + percent = 100;
> + } else
> + percent = p->incorrect ? 100 : 0;

Jörn

--
To my face you have the audacity to advise me to become a thief - the worst
kind of thief that is conceivable, a thief of spiritual things, a thief of
ideas! It is insufferable, intolerable!
-- M. Binet in Scarabouche

2008-10-30 14:55:59

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations

On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
> On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
> >
> > 69768 61064 87 __switch_to process_64.c 624
> > 15557 115251 100 __switch_to process_64.c 594
> > 15555 115227 100 __switch_to process_64.c 590
>
> I may be out of school for a while, but that math sure looks odd.
>
> > + if (p->correct) {
> > + percent = p->incorrect * 100;
> > + percent /= p->correct;
> percent /= p->correct + p->incorrect;

And once you do the above fix, I don't think what is below is
necessary any more. :-)

> > + /* No need to see huge numbers */
> > + if (percent > 100)
> > + percent = 100;

I would also calculate the percent correct rather than the percent
incorrect, on the general theory that 100% good, 0% bad is easier for
my little brain to understand, but that's just a minor thing...

- Ted

2008-10-30 15:10:53

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations


On Thu, 30 Oct 2008, Theodore Tso wrote:

> On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
> > On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
> > >
> > > 69768 61064 87 __switch_to process_64.c 624
> > > 15557 115251 100 __switch_to process_64.c 594
> > > 15555 115227 100 __switch_to process_64.c 590
> >
> > I may be out of school for a while, but that math sure looks odd.
> >
> > > + if (p->correct) {
> > > + percent = p->incorrect * 100;
> > > + percent /= p->correct;
> > percent /= p->correct + p->incorrect;
>
> And once you do the above fix, I don't think what is below is
> necessary any more. :-)

Bah! Total brain-fart. Thanks ;-)

>
> > > + /* No need to see huge numbers */
> > > + if (percent > 100)
> > > + percent = 100;
>
> I would also calculate the percent correct rather than the percent
> incorrect, on the general theory that 100% good, 0% bad is easier for
> my little brain to understand, but that's just a minor thing...

I thought about whether to show percent correct or percent incorrect. I
chose incorrect, just because it is easier to spot the troublemakers. You
can say "let's allow 25% incorrect", which is better than saying "let's
have a minimum of 75% correct". I think the mind can pick out those that go
over a number better than it can see those that are under a number.
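
For reference, here is what the percentage calculation looks like with
Jörn's correction folded in, still reporting percent incorrect. This is
just a sketch of the fix discussed above (the helper name
percent_incorrect() is made up for illustration), not a follow-up patch:

	/* Use the total number of hits as the denominator, so the result
	 * can never exceed 100 and no capping is needed; guard against a
	 * site that has never been hit at all.
	 */
	static unsigned long percent_incorrect(unsigned long correct,
					       unsigned long incorrect)
	{
		unsigned long total = correct + incorrect;

		if (!total)
			return 0;
		return incorrect * 100 / total;
	}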

Thanks,

-- Steve