2008-02-19 20:39:26

by Arjan van de Ven

[permalink] [raw]
Subject: [PATCH] x86: add the debugfs interface for the sysprof tool

From: Soren Sandmann <[email protected]>
Subject: [PATCH] x86: add the debugfs interface for the sysprof tool

The sysprof tool is a very easy to use GUI tool to find out where
userspace is spending CPU time. See
http://www.daimi.au.dk/~sandmann/sysprof/
for more information and screenshots on this tool.

Sysprof needs a 200 line kernel module to do it's work, this
module puts some simple profiling data into debugfs.

Signed-off-by: Soren Sandmann <[email protected]>
Signed-off-by: Arjan van de Ven <[email protected]>
---
arch/x86/Kconfig.debug | 10 ++
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/sysprof.c | 200 +++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/sysprof.h | 34 ++++++++
4 files changed, 246 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/kernel/sysprof.c
create mode 100644 arch/x86/kernel/sysprof.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 12c98ea..8eb06c0 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -206,6 +206,16 @@ config MMIOTRACE_TEST

Say N, unless you absolutely know what you are doing.

+config SYSPROF
+ tristate "Enable sysprof userland performance sampler"
+ depends on PROFILING
+ help
+ This option enables the sysprof debugfs file that is used by the
+ sysprof tool. sysprof is a tool to show where in userspace CPU
+ time is spent.
+
+ When in doubt, say N
+
#
# IO delay types:
#
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4a4260c..1e8fb66 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o

+obj-$(CONFIG_SYSPROF) += sysprof.o
+
ifdef CONFIG_INPUT_PCSPKR
obj-y += pcspeaker.o
endif
diff --git a/arch/x86/kernel/sysprof.c b/arch/x86/kernel/sysprof.c
new file mode 100644
index 0000000..6220b9f
--- /dev/null
+++ b/arch/x86/kernel/sysprof.c
@@ -0,0 +1,200 @@
+/* -*- c-basic-offset: 8 -*- */
+
+/* Sysprof -- Sampling, systemwide CPU profiler
+ * Copyright 2004, Red Hat, Inc.
+ * Copyright 2004, 2005, Soeren Sandmann
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/profile.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+
+#include "sysprof.h"
+
+#define SAMPLES_PER_SECOND (200)
+#define INTERVAL ((HZ <= SAMPLES_PER_SECOND)? 1 : (HZ / SAMPLES_PER_SECOND))
+#define N_TRACES 256
+
+static struct sysprof_stacktrace stack_traces[N_TRACES];
+static struct sysprof_stacktrace *head = &stack_traces[0];
+static struct sysprof_stacktrace *tail = &stack_traces[0];
+static DECLARE_WAIT_QUEUE_HEAD(wait_for_trace);
+static DECLARE_WAIT_QUEUE_HEAD(wait_for_exit);
+
+struct userspace_reader {
+ struct task_struct *task;
+ unsigned long cache_address;
+ unsigned long *cache;
+};
+
+struct stack_frame;
+
+struct stack_frame {
+ struct stack_frame __user *next;
+ unsigned long return_address;
+};
+
+static int read_frame(struct stack_frame __user *frame_pointer,
+ struct stack_frame *frame)
+{
+ if (__copy_from_user_inatomic(frame, frame_pointer,
+ sizeof(struct stack_frame)))
+ return 1;
+ return 0;
+}
+
+static DEFINE_PER_CPU(int, n_samples);
+
+static int timer_notify(struct pt_regs *regs)
+{
+ struct sysprof_stacktrace *trace = head;
+ int i;
+ int is_user;
+ static atomic_t in_timer_notify = ATOMIC_INIT(1);
+ int n;
+
+ n = ++get_cpu_var(n_samples);
+ put_cpu_var(n_samples);
+
+ if (n % INTERVAL != 0)
+ return 0;
+
+ /* 0: locked, 1: unlocked */
+
+ if (!atomic_dec_and_test(&in_timer_notify))
+ goto out;
+
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ goto out;
+
+ if (is_user && current->state != TASK_RUNNING)
+ goto out;
+
+ if (!is_user) {
+ /* kernel */
+ trace->pid = current->pid;
+ trace->truncated = 0;
+ trace->n_addresses = 1;
+
+ /* 0x1 is taken by sysprof to mean "in kernel" */
+ trace->addresses[0] = 0x1;
+ } else {
+ struct stack_frame __user *frame_pointer;
+ struct stack_frame frame;
+ memset(trace, 0, sizeof(struct sysprof_stacktrace));
+
+ trace->pid = current->pid;
+ trace->truncated = 0;
+
+ i = 0;
+
+ trace->addresses[i++] = regs->ip;
+
+ frame_pointer = (struct stack_frame __user *)regs->bp;
+
+ while (read_frame(frame_pointer, &frame) == 0 &&
+ i < SYSPROF_MAX_ADDRESSES &&
+ (unsigned long)frame_pointer >= regs->sp) {
+ trace->addresses[i++] = frame.return_address;
+ frame_pointer = frame.next;
+ }
+
+ trace->n_addresses = i;
+
+ if (i == SYSPROF_MAX_ADDRESSES)
+ trace->truncated = 1;
+ else
+ trace->truncated = 0;
+ }
+
+ if (head++ == &stack_traces[N_TRACES - 1])
+ head = &stack_traces[0];
+
+ wake_up(&wait_for_trace);
+
+out:
+ atomic_inc(&in_timer_notify);
+ return 0;
+}
+
+static ssize_t procfile_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ ssize_t bcount;
+ if (head == tail)
+ return -EWOULDBLOCK;
+
+ BUG_ON(tail->pid == 0);
+ *ppos = 0;
+ bcount = simple_read_from_buffer(buffer, count, ppos,
+ tail, sizeof(struct sysprof_stacktrace));
+
+ if (tail++ == &stack_traces[N_TRACES - 1])
+ tail = &stack_traces[0];
+
+ return bcount;
+}
+
+static unsigned int procfile_poll(struct file *filp, poll_table * poll_table)
+{
+ if (head != tail)
+ return POLLIN | POLLRDNORM;
+
+ poll_wait(filp, &wait_for_trace, poll_table);
+
+ if (head != tail)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static const struct file_operations sysprof_fops = {
+ .owner = THIS_MODULE,
+ .read = procfile_read,
+ .poll = procfile_poll,
+};
+
+static struct dentry *debugfs_pe;
+int init_module(void)
+{
+ debugfs_pe = debugfs_create_file("sysprof-trace", 0600, NULL, NULL,
+ &sysprof_fops);
+ if (!debugfs_pe)
+ return -ENODEV;
+ register_timer_hook(timer_notify);
+
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ unregister_timer_hook(timer_notify);
+ debugfs_remove(debugfs_pe);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Soeren Sandmann ([email protected])");
+MODULE_DESCRIPTION("Kernel driver for the sysprof performance analysis tool");
diff --git a/arch/x86/kernel/sysprof.h b/arch/x86/kernel/sysprof.h
new file mode 100644
index 0000000..6e16d6f
--- /dev/null
+++ b/arch/x86/kernel/sysprof.h
@@ -0,0 +1,34 @@
+/* Sysprof -- Sampling, systemwide CPU profiler
+ * Copyright 2004, Red Hat, Inc.
+ * Copyright 2004, 2005, Soeren Sandmann
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef SYSPROF_MODULE_H
+#define SYSPROF_MODULE_H
+
+#define SYSPROF_MAX_ADDRESSES 512
+
+struct sysprof_stacktrace {
+ int pid; /* -1 if in kernel */
+ int truncated;
+ int n_addresses; /* note: this can be 1 if the process was compiled
+ * with -fomit-frame-pointer or is otherwise weird
+ */
+ unsigned long addresses[SYSPROF_MAX_ADDRESSES];
+};
+
+#endif
--
1.5.4.1



--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org


2008-02-20 09:10:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Arjan van de Ven <[email protected]> wrote:

> From: Soren Sandmann <[email protected]>
> Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
>
> The sysprof tool is a very easy to use GUI tool to find out where
> userspace is spending CPU time. See
> http://www.daimi.au.dk/~sandmann/sysprof/ for more information and
> screenshots on this tool.
>
> Sysprof needs a 200 line kernel module to do it's work, this module
> puts some simple profiling data into debugfs.

thanks, looks good to me - applied.

Ingo

2008-02-20 18:16:53

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Tue, 2008-02-19 at 12:37 -0800, Arjan van de Ven wrote:
> From: Soren Sandmann <[email protected]>
> Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
>
> The sysprof tool is a very easy to use GUI tool to find out where
> userspace is spending CPU time. See
> http://www.daimi.au.dk/~sandmann/sysprof/
> for more information and screenshots on this tool.
>
> Sysprof needs a 200 line kernel module to do it's work, this
> module puts some simple profiling data into debugfs.

What is the added value over oprofile?

2008-02-20 18:40:16

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Wed, 20 Feb 2008 19:16:15 +0100
Peter Zijlstra <[email protected]> wrote:

>
> On Tue, 2008-02-19 at 12:37 -0800, Arjan van de Ven wrote:
> > From: Soren Sandmann <[email protected]>
> > Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
> >
> > The sysprof tool is a very easy to use GUI tool to find out where
> > userspace is spending CPU time. See
> > http://www.daimi.au.dk/~sandmann/sysprof/
> > for more information and screenshots on this tool.
> >
> > Sysprof needs a 200 line kernel module to do it's work, this
> > module puts some simple profiling data into debugfs.
>
> What is the added value over oprofile?

it actually works and is usable by humans ;)

what oprofile doesn't do is the nice userland hierarchy of where cpu time is spend.
(and that is also what makes it mostly useless in practice)
>


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-02-20 18:54:19

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Wed, 2008-02-20 at 10:39 -0800, Arjan van de Ven wrote:
> On Wed, 20 Feb 2008 19:16:15 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> >
> > On Tue, 2008-02-19 at 12:37 -0800, Arjan van de Ven wrote:
> > > From: Soren Sandmann <[email protected]>
> > > Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
> > >
> > > The sysprof tool is a very easy to use GUI tool to find out where
> > > userspace is spending CPU time. See
> > > http://www.daimi.au.dk/~sandmann/sysprof/
> > > for more information and screenshots on this tool.
> > >
> > > Sysprof needs a 200 line kernel module to do it's work, this
> > > module puts some simple profiling data into debugfs.
> >
> > What is the added value over oprofile?
>
> it actually works and is usable by humans ;)

# sysprof

(sysprof:12480): Gtk-WARNING **: cannot open display:

-ENOX

> what oprofile doesn't do is the nice userland hierarchy of where cpu time is spend.
> (and that is also what makes it mostly useless in practice)

so why not fix oprofile callgraph output and build a GUI on top of
oprofile for those of us who really want RSI from mouse movement :-)

2008-02-20 18:58:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Wed, 2008-02-20 at 19:53 +0100, Peter Zijlstra wrote:
> On Wed, 2008-02-20 at 10:39 -0800, Arjan van de Ven wrote:
> > On Wed, 20 Feb 2008 19:16:15 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > >
> > > On Tue, 2008-02-19 at 12:37 -0800, Arjan van de Ven wrote:
> > > > From: Soren Sandmann <[email protected]>
> > > > Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
> > > >
> > > > The sysprof tool is a very easy to use GUI tool to find out where
> > > > userspace is spending CPU time. See
> > > > http://www.daimi.au.dk/~sandmann/sysprof/
> > > > for more information and screenshots on this tool.
> > > >
> > > > Sysprof needs a 200 line kernel module to do it's work, this
> > > > module puts some simple profiling data into debugfs.
> > >
> > > What is the added value over oprofile?
> >
> > it actually works and is usable by humans ;)
>
> # sysprof
>
> (sysprof:12480): Gtk-WARNING **: cannot open display:
>
> -ENOX

Usable for me is a cli interface. Also, I absolutely love opannotate.

For a fact, most of my professional userspace development days, I had to
profile on remote machines with no X.

2008-02-20 19:27:28

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Wed, 20 Feb 2008 19:53:42 +0100
Peter Zijlstra <[email protected]> wrote:

>
> On Wed, 2008-02-20 at 10:39 -0800, Arjan van de Ven wrote:
> > On Wed, 20 Feb 2008 19:16:15 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > >
> > > On Tue, 2008-02-19 at 12:37 -0800, Arjan van de Ven wrote:
> > > > From: Soren Sandmann <[email protected]>
> > > > Subject: [PATCH] x86: add the debugfs interface for the sysprof
> > > > tool
> > > >
> > > > The sysprof tool is a very easy to use GUI tool to find out
> > > > where userspace is spending CPU time. See
> > > > http://www.daimi.au.dk/~sandmann/sysprof/
> > > > for more information and screenshots on this tool.
> > > >
> > > > Sysprof needs a 200 line kernel module to do it's work, this
> > > > module puts some simple profiling data into debugfs.
> > >
> > > What is the added value over oprofile?
> >
> > it actually works and is usable by humans ;)
>
> # sysprof
>
> (sysprof:12480): Gtk-WARNING **: cannot open display:
>
> -ENOX
>
> > what oprofile doesn't do is the nice userland hierarchy of where
> > cpu time is spend. (and that is also what makes it mostly useless
> > in practice)
>
> so why not fix oprofile callgraph output and build a GUI on top of
> oprofile for those of us who really want RSI from mouse movement :-)

feel free to reinvent a whole GUI just to avoid a 200 line kernel module.
sysprof is here. it works. the gui is REALLY nice.
I think it's the wrong tradeoff though... oprofile exists for how long?


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-02-20 20:59:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Wed, 2008-02-20 at 11:26 -0800, Arjan van de Ven wrote:

> feel free to reinvent a whole GUI just to avoid a 200 line kernel module.
> sysprof is here. it works.

> the gui is REALLY nice.

I guess we have to agree to disagree here. Its plain useless from my
POV.

> I think it's the wrong tradeoff though... oprofile exists for how long?

Dunno, years, and has served me well.

The thing I worry about is the wild-growth of duplicate functionality
and interfaces. You might say, 'its in /debug' so no API crap, but if
enough user-space depends on it people _will_ complain if it breaks.

Hopefully someone will consolidate stuff - soon. I can agree with the
fact that the oprofile user-interface is quite horrible, and perhaps the
kernel code isn't pretty (never looked at it), so if people want to
replace it, feel free, but offer a full replacement so we can deprecate
and remove the old stuff, and not carry everything around.

Currently we have: readprofile, oprofile, perfmon and now sysprof.

Also, sysprof is a misnomer, you cannot be a system wide profiler and
have code like:

+ if (!is_user) {
+ /* kernel */
+ trace->pid = current->pid;
+ trace->truncated = 0;
+ trace->n_addresses = 1;
+
+ /* 0x1 is taken by sysprof to mean "in kernel" */
+ trace->addresses[0] = 0x1;
+ }

The kernel is an integral part of the system, it can often help to know
where in the kernel time is spent - even if you're not directly
interested in 'fixing' the kernel.

2008-02-20 21:08:11

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Wed, 20 Feb 2008 21:58:42 +0100
Peter Zijlstra <[email protected]> wrote:

>
> On Wed, 2008-02-20 at 11:26 -0800, Arjan van de Ven wrote:
>
> > feel free to reinvent a whole GUI just to avoid a 200 line kernel
> > module. sysprof is here. it works.
>
> > the gui is REALLY nice.
>
> I guess we have to agree to disagree here. Its plain useless from my
> POV.

that's fine. Different tools for different people. sysprof isn't aimed
at kernel developers.

"If all you have is an allen wrench, everything looks like Ikea"

it's ok to have different tools for really different things


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-02-20 21:45:38

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Wed, 2008-02-20 at 13:07 -0800, Arjan van de Ven wrote:
> On Wed, 20 Feb 2008 21:58:42 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> >
> > On Wed, 2008-02-20 at 11:26 -0800, Arjan van de Ven wrote:
> >
> > > feel free to reinvent a whole GUI just to avoid a 200 line kernel
> > > module. sysprof is here. it works.
> >
> > > the gui is REALLY nice.
> >
> > I guess we have to agree to disagree here. Its plain useless from my
> > POV.
>
> that's fine. Different tools for different people. sysprof isn't aimed
> at kernel developers.

That was speaking from userland days.

But you forgot the more important points about API and wild-growth of
duplicate interfaces.

2008-02-20 22:37:30

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Wed, 20 Feb 2008 22:44:29 +0100
Peter Zijlstra <[email protected]> wrote:

>
> On Wed, 2008-02-20 at 13:07 -0800, Arjan van de Ven wrote:
> > On Wed, 20 Feb 2008 21:58:42 +0100
> > Peter Zijlstra <[email protected]> wrote:
> >
> > >
> > > On Wed, 2008-02-20 at 11:26 -0800, Arjan van de Ven wrote:
> > >
> > > > feel free to reinvent a whole GUI just to avoid a 200 line
> > > > kernel module. sysprof is here. it works.
> > >
> > > > the gui is REALLY nice.
> > >
> > > I guess we have to agree to disagree here. Its plain useless from
> > > my POV.
> >
> > that's fine. Different tools for different people. sysprof isn't
> > aimed at kernel developers.
>
> That was speaking from userland days.
>
> But you forgot the more important points about API and wild-growth of
> duplicate interfaces.

yes there's multiple interfaces. There are multiple interfaces *today*.
Oprofile/perfmon2 are very focused on CPU events and have complex interfaces,
sysprof has a much more simple interface (and yes, very specific to syspref)
that just focuses on samples.

Sadly, I think there's use for both, and forcing both into the same straightjacket is a mistake imo.


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-02-23 08:28:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Tue, 19 Feb 2008 12:37:56 -0800 Arjan van de Ven <[email protected]> wrote:

> From: Soren Sandmann <[email protected]>
> Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
>
> The sysprof tool is a very easy to use GUI tool to find out where
> userspace is spending CPU time. See
> http://www.daimi.au.dk/~sandmann/sysprof/
> for more information and screenshots on this tool.
>
> Sysprof needs a 200 line kernel module to do it's work, this
> module puts some simple profiling data into debugfs.
>
> ...
>

Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
your distributor already did it for you.

Sidebar: the code uses the utterly crappy register_timer_hook() which

a) is woefully misnamed and

b) is racy and

c) will disrupt oprofile if it is being used. And vice versa.



This code adds a new kernel->userspace interface which is not even
documented in code comments. It appears to use a pollable debugfs file in
/proc somewhere, carrying an unspecified payload.


What happens when multiple processes are consuming data from the same
debugfs file?


> arch/x86/Kconfig.debug | 10 ++
> arch/x86/kernel/Makefile | 2 +
> arch/x86/kernel/sysprof.c | 200 +++++++++++++++++++++++++++++++++++++++++++++
> arch/x86/kernel/sysprof.h | 34 ++++++++
> 4 files changed, 246 insertions(+), 0 deletions(-)
> create mode 100644 arch/x86/kernel/sysprof.c
> create mode 100644 arch/x86/kernel/sysprof.h
>
> diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
> index 12c98ea..8eb06c0 100644
> --- a/arch/x86/Kconfig.debug
> +++ b/arch/x86/Kconfig.debug
> @@ -206,6 +206,16 @@ config MMIOTRACE_TEST
>
> Say N, unless you absolutely know what you are doing.
>
> +config SYSPROF
> + tristate "Enable sysprof userland performance sampler"
> + depends on PROFILING

Missing dependency on DEBUG_FS

> + help
> + This option enables the sysprof debugfs file that is used by the
> + sysprof tool. sysprof is a tool to show where in userspace CPU
> + time is spent.
> +
> + When in doubt, say N
> +

And it's x86-specific.

> #
> # IO delay types:
> #
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 4a4260c..1e8fb66 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
> obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
> obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
>
> +obj-$(CONFIG_SYSPROF) += sysprof.o
> +
> ifdef CONFIG_INPUT_PCSPKR
> obj-y += pcspeaker.o
> endif
> diff --git a/arch/x86/kernel/sysprof.c b/arch/x86/kernel/sysprof.c
> new file mode 100644
> index 0000000..6220b9f
> --- /dev/null
> +++ b/arch/x86/kernel/sysprof.c
> @@ -0,0 +1,200 @@
> +/* -*- c-basic-offset: 8 -*- */
> +
> +/* Sysprof -- Sampling, systemwide CPU profiler
> + * Copyright 2004, Red Hat, Inc.
> + * Copyright 2004, 2005, Soeren Sandmann
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/poll.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/profile.h>
> +#include <linux/debugfs.h>
> +#include <asm/uaccess.h>

checkpatch used to warn that linux/uaccess.h is preferred over asm/uaccess.h
but this is another of those checkpatch features which seems to have
mysteriously disappeared, or it broke?

> +#include <asm/atomic.h>
> +
> +#include "sysprof.h"
> +
> +#define SAMPLES_PER_SECOND (200)
> +#define INTERVAL ((HZ <= SAMPLES_PER_SECOND)? 1 : (HZ / SAMPLES_PER_SECOND))
> +#define N_TRACES 256
> +
> +static struct sysprof_stacktrace stack_traces[N_TRACES];
> +static struct sysprof_stacktrace *head = &stack_traces[0];
> +static struct sysprof_stacktrace *tail = &stack_traces[0];

Access to head and tail appear to be racy. See below.

> +static DECLARE_WAIT_QUEUE_HEAD(wait_for_trace);
> +static DECLARE_WAIT_QUEUE_HEAD(wait_for_exit);
> +
> +struct userspace_reader {
> + struct task_struct *task;
> + unsigned long cache_address;
> + unsigned long *cache;
> +};
> +
> +struct stack_frame;
> +
> +struct stack_frame {
> + struct stack_frame __user *next;
> + unsigned long return_address;
> +};
> +
> +static int read_frame(struct stack_frame __user *frame_pointer,
> + struct stack_frame *frame)
> +{
> + if (__copy_from_user_inatomic(frame, frame_pointer,
> + sizeof(struct stack_frame)))
> + return 1;
> + return 0;
> +}
> +
> +static DEFINE_PER_CPU(int, n_samples);
> +
> +static int timer_notify(struct pt_regs *regs)
> +{
> + struct sysprof_stacktrace *trace = head;

We read `head' before taking the "lock". Another CPU could modify it after
we took a local copy.

> + int i;
> + int is_user;
> + static atomic_t in_timer_notify = ATOMIC_INIT(1);
> + int n;
> +
> + n = ++get_cpu_var(n_samples);
> + put_cpu_var(n_samples);

Needlessly disables preemption. Use __get_cpu_var().

> + if (n % INTERVAL != 0)
> + return 0;

It'd be nice to make INTERVAL a power of 2.

> + /* 0: locked, 1: unlocked */
> +
> + if (!atomic_dec_and_test(&in_timer_notify))
> + goto out;

Why not use spin_trylock()? Then you get lockdep support too.

And why not use spin_lock()? At least a comment should be added explaining
and justifying this peculiar home-made-not-really-locking design.

> + is_user = user_mode(regs);
> +
> + if (!current || current->pid == 0)
> + goto out;

And the changelog should explain and justify why we cannot profile root's
code. I cannot begin to imagine why it was done and I cannot fathom why it
passed uncommented in documentation, code, changelog and "review" comments.
It greatly reduces the usefulness of an already dubious feature.

If this limitation _was_ documented then I'd be in a position to ask what is
special about root, as opposed to some non-root user who has <unspecified>
capabilities. And why we penalise a root who has dropped <unspecified>
capabilities. etcetera.

Is this open-coded test of ->pid correct in a containerised environment?

> + if (is_user && current->state != TASK_RUNNING)
> + goto out;

Needs a comment (although this one is fairly obvious)

> + if (!is_user) {
> + /* kernel */
> + trace->pid = current->pid;
> + trace->truncated = 0;
> + trace->n_addresses = 1;
> +
> + /* 0x1 is taken by sysprof to mean "in kernel" */
> + trace->addresses[0] = 0x1;
> + } else {
> + struct stack_frame __user *frame_pointer;
> + struct stack_frame frame;
> + memset(trace, 0, sizeof(struct sysprof_stacktrace));
> +
> + trace->pid = current->pid;

This is ambiguous in a containerised environment.

Ingo, please be alert for anything which exposes raw pids to userspace.

I don't know what a correct and suitable interface might be - perhaps Pavel
or Eric can suggest something.

> + trace->truncated = 0;
> +
> + i = 0;
> +
> + trace->addresses[i++] = regs->ip;
> +
> + frame_pointer = (struct stack_frame __user *)regs->bp;
> +
> + while (read_frame(frame_pointer, &frame) == 0 &&
> + i < SYSPROF_MAX_ADDRESSES &&
> + (unsigned long)frame_pointer >= regs->sp) {
> + trace->addresses[i++] = frame.return_address;
> + frame_pointer = frame.next;
> + }

The (absent) interface documentation should explain what happens when a
fault causes this information to be truncated.

> + trace->n_addresses = i;
> +
> + if (i == SYSPROF_MAX_ADDRESSES)
> + trace->truncated = 1;
> + else
> + trace->truncated = 0;
> + }
> +
> + if (head++ == &stack_traces[N_TRACES - 1])
> + head = &stack_traces[0];

`head' can just merrily advance over `tail' and there is no notification to
userspace of the lost data.

> + wake_up(&wait_for_trace);
> +
> +out:
> + atomic_inc(&in_timer_notify);
> + return 0;
> +}
> +
> +static ssize_t procfile_read(struct file *filp, char __user *buffer,
> + size_t count, loff_t *ppos)
> +{
> + ssize_t bcount;
> + if (head == tail)
> + return -EWOULDBLOCK;

Please put a blank line between end-of-variables and start-of-code.

This seems to be a wrong return value? Shouldn't it just return zero if
there was nothing there? As can happen if some other process is reading the
same debugfs file?

> + BUG_ON(tail->pid == 0);

whee. There was no locking above to prevent the tasks's pid from
transitioning from non-zero to zero after it was tested. Which means this
is triggerable. Perhaps the implicit locking due to cpu-pinnedness and
interruption will prevent that race. If so, such a subtlety should be
commented, no?

> + *ppos = 0;
> + bcount = simple_read_from_buffer(buffer, count, ppos,
> + tail, sizeof(struct sysprof_stacktrace));
> +
> + if (tail++ == &stack_traces[N_TRACES - 1])
> + tail = &stack_traces[0];

There is no locking for `tail', and afaict we support multiple simultaneous
readers.

> + return bcount;
> +}

This reads a single item even if there were 100 queued, which is quite
inefficient.

We already have infrastructure for bulk kernel->user transfer in
kernel/relay.c?

> +static unsigned int procfile_poll(struct file *filp, poll_table * poll_table)

checkpatch missed this coding-style error.

> +{
> + if (head != tail)
> + return POLLIN | POLLRDNORM;
> +
> + poll_wait(filp, &wait_for_trace, poll_table);
> +
> + if (head != tail)
> + return POLLIN | POLLRDNORM;
> +
> + return 0;
> +}
> +
> +static const struct file_operations sysprof_fops = {
> + .owner = THIS_MODULE,
> + .read = procfile_read,
> + .poll = procfile_poll,
> +};
> +
> +static struct dentry *debugfs_pe;
> +int init_module(void)
> +{
> + debugfs_pe = debugfs_create_file("sysprof-trace", 0600, NULL, NULL,
> + &sysprof_fops);
> + if (!debugfs_pe)
> + return -ENODEV;
> + register_timer_hook(timer_notify);
> + return 0;
> +}

This function will enable sysprof-trace even if prof_on==0,
prof_on==SLEEP_PROFILING, etc which is pointless.

> +void cleanup_module(void)
> +{
> + unregister_timer_hook(timer_notify);
> + debugfs_remove(debugfs_pe);
> +}
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Soeren Sandmann ([email protected])");
> +MODULE_DESCRIPTION("Kernel driver for the sysprof performance analysis tool");
> diff --git a/arch/x86/kernel/sysprof.h b/arch/x86/kernel/sysprof.h
> new file mode 100644
> index 0000000..6e16d6f
> --- /dev/null
> +++ b/arch/x86/kernel/sysprof.h
> @@ -0,0 +1,34 @@
> +/* Sysprof -- Sampling, systemwide CPU profiler
> + * Copyright 2004, Red Hat, Inc.
> + * Copyright 2004, 2005, Soeren Sandmann
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + */
> +
> +#ifndef SYSPROF_MODULE_H
> +#define SYSPROF_MODULE_H
> +
> +#define SYSPROF_MAX_ADDRESSES 512
> +
> +struct sysprof_stacktrace {
> + int pid; /* -1 if in kernel */
> + int truncated;
> + int n_addresses; /* note: this can be 1 if the process was compiled
> + * with -fomit-frame-pointer or is otherwise weird
> + */
> + unsigned long addresses[SYSPROF_MAX_ADDRESSES];
> +};

This is broken for 32-bit userspace running on a 64-bit kernel. Unless
said userspace jumps through hoops and works out that it's running under a
64-bit kernel.

There might be alignment issues for addresses[], being at offset 12.


On Wed, 20 Feb 2008 10:10:22 +0100 Ingo Molnar <[email protected]> wrote:
>
> thanks, looks good to me - applied.

This is pretty distressing, frankly. The threshold for getting code into
Linux should be much higher than this.

I do not have the time to review everything which goes into all the git
trees. Better review, please.

2008-02-23 11:38:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Andrew Morton <[email protected]> wrote:

> > Sysprof needs a 200 line kernel module to do it's work, this module
> > puts some simple profiling data into debugfs.
> >
> > ...
>
> Seems a poor idea to me. Sure, oprofile is "hard to set up", but not
> if your distributor already did it for you.

two things.

Firstly, this isnt an oprofile replacement, this is a pretty separate
concept. Sysprof is more of a tracer than a profiler. (and we are
currently working on merging it into ftrace)

Secondly, real developers who tune user-space code disagree with your
characterisation of oprofile being easy to use. _Please_ just ask any
developer around you who hasnt used oprofile before to try sysprof
versus oprofile - without looking at any docs. Give him minimal context
and two starting points: "opcontrol" and "sysprof" - nothing else. Give
him a very simple test.c code:

void test_fn(void)
{
for (;;)
gettimeofday(0, 0);
}

int main(void)
{
test_fn();
}

and ask him to try oprofile and then to try sysprof as a comparison - to
and figure in which app and which function the overhead is.

then measure the time it takes to solve this task via the two tools and
ask the developer's first impression about the two tools.

> On Wed, 20 Feb 2008 10:10:22 +0100 Ingo Molnar <[email protected]> wrote:
> >
> > thanks, looks good to me - applied.
>
> This is pretty distressing, frankly. The threshold for getting code into
> Linux should be much higher than this.
>
> I do not have the time to review everything which goes into all the git
> trees. Better review, please.

that was for x86.git#testing, it's not even in x86.git#mm.

It's 200 lines of pretty well isolated code for something that is
already much more usable to me than 10 years of oprofile. Really, i'd
much rather take 200 lines of poor kernel code written by a userspace
developer for stuff that _already works better_, than to have ~2000
lines of oprofile code and an unusable (to me) user-space tool written
by kernel developers.

Ingo

2008-02-23 11:51:45

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
<[email protected]> wrote:
> Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
> your distributor already did it for you.

Have you tried sysprof? It's really nice to setup and use compared to
oprofile when profiling user-space.

On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
<[email protected]> wrote:
> This code adds a new kernel->userspace interface which is not even
> documented in code comments. It appears to use a pollable debugfs file in
> /proc somewhere, carrying an unspecified payload.

Hmm, it does include <linux/proc_fs.h> but seems to set up a regular
debugfs directory (usually mounted in /sys/debug) and not proc. Arjan?

2008-02-23 12:22:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Pekka Enberg <[email protected]> wrote:

> On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
> <[email protected]> wrote:
> > Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
> > your distributor already did it for you.
>
> Have you tried sysprof? It's really nice to setup and use compared to
> oprofile when profiling user-space.

yes, it's very nice and very usable - and that's all that matters
really. It's almost a _duty_ of the mainstream kernel to include that
trivial 200 lines of sysprof code, given how poor instrumentation
support is on Linux.

As a comparison, here's a session of a newbie developer, meeting
oprofile for the first time in his life (using a fresh package,
oprofile-0.9.3-6.fc8):

---------------------->
[ Newbie: WTF, no GUI tool? ]
[ Narrator: we lose 90% of the developers at this point. ]
[ Newbie is adventurous and has heard about opcontrol and tries it. ]

# opcontrol

[ Newbie sees tons of output. User scratches head. After looking
around, finds the following option listed:
"-s/--start start data collection". That must be it! ]

# opcontrol -s
No vmlinux file specified. You must specify the correct vmlinux file, e.g.
opcontrol --vmlinux=/path/to/vmlinux
If you do not have a vmlinux file, use
opcontrol --no-vmlinux
Enter opcontrol --help for full options

[ Newbie: WTF? Doesnt oprofile think that what I want to do is to
profile ... the currently running kernel, wherever a kernel is
and whatever a vmlinux might be?? ]

[ Narrator: At this point oprofile has confused about 99% of all
user-space developers who have no freaking idea about what a vmlinux
is. ]

[ Newbie user figures that --no-vmlinux might be the right option: ]

# opcontrol -s --no-vmlinux
Option "--setup" not valid with "-s".

[ Newbie: WTF? what not valid? Why should i care? Damnit, i only want
to profile stuff!!! ]

[ The newbie user eventually finds out that opcontrol help text is
buggy and that -s does not mean --start, but --setup. ]

[ Narrator: we now have lost 99.99% of the first-time users. ]

[ Newbie, armed with this nontrivial piece of information: ]

# opcontrol --start --no-vmlinux

Using default event: CPU_CLK_UNHALTED:100000:0:1:1
Using 2.6+ OProfile kernel interface.
Using log file /var/lib/oprofile/samples/oprofiled.log
Daemon started.
Profiler running.

[ Newbie: wow, it's working! Lets start an infinite loop and lets try
this opreport thing: ]

# opreport

CPU_CLK_UNHALT...|
samples| %|
------------------
160405 82.9309 loop

[ Newbie: hm, i see where the overhead is - but which function is
calling it? ]

[ Newbie user wants to restart profiling and figures that
opcontrol --reset will do that: ]

# opcontrol --reset
Signalling daemon... done

[ GREAT! It even said "done". Now lets see our new profile: ]

# opreport
opreport error: No sample file found: try running opcontrol --dump
or specify a session containing sample files

[ Newbie: WTF???? ]

[ Narrator: we've now lost 99.99999% of the testers at this point. The
reamining 10 kernel developers still using oprofile have written up
all the commands to the back of their keyboards, for easy reference. ]
<----------------------

We should hang our collective heads in shame. Oprofile is an utter joke
in terms of usability. It had 5-10 years to get its stuff together and
didnt.

200 lines of totally isolated sysprof code is the least thing we can do
which we _must do_ to help our users.

Ingo

2008-02-23 12:29:50

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

Hi Ingo,

On Sat, Feb 23, 2008 at 2:22 PM, Ingo Molnar <[email protected]> wrote:
> As a comparison, here's a session of a newbie developer, meeting
> oprofile for the first time in his life (using a fresh package,
> oprofile-0.9.3-6.fc8):
>
> ---------------------->
> [ Newbie: WTF, no GUI tool? ]
> [ Narrator: we lose 90% of the developers at this point. ]

In all fairness, there's a GUI for oprofile too [1] but I don't think
it supports call trees as sysprof does which is IMHO the killer
feature of the latter.

1. http://labs.o-hand.com/oprofileui/

2008-02-23 14:54:59

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
<[email protected]> wrote:
> Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
> your distributor already did it for you.
>
> Sidebar: the code uses the utterly crappy register_timer_hook() which
>
> a) is woefully misnamed and
>
> b) is racy and
>
> c) will disrupt oprofile if it is being used. And vice versa.

I wonder if sysprof should hook to the same interrupt as oprofile then?

On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
<[email protected]> wrote:
> This code adds a new kernel->userspace interface which is not even
> documented in code comments. It appears to use a pollable debugfs file in
> /proc somewhere, carrying an unspecified payload.

[snip]

> This reads a single item even if there were 100 queued, which is quite
> inefficient.
>
> We already have infrastructure for bulk kernel->user transfer in
> kernel/relay.c?

Agreed. This seems like a perfect fit with relayfs.

2008-02-23 15:17:22

by John Levon

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


Obviously I hold no sway here, so there's little point in my continuing
to try and fight this madness, but I have to say my piece. Don't worry,
I'll leave it after this - I know Ingo always gets his way.

On Sat, Feb 23, 2008 at 12:37:24PM +0100, Ingo Molnar wrote:

> and ask him to try oprofile and then to try sysprof as a comparison - to
> and figure in which app and which function the overhead is.

Now work out whether this is a result of the kernel code or not. Whilst
there's probably no point me stating this: I object most strenuously to
the idea of merging duplicate code that does part of what oprofile does.

Soren wrote sysprof when he tried an earlier version of oprofile and
found it slightly non-obvious. Instead of doing any of these things:

- complaining to the oprofile list
- checking a version of oprofile written in the last two years
- fixing oprofile user space to be more obvious

he just went on right ahead and re-implemented it so he could write a
GUI on top of it. That *exact same GUI* could have been on top of
oprofile. Usability problems with opcontrol or opreport could have been
reported (or even - gosh - fixed). Soren was too lazy.

It seems the same applies to you and Arjan, at least.

> It's 200 lines of pretty well isolated code for something that is
> already much more usable to me than 10 years of oprofile. Really, i'd
> much rather take 200 lines of poor kernel code written by a userspace
> developer for stuff that _already works better_, than to have ~2000
> lines of oprofile code and an unusable (to me) user-space tool written
> by kernel developers.

I honestly can't believe that you're telling me you'd rather have more
kernel code than make some tweaks (which, by the way, you have NEVER
complained about to me or to the oprofile mailing list until now)
to a shell script. Specifically:

> [ Newbie: WTF, no GUI tool? ]

Wrong. It's shipped with a (rather poor) GUI pretty much since
inception. More recently, there's been at least two GUIs. For example:

http://labs.o-hand.com/wp-content/uploads/2007/12/oprofileui.png

> # opcontrol
>
> [ Newbie sees tons of output. User scratches head. After looking
> around, finds the following option listed:
> "-s/--start start data collection". That must be it! ]
>
> # opcontrol -s
> No vmlinux file specified. You must specify the correct vmlinux file, e.g.
> opcontrol --vmlinux=/path/to/vmlinux
> If you do not have a vmlinux file, use
> opcontrol --no-vmlinux
> Enter opcontrol --help for full options
>
> [ Newbie: WTF? Doesnt oprofile think that what I want to do is to
> profile ... the currently running kernel, wherever a kernel is
> and whatever a vmlinux might be?? ]

Firstly, the distributions should have set this up automatically. That
they don't is a distributor bug. The sheer madness of Linux not leaving
a vmlinux file in a stable known location is hardly something oprofile
can be blamed for.

Secondly, not ONE PERSON INCLUDING YOU has EVER suggested to us that we
default to no vmlinux.

> [ The newbie user eventually finds out that opcontrol help text is
> buggy and that -s does not mean --start, but --setup. ]

It's astonishing that you would know about this, complaining about this,
but not file a bug report. Same goes for the rest.

And once again, please explain how the buggy shell script has anything
to do with kernel code.

What, exactly, happened to scratching an itch? If everyone should be
hanging his head in shame, Ingo, why haven't you used your considerable
influence to find a maintainer for OProfile to fix the (until now,
private to you) issues you have with it? Shameful? Indeed.

john

2008-02-23 15:50:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* John Levon <[email protected]> wrote:

> > [ The newbie user eventually finds out that opcontrol help text is
> > buggy and that -s does not mean --start, but --setup. ]
>
> It's astonishing that you would know about this, complaining about
> this, but not file a bug report. Same goes for the rest.

i found out about this particular issue just today, when i wrote this
mail, so consider this as my bugreport.

And i have to say, most of the usability deficits in oprofile are very
obvious, so consider this as a general "the oprofile commands suck in
almost every detail" bugreport. Tools should fundamentally be
one-stop-shops. I personally wouldnt mind the lack of a GUI at all if
the command line tool was _obvious to use_. If the principle was: get
the current histogram to the user. A tool should work _hard_ to get
something (_anything_) useful out by default. Transparently start up any
background state and procesing that is needed - and hide it as much as
possible. Try to figure out where the vmlinux is, if there's any. Do
_not_ put the user through any extra chores if not absolutely necessary.
Display information that tells the user what happened. Etc., etc. -
these are all basic principles.

imagine if the "ls" command, instead of listing files, showed 60 lines
of options, and when i picked "-l" it would, instead of listing files
already, ask me: "do you really mean the current filesystem?". And if i
said "list whatever filesystem i wanted" then it would also say "because
dm-crypt is not configured into the kernel I cannot display encrypted
information, use me with --no-crypt".

and as i sit here, i tried "opreport" once again, to just see what it
does by default. It just hung there, displaying nothing. For minutes.
Then i got suspicious and straced it. It loops in stat()s in one huge
directory that has amassed more than 20 thousand sample files in the
last few hours. This is such a basic usecase, tell me that this never
happened to you and that nobody ever came to the idea to display "sorry,
this might take a few minutes, 10% of the files are processed so far"
sort of feel-good messages to the user?

but generally i cannot fix and report and fight over all the crap that
people do - that would mean i'd have to complain about Linux all day,
non-stop. What i can do is to tell apart the better solutions from the
worse solutions when they get submitted to me. And guess what? One
little isolated 200 lines patch in x86.git and the people who sat on
their accomplishments for years are up in arms and oppose it. I must be
doing something right i guess :-/

Ingo

2008-02-23 18:43:11

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, 23 Feb 2008 12:37:24 +0100 Ingo Molnar <[email protected]> wrote:

> * Andrew Morton <[email protected]> wrote:
>
> > > Sysprof needs a 200 line kernel module to do it's work, this module
> > > puts some simple profiling data into debugfs.
> > >
> > > ...
> >
> > Seems a poor idea to me. Sure, oprofile is "hard to set up", but not
> > if your distributor already did it for you.
>
> two things.
>
> Firstly, this isnt an oprofile replacement, this is a pretty separate
> concept. Sysprof is more of a tracer than a profiler.

I don't understand the distinction and I don't see what sysprof (as defined
by its kernel->userspace interface) can do which oprofile cannot.

This is yet another thing which should have been in the damned changlog but
wasn't.

> (and we are
> currently working on merging it into ftrace)

I think you should drop it and we should see a replacement patch which has
all the bugs, inefficiencies and deficiencies addressed and which has a
vaguely respectable description.

> Secondly, real developers who tune user-space code disagree with your
> characterisation of oprofile being easy to use.

afacit all of these criticisms surround oprofile's userspace tools only.

2008-02-23 18:47:58

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, 23 Feb 2008 13:51:34 +0200 "Pekka Enberg" <[email protected]> wrote:

> On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
> <[email protected]> wrote:
> > Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
> > your distributor already did it for you.
>
> Have you tried sysprof? It's really nice to setup and use compared to
> oprofile when profiling user-space.

Wanna see how I use oprofile?

box:/home/akpm> akpm-oprofile gcc t.c
Daemon not running
Daemon not running
Using 2.6+ OProfile kernel interface.
Reading module info.
Using log file /var/lib/oprofile/oprofiled.log
Daemon started.
Profiler running.
t.c:6:12: error: token ""x"" is not valid in preprocessor expressions

real 0m0.262s
user 0m0.004s
sys 0m0.004s
Stopping profiling.
Stopping profiling.
Killing daemon.
CPU: CPU with timer interrupt, speed 0 MHz (estimated)
Profiling through timer interrupt
samples % symbol name
625 99.2063 mwait_idle
1 0.1587 __handle_mm_fault
1 0.1587 clear_page
1 0.1587 do_page_fault
1 0.1587 flush_tlb_page
1 0.1587 pfn_valid
box:/home/akpm>


One thirteen-character command! Why? Because I actually got off my butt
and wrote a script to hide low-level details. I wrote the thing five years
ago and don't remember anything about what's in it.

I didn't need to write a new kernel module to enable that
thirteen-character shell script, and I don't believe one needs to write a
new kernel module to put a nice easy-to-use GUI around oprofile either.


This is one of those i-cant-believe-im-having-this-discussion discussions.

2008-02-23 19:03:00

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, 23 Feb 2008 16:54:49 +0200 "Pekka Enberg" <[email protected]> wrote:

> On Sat, Feb 23, 2008 at 10:11 AM, Andrew Morton
> <[email protected]> wrote:
> > Seems a poor idea to me. Sure, oprofile is "hard to set up", but not if
> > your distributor already did it for you.
> >
> > Sidebar: the code uses the utterly crappy register_timer_hook() which
> >
> > a) is woefully misnamed and
> >
> > b) is racy and
> >
> > c) will disrupt oprofile if it is being used. And vice versa.
>
> I wonder if sysprof should hook to the same interrupt as oprofile then?

oprofile uses register_timer_hook() for its oh-crap-nothing-else-works
fallback iirc. It's a useful fallback: I used it a few centuries ago on
some el-cheapo VIA CPU-based thing we had at Digeo.

It's unclear to me whether all this stuff works with NO_HZ=y, btw. Didn't
we just lose the regular timer interrupts which these clients depend upon?

2008-02-23 20:34:26

by Soeren Sandmann

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

John Levon <[email protected]> writes:

> Soren wrote sysprof when he tried an earlier version of oprofile and
> found it slightly non-obvious. Instead of doing any of these things:

This is not accurate. Sysprof started by me adding a hierarchical call
view to speedprof, a SIGPROF profiler which was basically a hack in
memprof.

Oprofile did not work on my system at the time (Red Hat 9, I believe),
and the website said that I had to apply a patch to the kernel and
recompile, so I didn't try it.

The hierarchical call view in speedprof worked out so well that I
wrote a simple kernel module to produce system-wide stacktraces, and
fed them into speedprof. Since speedprof was just a hack in memprof, I
wrote a new GUI, and sysprof was born.

It was only later I tried oprofile and found it not only much more
difficult to use, but also much less useful when I did get it to work.


Soren

2008-02-23 20:44:17

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On 23 Feb 2008 21:15:52 +0100 Soeren Sandmann <[email protected]> wrote:

> It was only later I tried oprofile and found it not only much more
> difficult to use, but also much less useful when I did get it to work.

Could we please be very careful to not conflate oprofile's kernel support
with oprofile userspace? For the purposes of this discussion the
distinction is most important.

I _assume_ you're referring to oprofile userspace which I agree is
user-hostile. It is a rich area for others to improve upon. In so doing
they might even have a need to change oprofile's kernel code. But simply
giving up and going off and implementing some parallel kernel code is an
utter last resort and should only be done after it has been shown that the
current kernel code is unfixable.

2008-02-24 02:50:38

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

Hi Andrew,

Andrew Morton wrote:
> I didn't need to write a new kernel module to enable that
> thirteen-character shell script, and I don't believe one needs to write a
> new kernel module to put a nice easy-to-use GUI around oprofile either.
>
> This is one of those i-cant-believe-im-having-this-discussion discussions.

Sysprof tracks the full stack frame so it can provide meaningful call
tree (who called what) which is invaluable for spotting hot _paths_. I
don't see how oprofile can do that as it tracks instruction pointers only.

Pekka

2008-02-24 03:12:43

by Nicholas Miell

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


On Sun, 2008-02-24 at 04:49 +0200, Pekka Enberg wrote:
> Hi Andrew,
>
> Andrew Morton wrote:
> > I didn't need to write a new kernel module to enable that
> > thirteen-character shell script, and I don't believe one needs to write a
> > new kernel module to put a nice easy-to-use GUI around oprofile either.
> >
> > This is one of those i-cant-believe-im-having-this-discussion discussions.
>
> Sysprof tracks the full stack frame so it can provide meaningful call
> tree (who called what) which is invaluable for spotting hot _paths_. I
> don't see how oprofile can do that as it tracks instruction pointers only.
>
> Pekka

You could try passing the --callgraph option to opcontrol.

--
Nicholas Miell <[email protected]>

2008-02-24 13:13:49

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sat, Feb 23, 2008 at 01:53:35PM +0000, John Levon wrote:
> On Sat, Feb 23, 2008 at 12:37:24PM +0100, Ingo Molnar wrote:
>
> > It's 200 lines of pretty well isolated code for something that is
> > already much more usable to me than 10 years of oprofile. Really, i'd
> > much rather take 200 lines of poor kernel code written by a userspace
> > developer for stuff that _already works better_, than to have ~2000
> > lines of oprofile code and an unusable (to me) user-space tool written
> > by kernel developers.

I think it's fair to say that oth oprofile and sysprof can use some
improvements. There are a couple of questions that immediately come
to mind, including the most obvious one, *if* as you John clams, the
oprofile kernel had all of the functionality for the GUI, why wasn't
it used --- could it *perhaps* because the kernel interface for
oprofile wasn't documented well? Heck, even if sysprof is 200 lines
of code versus 2000 lines of kernel code, most people don't write
extra code unless it's because the 2000 lines of pre-existing code
isn't well documented enough.

> Firstly, the distributions should have set this up automatically. That
> they don't is a distributor bug. The sheer madness of Linux not leaving
> a vmlinux file in a stable known location is hardly something oprofile
> can be blamed for.

Wrong Answer. People who write userspace helpers *have* to do the
work of the distro's. It's a bad, bad, bad, Bad, BAD idea to leave it
up to the distributions. It means that some distributions won't get
it right; other distributions will do it in different ways, making it
harder for users to switch between distro's and making it harder for
people to write distribution-neutral HOWTO's.

There are plenty of things that can be done, including using search
paths to try to find vmlinuz; or maybe even proposing a new standard
such as say for example /lib/modules/`uname -r`/vmlinux being a
synlink to the location of vmlinux. We already have
/lib/modules/`uname -r`/build and /lib/modules/`uname -r`/source, for
example.

The abdication of responsibility and the lack of trying to solve the
usability issues is one of the things that really worries me about
*all* of Linux's RAS tools. We can and should do better! And it's
really embarassing that the RAS maintainers seem (I assume you are one
of the oprofile maintainers), seem to be blaming this on the victims,
the people who are complaining about using *your* tool. Yes, it's a
shame that Ingo didn't try to fix your tool; open source, and scratch
your own itch and all of that. To be sure. But at the *same* *time*
don't you have enough pride to take a look at a tools which so
obviously has massive lacks in the usability department, and tried to
fix it years ago? There's more than enough blame to go around twenty
times over, I would think.

- Ted

2008-02-24 13:45:26

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Theodore Tso <[email protected]> wrote:

> The abdication of responsibility and the lack of trying to solve the
> usability issues is one of the things that really worries me about
> *all* of Linux's RAS tools. We can and should do better! And it's
> really embarassing that the RAS maintainers seem (I assume you are one
> of the oprofile maintainers), seem to be blaming this on the victims,
> the people who are complaining about using *your* tool. Yes, it's a
> shame that Ingo didn't try to fix your tool; open source, and scratch
> your own itch and all of that. To be sure. But at the *same* *time*
> don't you have enough pride to take a look at a tools which so
> obviously has massive lacks in the usability department, and tried to
> fix it years ago? There's more than enough blame to go around twenty
> times over, I would think.

i agree with most of your mail but i beg to differ with what you wrote
about my role :-/ The specific bug/issue i discovered with oprofile i
discovered on the very day i wrote about it to lkml.

In any case i'm not the one to fix oprofile - i cannot fix or report all
itches that i have in ~1 billion lines of userspace - i would have to
spend my whole life complaining ;-)

I'm the one to make sure that patches for useful userspace tools that
get submitted to me eventually go upstream, one way or another. Sysprof
has been around for years, had to rely on a (trivial) external module
and AFAIK the feature even predates oprofile's stack-trace support. The
API is butt-ugly and it's being fixed. So if then it should have been
the oprofile folks porting over sysprof to their new API ... claiming
otherwise would rewrite history i think.

a quick glance at oprofile's stack-trace/callgraph support shows it
being rather buggy: it uses __copy_from_user_inatomic() from NMI
context, which is bad because that can fault and re-enable NMIs, causing
stack recursion/corruption. Fixing it is nontrivial. I'm not sure how
much this feature has been tested - but with a sucky userspace kernel
features _do not get tested_ - it's as simple as that! I'd be happier
with sysprof's pure and simple "we only care about time events" initial
approach and evolve this field via actual interest from users.

Ingo

2008-02-24 14:34:11

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Ingo Molnar <[email protected]> wrote:

> I'm the one to make sure that patches for useful userspace tools that
> get submitted to me eventually go upstream, one way or another.

just to make it clear: that "one way or another" very much includes the
possibility that sysprof is modified to make use of the oprofile APIs -
i.e. no kernel patch needed at all.

Ingo

2008-02-24 16:34:18

by John Levon

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sun, Feb 24, 2008 at 08:10:02AM -0500, Theodore Tso wrote:

> > Firstly, the distributions should have set this up automatically. That
> > they don't is a distributor bug. The sheer madness of Linux not leaving
> > a vmlinux file in a stable known location is hardly something oprofile
> > can be blamed for.
>
> Wrong Answer. People who write userspace helpers *have* to do the
> work of the distro's. It's a bad, bad, bad, Bad, BAD idea to leave it
> up to the distributions. It means that some distributions won't get
> it right; other distributions will do it in different ways, making it
> harder for users to switch between distro's and making it harder for
> people to write distribution-neutral HOWTO's.

Hi Ted. I would have loved to have fixed it myself, or had one of the
other oprofile contributors do so. Unfortunately I have no control over
what the distributions do. For example:

https://bugzilla.redhat.com/show_bug.cgi?id=139767

> There are plenty of things that can be done, including using search
> paths to try to find vmlinuz; or maybe even proposing a new standard
> such as say for example /lib/modules/`uname -r`/vmlinux being a

At the time when I was trying to fix this, I wasn't aware of any way to
propose a new standard and get distributions to follow it - is there
some way now? Informally I discussed this problem several times with
many people without any resolution. As regards searching informal paths,
this is extremely risky - get the wrong vmlinux and we end up with
inaccurate results, which is worse than no results.

> The abdication of responsibility and the lack of trying to solve the
> usability issues is one of the things that really worries me about
> *all* of Linux's RAS tools. We can and should do better! And it's
> really embarassing that the RAS maintainers seem (I assume you are one
> of the oprofile maintainers), seem to be blaming this on the victims,
> the people who are complaining about using *your* tool. Yes, it's a

I'm not abdicating responsibility: I no longer maintain oprofile,
haven't for a long time, and the other contributors don't have any real
time to spend on it either. I'm well aware there are improvements to be
made: when I (and others) had time, this stuff was improving massively
with each release. When Soren et al had a need for a simple tool + GUI,
they should have helped out with this: it's simply good engineering to
not duplicate efforts. I didn't even get a /single/ email.

regards,
john

2008-02-24 18:22:23

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sun, Feb 24, 2008 at 04:32:40PM +0000, John Levon wrote:
> > There are plenty of things that can be done, including using search
> > paths to try to find vmlinuz; or maybe even proposing a new standard
> > such as say for example /lib/modules/`uname -r`/vmlinux being a
>
> At the time when I was trying to fix this, I wasn't aware of any way to
> propose a new standard and get distributions to follow it - is there
> some way now? Informally I discussed this problem several times with
> many people without any resolution. As regards searching informal paths,
> this is extremely risky - get the wrong vmlinux and we end up with
> inaccurate results, which is worse than no results.

The way that /lib/modules/`uname -r`/build was standardize was via
mail to LKML, years ago. It was declared so, "make install" for base
kernel Makefiles did that, and the distro's picked it up pretty
quickly thereafter.

In terms of picking the right vmlinux, how about a kernel patch which
stashes the MD5 checksum of vmlinux in a convenient location the
compressed kernel which can be pulled out via querying
/sys/kernel/vmlinux_cksum? If the problem is making sure you have the
right vmlinux, there are some fairly simple ways of assuring this ---
it's just a matter of thinking creatively.

> > The abdication of responsibility and the lack of trying to solve the
> > usability issues is one of the things that really worries me about
> > *all* of Linux's RAS tools. We can and should do better! And it's
> > really embarassing that the RAS maintainers seem (I assume you are one
> > of the oprofile maintainers), seem to be blaming this on the victims,
> > the people who are complaining about using *your* tool. Yes, it's a

Let me make it clear that the problems go far beyond oprofile. I have
similar issues of disquietude about the easy of use of SystemTap,
kdump, and all of the other RAS system tools. It may be the problem
is that the companies who fund the development of the RAS tools are
stopping before they can be made turn-key and easy to use by kernel
developers, as opposed to assuming that the distro's will do all of
the hard work productizing them and actually making them *usuable*.

The problem is that not enough mainline kernel developers use these
tools, mostly because they aren't easy enough for them to use. I
remember complaining about kdump, and I got the same answer, "Oh, it's
the distro's job to make it easy to use." Which is fine, except that
means very few people actually use it (how many kernel developers use
RHEL and SLES as their day-to-day development OS, as opposed to Fedora
or Debian, et. al.?), and since there aren't lots of kernel developers
using it, once the people who are funded to support the RAS tools get
reassigned to other projects, what's left is in a terrible shape to be
used by mainline kernel developers, and then the tools effectively
become unused and then unmaintained.....

- Ted

2008-02-26 04:18:27

by Anton Blanchard

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


> > From: Soren Sandmann <[email protected]>
> > Subject: [PATCH] x86: add the debugfs interface for the sysprof tool
> >
> > The sysprof tool is a very easy to use GUI tool to find out where
> > userspace is spending CPU time. See
> > http://www.daimi.au.dk/~sandmann/sysprof/ for more information and
> > screenshots on this tool.
> >
> > Sysprof needs a 200 line kernel module to do it's work, this module
> > puts some simple profiling data into debugfs.
>
> thanks, looks good to me - applied.

Woah slow down guys. Did I miss the review?

Yes it's a 200 line patch, but it's a 200 line x86 patch. Surely we
should apply some of the same rigour we did when we merged the oprofile
patch? Is it biarch safe? Will it run on powerpc, arm etc?

I'm still struggling to understand why we need this functionality at
all. Lets argue the ABI and not cloud it with a discussion about
userspace eye candy. What does this give you that is an improvement over
the oprofile kernel-user ABI?

Anton

2008-02-26 05:05:35

by Anton Blanchard

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


Hi Peter,

> Usable for me is a cli interface. Also, I absolutely love opannotate.

I definitely agree there.

It's interesting to note that sysprof requires you to run the GUI as
root in order to work. Maybe Ingo and Arjan are confident there are no
bugs in all the libraries that sysprof links to:

# ldd `which sysprof` | wc -l
39

I'm not.

Actually before someone converted it to debugfs, it was even worse, the
sysprof kernel module exported all profiling information to the world:

-r--r--r-- 1 root root 2060 2008-02-25 23:00 /proc/sysprof-trace

Anton

2008-02-26 05:15:52

by Anton Blanchard

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


> It was only later I tried oprofile and found it not only much more
> difficult to use, but also much less useful when I did get it to work.

This surprises me. Can you please elaborate on why oprofile is "much
less useful" than sysprof?

Anton - who has used oprofile to analyse and tune databases, JVMs,
compilers and operating systems. Maybe I've been missing out on
the killer app for all this time!!!

2008-02-26 06:27:42

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Sun, Feb 24, 2008 at 5:12 AM, Nicholas Miell <[email protected]> wrote:
> > Sysprof tracks the full stack frame so it can provide meaningful call
> > tree (who called what) which is invaluable for spotting hot _paths_. I
> > don't see how oprofile can do that as it tracks instruction pointers only.
>
> You could try passing the --callgraph option to opcontrol.

Hmm, perhaps I am missing something but I don't think that does what
sysprof does. At least I can't find where in the oprofile kernel code
does it save the full stack trace for user-space. John?

Pekka

2008-02-26 06:48:25

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

Hi,

On Tue, Feb 26, 2008 at 8:27 AM, Pekka Enberg <[email protected]> wrote:
> > You could try passing the --callgraph option to opcontrol.
>
> Hmm, perhaps I am missing something but I don't think that does what
> sysprof does. At least I can't find where in the oprofile kernel code
> does it save the full stack trace for user-space. John?

Ok, so as pointed out by Nicholas/Andrew, oprofile does indeed do
exactly what sysprof does (see
arch/x86/oprofile/backtrace.c::backtrace_address, for example). So,
Soeren, any other reason we can't use the oprofile kernel module for
sysprof?

2008-02-26 08:56:30

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Pekka Enberg <[email protected]> wrote:

> Hi,
>
> On Tue, Feb 26, 2008 at 8:27 AM, Pekka Enberg <[email protected]> wrote:
> > > You could try passing the --callgraph option to opcontrol.
> >
> > Hmm, perhaps I am missing something but I don't think that does what
> > sysprof does. At least I can't find where in the oprofile kernel code
> > does it save the full stack trace for user-space. John?
>
> Ok, so as pointed out by Nicholas/Andrew, oprofile does indeed do
> exactly what sysprof does (see
> arch/x86/oprofile/backtrace.c::backtrace_address, for example). So,
> Soeren, any other reason we can't use the oprofile kernel module for
> sysprof?

as i pointed it out earlier in the thread, the oprofile implementation
seems buggy because when an event comes from NMI context
__copy_from_user_inatomic() can fault and re-enable NMIs - causing
possible stack recursion/corruption. Does not look like an easy fix.

Ingo

2008-02-26 08:58:24

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Anton Blanchard <[email protected]> wrote:

> > thanks, looks good to me - applied.
>
> Woah slow down guys. Did I miss the review?

note that it was applied to x86.git#testing. It's as if Andrew applied
something to -mm. This is not a guarantee of upstream merging (at all).

Ingo

2008-02-26 09:03:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


* Anton Blanchard <[email protected]> wrote:

> This surprises me. Can you please elaborate on why oprofile is "much
> less useful" than sysprof?

see the thread you are replying to.

> Anton - who has used oprofile to analyse and tune databases, JVMs,
> compilers and operating systems. Maybe I've been missing out on
> the killer app for all this time!!!

it's OK if you use it full time and if you are amongst the 0.001% of the
developer population we call "the best kernel hackers on the planet" ;-)

It sucks badly if you use it occasionally and have to re-learn its
non-trivial usage and its quirks every time. As it happens, most
userspace developers are in this second category.

(and i'm not worried about the first category at all - if needed they
can write their own OS from scratch ;-)

Ingo

2008-02-26 17:32:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool

On Tue, 26 Feb 2008 10:02:23 +0100 Ingo Molnar <[email protected]> wrote:

> > Anton - who has used oprofile to analyse and tune databases, JVMs,
> > compilers and operating systems. Maybe I've been missing out on
> > the killer app for all this time!!!
>
> it's OK if you use it full time and if you are amongst the 0.001% of the
> developer population we call "the best kernel hackers on the planet" ;-)
>
> It sucks badly if you use it occasionally and have to re-learn its
> non-trivial usage and its quirks every time. As it happens, most
> userspace developers are in this second category.

Nobody has in any way demonstrated that this is due to the design or
implementation of oprofile's kernel component.

2008-02-27 14:09:45

by Anton Blanchard

[permalink] [raw]
Subject: Re: [PATCH] x86: add the debugfs interface for the sysprof tool


Hi Ingo,

> > Anton - who has used oprofile to analyse and tune databases, JVMs,
> > compilers and operating systems. Maybe I've been missing out on
> > the killer app for all this time!!!
>
> it's OK if you use it full time and if you are amongst the 0.001% of the
> developer population we call "the best kernel hackers on the planet" ;-)

I dream of being a card carrying member of the club but apparently
owning the t-shirt doesn't count :)

> It sucks badly if you use it occasionally and have to re-learn its
> non-trivial usage and its quirks every time. As it happens, most
> userspace developers are in this second category.
>
> (and i'm not worried about the first category at all - if needed they
> can write their own OS from scratch ;-)

Or their own profiling infrastructure!

OK I'm done being a pain, time to be constructive. It's becoming clear
we have some work to do in order to make oprofile easier to use. I've
been involved with the project from the early days, and it's hard to be
objective when it comes to usability issues.

Off the top of my head, there are a number of reasons I think it makes
sense to use the existing oprofile kernel code instead of adding the
sysprof kernel module:

- Wide architecture support. A quick look shows 9 architectures with
oprofile kernel module code.

- biarch support. You can mix 32 or 64bit userspace and the same oprofile
userspace works with 32 or 64bit kernels. I'm not sure where sysprof
sits wrt this.

- Java and Mono support. Oprofile has work underway to communicate with
a running JVM and produce symbolic debugging information:

http://primates.ximian.com/~massi/blog/archive/2007/Nov-19.html

- Robust sample to binary mapping. It appears that sysprof has a race
where processes can exit before the user process has a chance to do
the PID to binary mapping.

- Support for the full set of performance monitor events. A 100 or 250Hz
timer is useful for simple profiling, but eventually you will want
either faster sampling or different event sampling (eg cache misses).

Oprofile now has a mode to output XML and it shouldn't take much effort
for sysprof to parse this instead of the binary debugfs file.

Anton