2009-01-27 20:08:19

by Kok, Auke

[permalink] [raw]
Subject: [PATCH] tracer for sys_open() - sreadahead


This tracer monitors regular file open() syscalls. This is a fast
and low-overhead alternative to strace, and does not allow or
require to be attached to every process.

The tracer only logs succesfull calls, as those are the only ones we
are currently interested in, and we can determine the absolute path
of these files as we log.

Signed-off-by: Auke Kok <[email protected]>


diff --git a/fs/open.c b/fs/open.c
index a3a78ce..8cf2a6b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,10 @@
#include <linux/audit.h>
#include <linux/falloc.h>

+#include <trace/fs.h>
+
+DEFINE_TRACE(do_sys_open);
+
int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
int retval = -ENODEV;
@@ -1040,6 +1044,7 @@ long do_sys_open(int dfd, const char __user *filename, int
flags, int mode)
fsnotify_open(f->f_path.dentry);
fd_install(fd, f);
}
+ trace_do_sys_open(f, flags, mode, fd);
}
putname(tmp);
}
diff --git a/include/trace/fs.h b/include/trace/fs.h
new file mode 100644
index 0000000..870eec2
--- /dev/null
+++ b/include/trace/fs.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_FS_H
+#define _TRACE_FS_H
+
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(do_sys_open,
+ TPPROTO(struct file *filp, int flags, int mode, long fd),
+ TPARGS(filp, flags, mode, fd));
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6..0400815 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -149,6 +149,15 @@ config CONTEXT_SWITCH_TRACER
This tracer gets called from the context switch and records
all switching of tasks.

+config OPEN_CLOSE_TRACER
+ bool "Trace open() calls"
+ depends on DEBUG_KERNEL
+ select TRACING
+ select MARKERS
+ help
+ This tracer records open() syscalls. These calls are made when
+ files are accessed on disk.
+
config BOOT_TRACER
bool "Trace boot initcalls"
depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a9..25cec6c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_OPEN_CLOSE_TRACER) += trace_open_close.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381..24c17d2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,6 +30,7 @@ enum trace_type {
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
TRACE_POWER,
+ TRACE_OPEN,

__TRACE_LAST_TYPE
};
diff --git a/kernel/trace/trace_open_close.c b/kernel/trace/trace_open_close.c
new file mode 100644
index 0000000..4250efc
--- /dev/null
+++ b/kernel/trace/trace_open_close.c
@@ -0,0 +1,148 @@
+/*
+ * trace open calls
+ * Copyright (C) 2009 Intel Corporation
+ *
+ * Based extensively on trace_sched_switch.c
+ * Copyright (C) 2007 Steven Rostedt <[email protected]>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <trace/fs.h>
+
+#include "trace.h"
+
+
+static struct trace_array *ctx_trace;
+static int __read_mostly open_trace_enabled;
+static atomic_t open_ref;
+
+static void probe_do_sys_open(struct file *filp, int flags, int mode, long fd)
+{
+ char *buf;
+ char *fname;
+
+ if (!atomic_read(&open_ref))
+ return;
+
+ if (!open_trace_enabled)
+ return;
+
+ buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return;
+ fname = d_path(&filp->f_path, buf, PAGE_SIZE);
+
+ if (IS_ERR(fname))
+ goto out;
+
+ ftrace_printk("%s: open(\"%s\", %d, %d) = %ld\n",
+ current->comm, fname, flags, mode, fd);
+out:
+ kfree(buf);
+}
+
+static void open_trace_reset(struct trace_array *tr)
+{
+ tr->time_start = ftrace_now(tr->cpu);
+ tracing_reset_online_cpus(tr);
+}
+
+static int open_trace_register(void)
+{
+ int ret;
+
+ ret = register_trace_do_sys_open(probe_do_sys_open);
+ if (ret) {
+ pr_info("open trace: Could not activate tracepoint"
+ " probe to do_open\n");
+ }
+
+ return ret;
+}
+
+static void open_trace_unregister(void)
+{
+ unregister_trace_do_sys_open(probe_do_sys_open);
+}
+
+static void open_trace_start(void)
+{
+ long ref;
+
+ ref = atomic_inc_return(&open_ref);
+ if (ref == 1)
+ open_trace_register();
+}
+
+static void open_trace_stop(void)
+{
+ long ref;
+
+ ref = atomic_dec_and_test(&open_ref);
+ if (ref)
+ open_trace_unregister();
+}
+
+void open_trace_start_cmdline_record(void)
+{
+ open_trace_start();
+}
+
+void open_trace_stop_cmdline_record(void)
+{
+ open_trace_stop();
+}
+
+static void open_start_trace(struct trace_array *tr)
+{
+ open_trace_reset(tr);
+ open_trace_start_cmdline_record();
+ open_trace_enabled = 1;
+}
+
+static void open_stop_trace(struct trace_array *tr)
+{
+ open_trace_enabled = 0;
+ open_trace_stop_cmdline_record();
+}
+
+static int open_trace_init(struct trace_array *tr)
+{
+ ctx_trace = tr;
+
+ open_start_trace(tr);
+ return 0;
+}
+
+static void reset_open_trace(struct trace_array *tr)
+{
+ open_stop_trace(tr);
+}
+
+static struct tracer open_trace __read_mostly =
+{
+ .name = "open",
+ .init = open_trace_init,
+ .reset = reset_open_trace,
+};
+
+__init static int init_open_trace(void)
+{
+ int ret = 0;
+
+ if (atomic_read(&open_ref))
+ ret = open_trace_register();
+ if (ret) {
+ pr_info("error registering open trace\n");
+ return ret;
+ }
+ return register_tracer(&open_trace);
+}
+device_initcall(init_open_trace);
+


2009-01-27 20:52:07

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Em Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke escreveu:
> This tracer monitors regular file open() syscalls. This is a fast
> and low-overhead alternative to strace, and does not allow or
> require to be attached to every process.
>
> The tracer only logs succesfull calls, as those are the only ones we
> are currently interested in, and we can determine the absolute path
> of these files as we log.
>
> Signed-off-by: Auke Kok <[email protected]>

<SNIP>

> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 4d3d381..24c17d2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -30,6 +30,7 @@ enum trace_type {
> TRACE_USER_STACK,
> TRACE_HW_BRANCHES,
> TRACE_POWER,
> + TRACE_OPEN,

Why not a TRACE_VFS or TRACE_SYSCALL and then multiplex there open,
close, etc? trace_assign_type will get humongous in no time this way.

TRACE_BLK does multiplexing, as does TRACE_PROCESS that Frank just
posted too, and I'm working on a plugin for the patch that Neil
submitted for the socket layer that also will need an entry there for
its tracepoints.

Frederic, it seems that discussion about subtypes has to continue :-)

- Arnaldo

2009-01-27 21:15:16

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

On Tue, Jan 27, 2009 at 06:51:37PM -0200, Arnaldo Carvalho de Melo wrote:
> Em Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke escreveu:
> > This tracer monitors regular file open() syscalls. This is a fast
> > and low-overhead alternative to strace, and does not allow or
> > require to be attached to every process.
> >
> > The tracer only logs succesfull calls, as those are the only ones we
> > are currently interested in, and we can determine the absolute path
> > of these files as we log.
> >
> > Signed-off-by: Auke Kok <[email protected]>
>
> <SNIP>
>
> > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> > index 4d3d381..24c17d2 100644
> > --- a/kernel/trace/trace.h
> > +++ b/kernel/trace/trace.h
> > @@ -30,6 +30,7 @@ enum trace_type {
> > TRACE_USER_STACK,
> > TRACE_HW_BRANCHES,
> > TRACE_POWER,
> > + TRACE_OPEN,
>
> Why not a TRACE_VFS or TRACE_SYSCALL and then multiplex there open,
> close, etc? trace_assign_type will get humongous in no time this way.


That's what I was about to answer too.
That would be sad to find one tracer for open, one for read, one for write...


> TRACE_BLK does multiplexing, as does TRACE_PROCESS that Frank just
> posted too, and I'm working on a plugin for the patch that Neil
> submitted for the socket layer that also will need an entry there for
> its tracepoints.
>
> Frederic, it seems that discussion about subtypes has to continue :-)


Indeed, that becomes serious :-)

2009-01-27 22:43:39

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
>
> This tracer monitors regular file open() syscalls. This is a fast
> and low-overhead alternative to strace, and does not allow or
> require to be attached to every process.
>
> The tracer only logs succesfull calls, as those are the only ones we
> are currently interested in, and we can determine the absolute path
> of these files as we log.
>
> Signed-off-by: Auke Kok <[email protected]>


Hi Auke,

Speaking about a global syscall tracer, I made a patch to trace only the syscalls
with the function-graph-tracer.

http://lkml.org/lkml/2008/12/30/267

Its approach and purpose is different than a tracer dedicated only to syscalls.
The function graph tracer traces execution graph of the functions and is more about
execution time spent and code flow whereas a syscall tracer can provide more specific
informations about syscalls.

So both are not overlaping.

But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
a ptrace hook when set.
This low-level part can easily be used by all tracers that would like to inspect syscalls.

Just a change is needed: Steven requested that the part inside syscall_trace_enter become
a tracepoint, making it totally shareable between tracers and easy to turn on and off.

And perhaps the parts that set/clear the flag on all tasks can be shared too.

So we can start with this low-level syscall tracing facility. If you want, I can adapt
this low-level part and submit a patch this week or the next one to give you this base
infrastructure.


Once we have it, I think a syscall tracer can be fed with new syscalls events through
several patch iterations, starting with the open and close one :-)

Are you ok with that?

Steven, Ingo, do you agree?


>
> diff --git a/fs/open.c b/fs/open.c
> index a3a78ce..8cf2a6b 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -30,6 +30,10 @@
> #include <linux/audit.h>
> #include <linux/falloc.h>
>
> +#include <trace/fs.h>
> +
> +DEFINE_TRACE(do_sys_open);
> +
> int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
> {
> int retval = -ENODEV;
> @@ -1040,6 +1044,7 @@ long do_sys_open(int dfd, const char __user *filename, int
> flags, int mode)
> fsnotify_open(f->f_path.dentry);
> fd_install(fd, f);
> }
> + trace_do_sys_open(f, flags, mode, fd);
> }
> putname(tmp);
> }
> diff --git a/include/trace/fs.h b/include/trace/fs.h
> new file mode 100644
> index 0000000..870eec2
> --- /dev/null
> +++ b/include/trace/fs.h
> @@ -0,0 +1,11 @@
> +#ifndef _TRACE_FS_H
> +#define _TRACE_FS_H
> +
> +#include <linux/fs.h>
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(do_sys_open,
> + TPPROTO(struct file *filp, int flags, int mode, long fd),
> + TPARGS(filp, flags, mode, fd));
> +
> +#endif
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index e2a4ff6..0400815 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -149,6 +149,15 @@ config CONTEXT_SWITCH_TRACER
> This tracer gets called from the context switch and records
> all switching of tasks.
>
> +config OPEN_CLOSE_TRACER
> + bool "Trace open() calls"
> + depends on DEBUG_KERNEL
> + select TRACING
> + select MARKERS
> + help
> + This tracer records open() syscalls. These calls are made when
> + files are accessed on disk.
> +
> config BOOT_TRACER
> bool "Trace boot initcalls"
> depends on DEBUG_KERNEL
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 349d5a9..25cec6c 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
>
> obj-$(CONFIG_TRACING) += trace.o
> obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
> +obj-$(CONFIG_OPEN_CLOSE_TRACER) += trace_open_close.o
> obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
> obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
> obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 4d3d381..24c17d2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -30,6 +30,7 @@ enum trace_type {
> TRACE_USER_STACK,
> TRACE_HW_BRANCHES,
> TRACE_POWER,
> + TRACE_OPEN,
>
> __TRACE_LAST_TYPE
> };
> diff --git a/kernel/trace/trace_open_close.c b/kernel/trace/trace_open_close.c
> new file mode 100644
> index 0000000..4250efc
> --- /dev/null
> +++ b/kernel/trace/trace_open_close.c
> @@ -0,0 +1,148 @@
> +/*
> + * trace open calls
> + * Copyright (C) 2009 Intel Corporation
> + *
> + * Based extensively on trace_sched_switch.c
> + * Copyright (C) 2007 Steven Rostedt <[email protected]>
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/kallsyms.h>
> +#include <linux/uaccess.h>
> +#include <linux/ftrace.h>
> +#include <trace/fs.h>
> +
> +#include "trace.h"
> +
> +
> +static struct trace_array *ctx_trace;
> +static int __read_mostly open_trace_enabled;
> +static atomic_t open_ref;
> +
> +static void probe_do_sys_open(struct file *filp, int flags, int mode, long fd)
> +{
> + char *buf;
> + char *fname;
> +
> + if (!atomic_read(&open_ref))
> + return;
> +
> + if (!open_trace_enabled)
> + return;
> +
> + buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
> + if (!buf)
> + return;
> + fname = d_path(&filp->f_path, buf, PAGE_SIZE);
> +
> + if (IS_ERR(fname))
> + goto out;
> +
> + ftrace_printk("%s: open(\"%s\", %d, %d) = %ld\n",
> + current->comm, fname, flags, mode, fd);
> +out:
> + kfree(buf);
> +}
> +
> +static void open_trace_reset(struct trace_array *tr)
> +{
> + tr->time_start = ftrace_now(tr->cpu);
> + tracing_reset_online_cpus(tr);
> +}
> +
> +static int open_trace_register(void)
> +{
> + int ret;
> +
> + ret = register_trace_do_sys_open(probe_do_sys_open);
> + if (ret) {
> + pr_info("open trace: Could not activate tracepoint"
> + " probe to do_open\n");
> + }
> +
> + return ret;
> +}
> +
> +static void open_trace_unregister(void)
> +{
> + unregister_trace_do_sys_open(probe_do_sys_open);
> +}
> +
> +static void open_trace_start(void)
> +{
> + long ref;
> +
> + ref = atomic_inc_return(&open_ref);
> + if (ref == 1)
> + open_trace_register();
> +}
> +
> +static void open_trace_stop(void)
> +{
> + long ref;
> +
> + ref = atomic_dec_and_test(&open_ref);
> + if (ref)
> + open_trace_unregister();
> +}
> +
> +void open_trace_start_cmdline_record(void)
> +{
> + open_trace_start();
> +}
> +
> +void open_trace_stop_cmdline_record(void)
> +{
> + open_trace_stop();
> +}
> +
> +static void open_start_trace(struct trace_array *tr)
> +{
> + open_trace_reset(tr);
> + open_trace_start_cmdline_record();
> + open_trace_enabled = 1;
> +}
> +
> +static void open_stop_trace(struct trace_array *tr)
> +{
> + open_trace_enabled = 0;
> + open_trace_stop_cmdline_record();
> +}
> +
> +static int open_trace_init(struct trace_array *tr)
> +{
> + ctx_trace = tr;
> +
> + open_start_trace(tr);
> + return 0;
> +}
> +
> +static void reset_open_trace(struct trace_array *tr)
> +{
> + open_stop_trace(tr);
> +}
> +
> +static struct tracer open_trace __read_mostly =
> +{
> + .name = "open",
> + .init = open_trace_init,
> + .reset = reset_open_trace,
> +};
> +
> +__init static int init_open_trace(void)
> +{
> + int ret = 0;
> +
> + if (atomic_read(&open_ref))
> + ret = open_trace_register();
> + if (ret) {
> + pr_info("error registering open trace\n");
> + return ret;
> + }
> + return register_tracer(&open_trace);
> +}
> +device_initcall(init_open_trace);
> +
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2009-01-27 22:51:09

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

On Tue, Jan 27, 2009 at 11:43:03PM +0100, Frederic Weisbecker wrote:
> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
> >
> > This tracer monitors regular file open() syscalls. This is a fast
> > and low-overhead alternative to strace, and does not allow or
> > require to be attached to every process.
> >
> > The tracer only logs succesfull calls, as those are the only ones we
> > are currently interested in, and we can determine the absolute path
> > of these files as we log.
> >
> > Signed-off-by: Auke Kok <[email protected]>
>
>
> Hi Auke,
>
> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> with the function-graph-tracer.
>
> http://lkml.org/lkml/2008/12/30/267
>
> Its approach and purpose is different than a tracer dedicated only to syscalls.
> The function graph tracer traces execution graph of the functions and is more about
> execution time spent and code flow whereas a syscall tracer can provide more specific
> informations about syscalls.
>
> So both are not overlaping.
>
> But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers

s/_TIF_SYSCALL_TRACE/_TIF_SYSCALL_FTRACE

_TIF_SYSCALL_TRACE is the one used by ptrace.


> a ptrace hook when set.
> This low-level part can easily be used by all tracers that would like to inspect syscalls.
>
> Just a change is needed: Steven requested that the part inside syscall_trace_enter become
> a tracepoint, making it totally shareable between tracers and easy to turn on and off.
>
> And perhaps the parts that set/clear the flag on all tasks can be shared too.
>
> So we can start with this low-level syscall tracing facility. If you want, I can adapt
> this low-level part and submit a patch this week or the next one to give you this base
> infrastructure.
>
>
> Once we have it, I think a syscall tracer can be fed with new syscalls events through
> several patch iterations, starting with the open and close one :-)
>
> Are you ok with that?
>
> Steven, Ingo, do you agree?
>
>
> >
> > diff --git a/fs/open.c b/fs/open.c
> > index a3a78ce..8cf2a6b 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -30,6 +30,10 @@
> > #include <linux/audit.h>
> > #include <linux/falloc.h>
> >
> > +#include <trace/fs.h>
> > +
> > +DEFINE_TRACE(do_sys_open);
> > +
> > int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
> > {
> > int retval = -ENODEV;
> > @@ -1040,6 +1044,7 @@ long do_sys_open(int dfd, const char __user *filename, int
> > flags, int mode)
> > fsnotify_open(f->f_path.dentry);
> > fd_install(fd, f);
> > }
> > + trace_do_sys_open(f, flags, mode, fd);
> > }
> > putname(tmp);
> > }
> > diff --git a/include/trace/fs.h b/include/trace/fs.h
> > new file mode 100644
> > index 0000000..870eec2
> > --- /dev/null
> > +++ b/include/trace/fs.h
> > @@ -0,0 +1,11 @@
> > +#ifndef _TRACE_FS_H
> > +#define _TRACE_FS_H
> > +
> > +#include <linux/fs.h>
> > +#include <linux/tracepoint.h>
> > +
> > +DECLARE_TRACE(do_sys_open,
> > + TPPROTO(struct file *filp, int flags, int mode, long fd),
> > + TPARGS(filp, flags, mode, fd));
> > +
> > +#endif
> > diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> > index e2a4ff6..0400815 100644
> > --- a/kernel/trace/Kconfig
> > +++ b/kernel/trace/Kconfig
> > @@ -149,6 +149,15 @@ config CONTEXT_SWITCH_TRACER
> > This tracer gets called from the context switch and records
> > all switching of tasks.
> >
> > +config OPEN_CLOSE_TRACER
> > + bool "Trace open() calls"
> > + depends on DEBUG_KERNEL
> > + select TRACING
> > + select MARKERS
> > + help
> > + This tracer records open() syscalls. These calls are made when
> > + files are accessed on disk.
> > +
> > config BOOT_TRACER
> > bool "Trace boot initcalls"
> > depends on DEBUG_KERNEL
> > diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> > index 349d5a9..25cec6c 100644
> > --- a/kernel/trace/Makefile
> > +++ b/kernel/trace/Makefile
> > @@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
> >
> > obj-$(CONFIG_TRACING) += trace.o
> > obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
> > +obj-$(CONFIG_OPEN_CLOSE_TRACER) += trace_open_close.o
> > obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
> > obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
> > obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
> > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> > index 4d3d381..24c17d2 100644
> > --- a/kernel/trace/trace.h
> > +++ b/kernel/trace/trace.h
> > @@ -30,6 +30,7 @@ enum trace_type {
> > TRACE_USER_STACK,
> > TRACE_HW_BRANCHES,
> > TRACE_POWER,
> > + TRACE_OPEN,
> >
> > __TRACE_LAST_TYPE
> > };
> > diff --git a/kernel/trace/trace_open_close.c b/kernel/trace/trace_open_close.c
> > new file mode 100644
> > index 0000000..4250efc
> > --- /dev/null
> > +++ b/kernel/trace/trace_open_close.c
> > @@ -0,0 +1,148 @@
> > +/*
> > + * trace open calls
> > + * Copyright (C) 2009 Intel Corporation
> > + *
> > + * Based extensively on trace_sched_switch.c
> > + * Copyright (C) 2007 Steven Rostedt <[email protected]>
> > + *
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/fs.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/kallsyms.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/ftrace.h>
> > +#include <trace/fs.h>
> > +
> > +#include "trace.h"
> > +
> > +
> > +static struct trace_array *ctx_trace;
> > +static int __read_mostly open_trace_enabled;
> > +static atomic_t open_ref;
> > +
> > +static void probe_do_sys_open(struct file *filp, int flags, int mode, long fd)
> > +{
> > + char *buf;
> > + char *fname;
> > +
> > + if (!atomic_read(&open_ref))
> > + return;
> > +
> > + if (!open_trace_enabled)
> > + return;
> > +
> > + buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > + if (!buf)
> > + return;
> > + fname = d_path(&filp->f_path, buf, PAGE_SIZE);
> > +
> > + if (IS_ERR(fname))
> > + goto out;
> > +
> > + ftrace_printk("%s: open(\"%s\", %d, %d) = %ld\n",
> > + current->comm, fname, flags, mode, fd);
> > +out:
> > + kfree(buf);
> > +}
> > +
> > +static void open_trace_reset(struct trace_array *tr)
> > +{
> > + tr->time_start = ftrace_now(tr->cpu);
> > + tracing_reset_online_cpus(tr);
> > +}
> > +
> > +static int open_trace_register(void)
> > +{
> > + int ret;
> > +
> > + ret = register_trace_do_sys_open(probe_do_sys_open);
> > + if (ret) {
> > + pr_info("open trace: Could not activate tracepoint"
> > + " probe to do_open\n");
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static void open_trace_unregister(void)
> > +{
> > + unregister_trace_do_sys_open(probe_do_sys_open);
> > +}
> > +
> > +static void open_trace_start(void)
> > +{
> > + long ref;
> > +
> > + ref = atomic_inc_return(&open_ref);
> > + if (ref == 1)
> > + open_trace_register();
> > +}
> > +
> > +static void open_trace_stop(void)
> > +{
> > + long ref;
> > +
> > + ref = atomic_dec_and_test(&open_ref);
> > + if (ref)
> > + open_trace_unregister();
> > +}
> > +
> > +void open_trace_start_cmdline_record(void)
> > +{
> > + open_trace_start();
> > +}
> > +
> > +void open_trace_stop_cmdline_record(void)
> > +{
> > + open_trace_stop();
> > +}
> > +
> > +static void open_start_trace(struct trace_array *tr)
> > +{
> > + open_trace_reset(tr);
> > + open_trace_start_cmdline_record();
> > + open_trace_enabled = 1;
> > +}
> > +
> > +static void open_stop_trace(struct trace_array *tr)
> > +{
> > + open_trace_enabled = 0;
> > + open_trace_stop_cmdline_record();
> > +}
> > +
> > +static int open_trace_init(struct trace_array *tr)
> > +{
> > + ctx_trace = tr;
> > +
> > + open_start_trace(tr);
> > + return 0;
> > +}
> > +
> > +static void reset_open_trace(struct trace_array *tr)
> > +{
> > + open_stop_trace(tr);
> > +}
> > +
> > +static struct tracer open_trace __read_mostly =
> > +{
> > + .name = "open",
> > + .init = open_trace_init,
> > + .reset = reset_open_trace,
> > +};
> > +
> > +__init static int init_open_trace(void)
> > +{
> > + int ret = 0;
> > +
> > + if (atomic_read(&open_ref))
> > + ret = open_trace_register();
> > + if (ret) {
> > + pr_info("error registering open trace\n");
> > + return ret;
> > + }
> > + return register_tracer(&open_trace);
> > +}
> > +device_initcall(init_open_trace);
> > +
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/

2009-01-28 00:44:23

by Frank Ch. Eigler

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Frederic Weisbecker <[email protected]> writes:

> [...]
> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> with the function-graph-tracer.
> http://lkml.org/lkml/2008/12/30/267 This low-level part can easily
> be used by all tracers that would like to inspect syscalls.
> [...]
> Just a change is needed: Steven requested that the part inside
> syscall_trace_enter become a tracepoint, making it totally shareable
> between tracers and easy to turn on and off.

Alternately, you could just rely on utrace's hooks. They were thought
out more fully with respect to parameter access, manipulation, and
programmatic control befitting even a debugger.


- FChE

Subject: Re: [PATCH] tracer for sys_open() - sreadahead

On Tue, Jan 27, 2009 at 11:43:05PM +0100, Frederic Weisbecker wrote:
> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
> >
> > This tracer monitors regular file open() syscalls. This is a fast
> > and low-overhead alternative to strace, and does not allow or
> > require to be attached to every process.
> >
> > The tracer only logs succesfull calls, as those are the only ones we
> > are currently interested in, and we can determine the absolute path
> > of these files as we log.
> >
> > Signed-off-by: Auke Kok <[email protected]>
>
>
> Hi Auke,
>
> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> with the function-graph-tracer.
>
> http://lkml.org/lkml/2008/12/30/267
>
> Its approach and purpose is different than a tracer dedicated only to syscalls.
> The function graph tracer traces execution graph of the functions and is more about
> execution time spent and code flow whereas a syscall tracer can provide more specific
> informations about syscalls.
>
> So both are not overlaping.
>
> But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
> a ptrace hook when set.
> This low-level part can easily be used by all tracers that would like to inspect syscalls.
>
> Just a change is needed: Steven requested that the part inside syscall_trace_enter become
> a tracepoint, making it totally shareable between tracers and easy to turn on and off.
>
> And perhaps the parts that set/clear the flag on all tasks can be shared too.
>
> So we can start with this low-level syscall tracing facility. If you want, I can adapt
> this low-level part and submit a patch this week or the next one to give you this base
> infrastructure.
>
>
> Once we have it, I think a syscall tracer can be fed with new syscalls events through
> several patch iterations, starting with the open and close one :-)

Here is something I did sometime ago that uses utrace. It is per task
doesn't use ftrace and is just intended as a prototype. It traces both
syscalls and returns.

---

Here is the beginnings of a simple utrace based strace. Right now, one
needs to invoke this program with a 'insmod <modname> tid=<tid>'.

The output looks something like this:
[267352.641112] Attached to 32604 => 0xd8f60090
[267353.981046] 2
[267353.981085] 197 0x1 0xbff8cd84 0x86cff4 0x86d4c0 0x86d4c0 0xbff8cd50a = 0
[267353.981097] 192 0x0 0x1000 0x3 0x22 0xffffffff 0x0 = b7f8d000
[267353.981124] 4 0x1 0xb7f8d000 0xd 0xd 0xb7f8d000 0xbff8cda8 = d
[267353.981174] 4 0x1 0xb7f8d000 0x3 0x3 0xb7f8d000 0xbff8c7dc = 3
[267353.981209] 252 0x0 0x0 0x86e0d0 0x0 0x86b274 0xbff8cee8 =
[267353.981215] Task 32604 exited
[267355.460180] Cannot find PID 32604

I know strace does a pretty print, but this is a quick and dirty
prototype.
---

#include <linux/module.h>
#include <linux/utrace.h>
#include <linux/err.h>
#include <asm/syscall.h>

MODULE_DESCRIPTION("syscall trace");
MODULE_LICENSE("GPL");

static int target_tid;

module_param_named(tid, target_tid, int, 0);

static u32 task_syscall_entry(u32 action, struct utrace_attached_engine *engine,
struct task_struct *task, struct pt_regs *regs)
{
long callno;
unsigned long args[6];

callno = syscall_get_nr(task, regs);
syscall_get_arguments(task, regs, 0, 6, args);

printk(KERN_INFO "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx = ",
callno, args[0], args[1], args[2], args[3], args[4], args[5]);
return UTRACE_RESUME;
}

static u32 task_syscall_exit(u32 action, struct utrace_attached_engine *engine,
struct task_struct *task, struct pt_regs *regs)
{
long retval = syscall_get_return_value(task, regs);
printk("%lx\n", retval);
return UTRACE_RESUME;
}

static u32 task_exit(enum utrace_resume_action action,
struct utrace_attached_engine *engine, struct task_struct *task,
long orig_code, long *code)
{
printk("\n");
printk(KERN_INFO "Task %d exited\n", target_tid);
return UTRACE_DETACH;
}

static const struct utrace_engine_ops syscall_ops =
{
.report_syscall_entry = task_syscall_entry,
.report_syscall_exit = task_syscall_exit,
.report_exit = task_exit,
};

static int __init strace_init(void)
{
struct pid *pid;
int ret = 0;
struct task_struct *target;
struct utrace_attached_engine *engine;

pid = find_get_pid(target_tid);
if (pid == NULL) {
printk(KERN_ERR "Cannot find PID %d\n", target_tid);
ret = -ESRCH;
goto out;
}

engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE, &syscall_ops, 0);
if (IS_ERR(engine)) {
printk(KERN_ERR "utrace_attach_pid: %ld\n", PTR_ERR(engine));
ret = -EINVAL;
goto out;
} else if (engine == NULL) {
printk(KERN_ERR "utrace_attach_pid => NULL\n");
ret = -EINVAL;
goto out;
} else
printk(KERN_INFO "Attached to %d => 0x%p\n",
target_tid, engine);

/*
* If utrace_attach_pid() succeeded above, we are sure the target
* is valid here
*/
target = pid_task(pid, PIDTYPE_PID);
put_pid(pid);

ret = utrace_set_events(target, engine, UTRACE_EVENT_SYSCALL |
UTRACE_EVENT(EXIT));
if (ret)
printk(KERN_ERR "utrace_set_events returned %d\n", ret);

out:
return ret;
}

static void __exit strace_exit(void)
{
int ret = 0;
struct pid *pid;
struct utrace_attached_engine *engine;
struct task_struct *target;

pid = find_get_pid(target_tid);
if (pid == NULL) {
printk(KERN_ERR "Cannot find PID %d\n", target_tid);
return;
}

target = pid_task(pid, PIDTYPE_PID);
put_pid(pid);
engine = utrace_attach_task(target, UTRACE_ATTACH_MATCH_OPS,
&syscall_ops, 0);
if (IS_ERR(engine))
printk(KERN_ERR "Can't find self: %ld\n", PTR_ERR(engine));
else if (engine == NULL)
printk(KERN_ERR "Can't find self: no match\n");
else {
printk(KERN_INFO "Trying detach 0x%p from %d\n",
engine, target_tid);
ret = utrace_control(target, engine, UTRACE_DETACH);
if (ret)
printk(KERN_ERR "utrace_control returned %d\n",
ret);
}
}

module_init(strace_init);
module_exit(strace_exit);

2009-01-28 14:05:56

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/28 Frank Ch. Eigler <[email protected]>:
> Frederic Weisbecker <[email protected]> writes:
>
>> [...]
>> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
>> with the function-graph-tracer.
>> http://lkml.org/lkml/2008/12/30/267 This low-level part can easily
>> be used by all tracers that would like to inspect syscalls.
>> [...]
>> Just a change is needed: Steven requested that the part inside
>> syscall_trace_enter become a tracepoint, making it totally shareable
>> between tracers and easy to turn on and off.
>
> Alternately, you could just rely on utrace's hooks. They were thought
> out more fully with respect to parameter access, manipulation, and
> programmatic control befitting even a debugger.
>
>
> - FChE
>

I don't know much it. But I will soon have some time to look at your
patch which uses ftrace from utrace.
Anyway, are there some plans about utrace to be merged? Unless I
couldn't be able to use
it...

2009-01-28 14:21:38

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/28 Ananth N Mavinakayanahalli <[email protected]>:
> On Tue, Jan 27, 2009 at 11:43:05PM +0100, Frederic Weisbecker wrote:
>> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
>> >
>> > This tracer monitors regular file open() syscalls. This is a fast
>> > and low-overhead alternative to strace, and does not allow or
>> > require to be attached to every process.
>> >
>> > The tracer only logs succesfull calls, as those are the only ones we
>> > are currently interested in, and we can determine the absolute path
>> > of these files as we log.
>> >
>> > Signed-off-by: Auke Kok <[email protected]>
>>
>>
>> Hi Auke,
>>
>> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
>> with the function-graph-tracer.
>>
>> http://lkml.org/lkml/2008/12/30/267
>>
>> Its approach and purpose is different than a tracer dedicated only to syscalls.
>> The function graph tracer traces execution graph of the functions and is more about
>> execution time spent and code flow whereas a syscall tracer can provide more specific
>> informations about syscalls.
>>
>> So both are not overlaping.
>>
>> But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
>> a ptrace hook when set.
>> This low-level part can easily be used by all tracers that would like to inspect syscalls.
>>
>> Just a change is needed: Steven requested that the part inside syscall_trace_enter become
>> a tracepoint, making it totally shareable between tracers and easy to turn on and off.
>>
>> And perhaps the parts that set/clear the flag on all tasks can be shared too.
>>
>> So we can start with this low-level syscall tracing facility. If you want, I can adapt
>> this low-level part and submit a patch this week or the next one to give you this base
>> infrastructure.
>>
>>
>> Once we have it, I think a syscall tracer can be fed with new syscalls events through
>> several patch iterations, starting with the open and close one :-)
>
> Here is something I did sometime ago that uses utrace. It is per task
> doesn't use ftrace and is just intended as a prototype. It traces both
> syscalls and returns.
>
> ---
>
> Here is the beginnings of a simple utrace based strace. Right now, one
> needs to invoke this program with a 'insmod <modname> tid=<tid>'.
>
> The output looks something like this:
> [267352.641112] Attached to 32604 => 0xd8f60090
> [267353.981046] 2
> [267353.981085] 197 0x1 0xbff8cd84 0x86cff4 0x86d4c0 0x86d4c0 0xbff8cd50a = 0
> [267353.981097] 192 0x0 0x1000 0x3 0x22 0xffffffff 0x0 = b7f8d000
> [267353.981124] 4 0x1 0xb7f8d000 0xd 0xd 0xb7f8d000 0xbff8cda8 = d
> [267353.981174] 4 0x1 0xb7f8d000 0x3 0x3 0xb7f8d000 0xbff8c7dc = 3
> [267353.981209] 252 0x0 0x0 0x86e0d0 0x0 0x86b274 0xbff8cee8 =
> [267353.981215] Task 32604 exited
> [267355.460180] Cannot find PID 32604
>
> I know strace does a pretty print, but this is a quick and dirty
> prototype.
> ---
>
> #include <linux/module.h>
> #include <linux/utrace.h>
> #include <linux/err.h>
> #include <asm/syscall.h>
>
> MODULE_DESCRIPTION("syscall trace");
> MODULE_LICENSE("GPL");
>
> static int target_tid;
>
> module_param_named(tid, target_tid, int, 0);
>
> static u32 task_syscall_entry(u32 action, struct utrace_attached_engine *engine,
> struct task_struct *task, struct pt_regs *regs)
> {
> long callno;
> unsigned long args[6];
>
> callno = syscall_get_nr(task, regs);
> syscall_get_arguments(task, regs, 0, 6, args);
>
> printk(KERN_INFO "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx = ",
> callno, args[0], args[1], args[2], args[3], args[4], args[5]);
> return UTRACE_RESUME;
> }
>
> static u32 task_syscall_exit(u32 action, struct utrace_attached_engine *engine,
> struct task_struct *task, struct pt_regs *regs)
> {
> long retval = syscall_get_return_value(task, regs);
> printk("%lx\n", retval);
> return UTRACE_RESUME;
> }
>
> static u32 task_exit(enum utrace_resume_action action,
> struct utrace_attached_engine *engine, struct task_struct *task,
> long orig_code, long *code)
> {
> printk("\n");
> printk(KERN_INFO "Task %d exited\n", target_tid);
> return UTRACE_DETACH;
> }
>
> static const struct utrace_engine_ops syscall_ops =
> {
> .report_syscall_entry = task_syscall_entry,
> .report_syscall_exit = task_syscall_exit,
> .report_exit = task_exit,
> };
>
> static int __init strace_init(void)
> {
> struct pid *pid;
> int ret = 0;
> struct task_struct *target;
> struct utrace_attached_engine *engine;
>
> pid = find_get_pid(target_tid);
> if (pid == NULL) {
> printk(KERN_ERR "Cannot find PID %d\n", target_tid);
> ret = -ESRCH;
> goto out;
> }
>
> engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE, &syscall_ops, 0);
> if (IS_ERR(engine)) {
> printk(KERN_ERR "utrace_attach_pid: %ld\n", PTR_ERR(engine));
> ret = -EINVAL;
> goto out;
> } else if (engine == NULL) {
> printk(KERN_ERR "utrace_attach_pid => NULL\n");
> ret = -EINVAL;
> goto out;
> } else
> printk(KERN_INFO "Attached to %d => 0x%p\n",
> target_tid, engine);
>
> /*
> * If utrace_attach_pid() succeeded above, we are sure the target
> * is valid here
> */
> target = pid_task(pid, PIDTYPE_PID);
> put_pid(pid);
>
> ret = utrace_set_events(target, engine, UTRACE_EVENT_SYSCALL |
> UTRACE_EVENT(EXIT));
> if (ret)
> printk(KERN_ERR "utrace_set_events returned %d\n", ret);
>
> out:
> return ret;
> }
>
> static void __exit strace_exit(void)
> {
> int ret = 0;
> struct pid *pid;
> struct utrace_attached_engine *engine;
> struct task_struct *target;
>
> pid = find_get_pid(target_tid);
> if (pid == NULL) {
> printk(KERN_ERR "Cannot find PID %d\n", target_tid);
> return;
> }
>
> target = pid_task(pid, PIDTYPE_PID);
> put_pid(pid);
> engine = utrace_attach_task(target, UTRACE_ATTACH_MATCH_OPS,
> &syscall_ops, 0);
> if (IS_ERR(engine))
> printk(KERN_ERR "Can't find self: %ld\n", PTR_ERR(engine));
> else if (engine == NULL)
> printk(KERN_ERR "Can't find self: no match\n");
> else {
> printk(KERN_INFO "Trying detach 0x%p from %d\n",
> engine, target_tid);
> ret = utrace_control(target, engine, UTRACE_DETACH);
> if (ret)
> printk(KERN_ERR "utrace_control returned %d\n",
> ret);
> }
> }
>
> module_init(strace_init);
> module_exit(strace_exit);
>


This is a very convenient Api. I would be glad to use it with ftrace.
But, what is it's status currently? Some projects to merge it?

2009-01-28 14:30:00

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Em Wed, Jan 28, 2009 at 02:58:28PM +0100, Fr?d?ric Weisbecker escreveu:
> 2009/1/28 Frank Ch. Eigler <[email protected]>:
> > Frederic Weisbecker <[email protected]> writes:
> >
> >> [...]
> >> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> >> with the function-graph-tracer.
> >> http://lkml.org/lkml/2008/12/30/267 This low-level part can easily
> >> be used by all tracers that would like to inspect syscalls.
> >> [...]
> >> Just a change is needed: Steven requested that the part inside
> >> syscall_trace_enter become a tracepoint, making it totally shareable
> >> between tracers and easy to turn on and off.
> >
> > Alternately, you could just rely on utrace's hooks. They were thought
> > out more fully with respect to parameter access, manipulation, and
> > programmatic control befitting even a debugger.
> >
> >
> > - FChE
> >
>
> I don't know much it. But I will soon have some time to look at your
> patch which uses ftrace from utrace.
> Anyway, are there some plans about utrace to be merged? Unless I
> couldn't be able to use
> it...

Well, one of the reasons for utrace not to be merged, IIRC, was that
there would be no users in-kernel. With Frank's ftrace plugin that is
not true anymore.

- Arnaldo

Subject: Re: [PATCH] tracer for sys_open() - sreadahead

On Wed, Jan 28, 2009 at 03:21:24PM +0100, Fr?d?ric Weisbecker wrote:
> 2009/1/28 Ananth N Mavinakayanahalli <[email protected]>:
> > On Tue, Jan 27, 2009 at 11:43:05PM +0100, Frederic Weisbecker wrote:
> >> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:

...

> > Here is something I did sometime ago that uses utrace. It is per task
> > doesn't use ftrace and is just intended as a prototype. It traces both
> > syscalls and returns.
> >
> > ---
> >
> > Here is the beginnings of a simple utrace based strace. Right now, one
> > needs to invoke this program with a 'insmod <modname> tid=<tid>'.
> >
> > The output looks something like this:
> > [267352.641112] Attached to 32604 => 0xd8f60090
> > [267353.981046] 2
> > [267353.981085] 197 0x1 0xbff8cd84 0x86cff4 0x86d4c0 0x86d4c0 0xbff8cd50a = 0
> > [267353.981097] 192 0x0 0x1000 0x3 0x22 0xffffffff 0x0 = b7f8d000
> > [267353.981124] 4 0x1 0xb7f8d000 0xd 0xd 0xb7f8d000 0xbff8cda8 = d
> > [267353.981174] 4 0x1 0xb7f8d000 0x3 0x3 0xb7f8d000 0xbff8c7dc = 3
> > [267353.981209] 252 0x0 0x0 0x86e0d0 0x0 0x86b274 0xbff8cee8 =
> > [267353.981215] Task 32604 exited
> > [267355.460180] Cannot find PID 32604
> >
> > I know strace does a pretty print, but this is a quick and dirty
> > prototype.
> > ---
> >
> > #include <linux/module.h>
> > #include <linux/utrace.h>
> > #include <linux/err.h>
> > #include <asm/syscall.h>
> >
> > MODULE_DESCRIPTION("syscall trace");
> > MODULE_LICENSE("GPL");
> >
> > static int target_tid;
> >
> > module_param_named(tid, target_tid, int, 0);
> >
> > static u32 task_syscall_entry(u32 action, struct utrace_attached_engine *engine,
> > struct task_struct *task, struct pt_regs *regs)
> > {
> > long callno;
> > unsigned long args[6];
> >
> > callno = syscall_get_nr(task, regs);
> > syscall_get_arguments(task, regs, 0, 6, args);
> >
> > printk(KERN_INFO "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx = ",
> > callno, args[0], args[1], args[2], args[3], args[4], args[5]);
> > return UTRACE_RESUME;
> > }
> >
> > static u32 task_syscall_exit(u32 action, struct utrace_attached_engine *engine,
> > struct task_struct *task, struct pt_regs *regs)
> > {
> > long retval = syscall_get_return_value(task, regs);
> > printk("%lx\n", retval);
> > return UTRACE_RESUME;
> > }
> >
> > static u32 task_exit(enum utrace_resume_action action,
> > struct utrace_attached_engine *engine, struct task_struct *task,
> > long orig_code, long *code)
> > {
> > printk("\n");
> > printk(KERN_INFO "Task %d exited\n", target_tid);
> > return UTRACE_DETACH;
> > }
> >
> > static const struct utrace_engine_ops syscall_ops =
> > {
> > .report_syscall_entry = task_syscall_entry,
> > .report_syscall_exit = task_syscall_exit,
> > .report_exit = task_exit,
> > };
> >
> > static int __init strace_init(void)
> > {
> > struct pid *pid;
> > int ret = 0;
> > struct task_struct *target;
> > struct utrace_attached_engine *engine;
> >
> > pid = find_get_pid(target_tid);
> > if (pid == NULL) {
> > printk(KERN_ERR "Cannot find PID %d\n", target_tid);
> > ret = -ESRCH;
> > goto out;
> > }
> >
> > engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE, &syscall_ops, 0);
> > if (IS_ERR(engine)) {
> > printk(KERN_ERR "utrace_attach_pid: %ld\n", PTR_ERR(engine));
> > ret = -EINVAL;
> > goto out;
> > } else if (engine == NULL) {
> > printk(KERN_ERR "utrace_attach_pid => NULL\n");
> > ret = -EINVAL;
> > goto out;
> > } else
> > printk(KERN_INFO "Attached to %d => 0x%p\n",
> > target_tid, engine);
> >
> > /*
> > * If utrace_attach_pid() succeeded above, we are sure the target
> > * is valid here
> > */
> > target = pid_task(pid, PIDTYPE_PID);
> > put_pid(pid);
> >
> > ret = utrace_set_events(target, engine, UTRACE_EVENT_SYSCALL |
> > UTRACE_EVENT(EXIT));
> > if (ret)
> > printk(KERN_ERR "utrace_set_events returned %d\n", ret);
> >
> > out:
> > return ret;
> > }
> >
> > static void __exit strace_exit(void)
> > {
> > int ret = 0;
> > struct pid *pid;
> > struct utrace_attached_engine *engine;
> > struct task_struct *target;
> >
> > pid = find_get_pid(target_tid);
> > if (pid == NULL) {
> > printk(KERN_ERR "Cannot find PID %d\n", target_tid);
> > return;
> > }
> >
> > target = pid_task(pid, PIDTYPE_PID);
> > put_pid(pid);
> > engine = utrace_attach_task(target, UTRACE_ATTACH_MATCH_OPS,
> > &syscall_ops, 0);
> > if (IS_ERR(engine))
> > printk(KERN_ERR "Can't find self: %ld\n", PTR_ERR(engine));
> > else if (engine == NULL)
> > printk(KERN_ERR "Can't find self: no match\n");
> > else {
> > printk(KERN_INFO "Trying detach 0x%p from %d\n",
> > engine, target_tid);
> > ret = utrace_control(target, engine, UTRACE_DETACH);
> > if (ret)
> > printk(KERN_ERR "utrace_control returned %d\n",
> > ret);
> > }
> > }
> >
> > module_init(strace_init);
> > module_exit(strace_exit);
> >
>
>
> This is a very convenient Api. I would be glad to use it with ftrace.
> But, what is it's status currently? Some projects to merge it?

Its currently available as a git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/frob/linux-2.6-utrace.git

There is work going on to merge it upstream. Roland McGrath will be able
to better answer when. However, one of the biggest requirements to merge
it is an in-kernel user. Hopefully Frank's prototype and/or an ftrace
integration of the above snippet should be incentive to push for some
in-tree users :-)

Ananth

2009-01-28 17:15:32

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/28 Ananth N Mavinakayanahalli <[email protected]>:
> On Wed, Jan 28, 2009 at 03:21:24PM +0100, Fr?d?ric Weisbecker wrote:
>> 2009/1/28 Ananth N Mavinakayanahalli <[email protected]>:
>> > On Tue, Jan 27, 2009 at 11:43:05PM +0100, Frederic Weisbecker wrote:
>> >> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
>
> ...
>
>> > Here is something I did sometime ago that uses utrace. It is per task
>> > doesn't use ftrace and is just intended as a prototype. It traces both
>> > syscalls and returns.
>> >
>> > ---
>> >
>> > Here is the beginnings of a simple utrace based strace. Right now, one
>> > needs to invoke this program with a 'insmod <modname> tid=<tid>'.
>> >
>> > The output looks something like this:
>> > [267352.641112] Attached to 32604 => 0xd8f60090
>> > [267353.981046] 2
>> > [267353.981085] 197 0x1 0xbff8cd84 0x86cff4 0x86d4c0 0x86d4c0 0xbff8cd50a = 0
>> > [267353.981097] 192 0x0 0x1000 0x3 0x22 0xffffffff 0x0 = b7f8d000
>> > [267353.981124] 4 0x1 0xb7f8d000 0xd 0xd 0xb7f8d000 0xbff8cda8 = d
>> > [267353.981174] 4 0x1 0xb7f8d000 0x3 0x3 0xb7f8d000 0xbff8c7dc = 3
>> > [267353.981209] 252 0x0 0x0 0x86e0d0 0x0 0x86b274 0xbff8cee8 =
>> > [267353.981215] Task 32604 exited
>> > [267355.460180] Cannot find PID 32604
>> >
>> > I know strace does a pretty print, but this is a quick and dirty
>> > prototype.
>> > ---
>> >
>> > #include <linux/module.h>
>> > #include <linux/utrace.h>
>> > #include <linux/err.h>
>> > #include <asm/syscall.h>
>> >
>> > MODULE_DESCRIPTION("syscall trace");
>> > MODULE_LICENSE("GPL");
>> >
>> > static int target_tid;
>> >
>> > module_param_named(tid, target_tid, int, 0);
>> >
>> > static u32 task_syscall_entry(u32 action, struct utrace_attached_engine *engine,
>> > struct task_struct *task, struct pt_regs *regs)
>> > {
>> > long callno;
>> > unsigned long args[6];
>> >
>> > callno = syscall_get_nr(task, regs);
>> > syscall_get_arguments(task, regs, 0, 6, args);
>> >
>> > printk(KERN_INFO "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx = ",
>> > callno, args[0], args[1], args[2], args[3], args[4], args[5]);
>> > return UTRACE_RESUME;
>> > }
>> >
>> > static u32 task_syscall_exit(u32 action, struct utrace_attached_engine *engine,
>> > struct task_struct *task, struct pt_regs *regs)
>> > {
>> > long retval = syscall_get_return_value(task, regs);
>> > printk("%lx\n", retval);
>> > return UTRACE_RESUME;
>> > }
>> >
>> > static u32 task_exit(enum utrace_resume_action action,
>> > struct utrace_attached_engine *engine, struct task_struct *task,
>> > long orig_code, long *code)
>> > {
>> > printk("\n");
>> > printk(KERN_INFO "Task %d exited\n", target_tid);
>> > return UTRACE_DETACH;
>> > }
>> >
>> > static const struct utrace_engine_ops syscall_ops =
>> > {
>> > .report_syscall_entry = task_syscall_entry,
>> > .report_syscall_exit = task_syscall_exit,
>> > .report_exit = task_exit,
>> > };
>> >
>> > static int __init strace_init(void)
>> > {
>> > struct pid *pid;
>> > int ret = 0;
>> > struct task_struct *target;
>> > struct utrace_attached_engine *engine;
>> >
>> > pid = find_get_pid(target_tid);
>> > if (pid == NULL) {
>> > printk(KERN_ERR "Cannot find PID %d\n", target_tid);
>> > ret = -ESRCH;
>> > goto out;
>> > }
>> >
>> > engine = utrace_attach_pid(pid, UTRACE_ATTACH_CREATE, &syscall_ops, 0);
>> > if (IS_ERR(engine)) {
>> > printk(KERN_ERR "utrace_attach_pid: %ld\n", PTR_ERR(engine));
>> > ret = -EINVAL;
>> > goto out;
>> > } else if (engine == NULL) {
>> > printk(KERN_ERR "utrace_attach_pid => NULL\n");
>> > ret = -EINVAL;
>> > goto out;
>> > } else
>> > printk(KERN_INFO "Attached to %d => 0x%p\n",
>> > target_tid, engine);
>> >
>> > /*
>> > * If utrace_attach_pid() succeeded above, we are sure the target
>> > * is valid here
>> > */
>> > target = pid_task(pid, PIDTYPE_PID);
>> > put_pid(pid);
>> >
>> > ret = utrace_set_events(target, engine, UTRACE_EVENT_SYSCALL |
>> > UTRACE_EVENT(EXIT));
>> > if (ret)
>> > printk(KERN_ERR "utrace_set_events returned %d\n", ret);
>> >
>> > out:
>> > return ret;
>> > }
>> >
>> > static void __exit strace_exit(void)
>> > {
>> > int ret = 0;
>> > struct pid *pid;
>> > struct utrace_attached_engine *engine;
>> > struct task_struct *target;
>> >
>> > pid = find_get_pid(target_tid);
>> > if (pid == NULL) {
>> > printk(KERN_ERR "Cannot find PID %d\n", target_tid);
>> > return;
>> > }
>> >
>> > target = pid_task(pid, PIDTYPE_PID);
>> > put_pid(pid);
>> > engine = utrace_attach_task(target, UTRACE_ATTACH_MATCH_OPS,
>> > &syscall_ops, 0);
>> > if (IS_ERR(engine))
>> > printk(KERN_ERR "Can't find self: %ld\n", PTR_ERR(engine));
>> > else if (engine == NULL)
>> > printk(KERN_ERR "Can't find self: no match\n");
>> > else {
>> > printk(KERN_INFO "Trying detach 0x%p from %d\n",
>> > engine, target_tid);
>> > ret = utrace_control(target, engine, UTRACE_DETACH);
>> > if (ret)
>> > printk(KERN_ERR "utrace_control returned %d\n",
>> > ret);
>> > }
>> > }
>> >
>> > module_init(strace_init);
>> > module_exit(strace_exit);
>> >
>>
>>
>> This is a very convenient Api. I would be glad to use it with ftrace.
>> But, what is it's status currently? Some projects to merge it?
>
> Its currently available as a git tree:
> git://git.kernel.org/pub/scm/linux/kernel/git/frob/linux-2.6-utrace.git
>
> There is work going on to merge it upstream. Roland McGrath will be able
> to better answer when. However, one of the biggest requirements to merge
> it is an in-kernel user. Hopefully Frank's prototype and/or an ftrace
> integration of the above snippet should be incentive to push for some
> in-tree users :-)
>
> Ananth
>

I hope so. That would be used by the function graph tracer and a
syscall tracer, and probably for other special tracers
needs. If Ingo and Steven are ok with it.

2009-01-28 22:11:39

by Kok, Auke

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Frederic Weisbecker wrote:
> On Tue, Jan 27, 2009 at 06:51:37PM -0200, Arnaldo Carvalho de Melo wrote:
>> Em Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke escreveu:
>>> This tracer monitors regular file open() syscalls. This is a fast
>>> and low-overhead alternative to strace, and does not allow or
>>> require to be attached to every process.
>>>
>>> The tracer only logs succesfull calls, as those are the only ones we
>>> are currently interested in, and we can determine the absolute path
>>> of these files as we log.
>>>
>>> Signed-off-by: Auke Kok <[email protected]>
>> <SNIP>
>>
>>> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
>>> index 4d3d381..24c17d2 100644
>>> --- a/kernel/trace/trace.h
>>> +++ b/kernel/trace/trace.h
>>> @@ -30,6 +30,7 @@ enum trace_type {
>>> TRACE_USER_STACK,
>>> TRACE_HW_BRANCHES,
>>> TRACE_POWER,
>>> + TRACE_OPEN,
>> Why not a TRACE_VFS or TRACE_SYSCALL and then multiplex there open,
>> close, etc? trace_assign_type will get humongous in no time this way.
>
>
> That's what I was about to answer too.
> That would be sad to find one tracer for open, one for read, one for write...
>
>
>> TRACE_BLK does multiplexing, as does TRACE_PROCESS that Frank just
>> posted too, and I'm working on a plugin for the patch that Neil
>> submitted for the socket layer that also will need an entry there for
>> its tracepoints.
>>
>> Frederic, it seems that discussion about subtypes has to continue :-)
>
>
> Indeed, that becomes serious :-)
>

I totally agree - this tracer was purely made quick-and-dirty to get sreadahead to
the next level. The in-kernel syscall trace facility is extremely potent and I
don't doubt that my patch may have been found slightly absurd by some :)

In all it was not productive to write a giant trace facility just to get
sreadahead one tracer type. Now that the code works and the benefit is proven, I'm
happy to see if we can work on making a decent long-term non-specialistic solution.

Auke

2009-01-28 22:25:24

by Kok, Auke

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Frederic Weisbecker wrote:
> On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
>> This tracer monitors regular file open() syscalls. This is a fast
>> and low-overhead alternative to strace, and does not allow or
>> require to be attached to every process.
>>
>> The tracer only logs succesfull calls, as those are the only ones we
>> are currently interested in, and we can determine the absolute path
>> of these files as we log.
>>
>> Signed-off-by: Auke Kok <[email protected]>
>
>
> Hi Auke,
>
> Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> with the function-graph-tracer.
>
> http://lkml.org/lkml/2008/12/30/267
>
> Its approach and purpose is different than a tracer dedicated only to syscalls.
> The function graph tracer traces execution graph of the functions and is more about
> execution time spent and code flow whereas a syscall tracer can provide more specific
> informations about syscalls.
>
> So both are not overlaping.
>
> But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
> a ptrace hook when set.
> This low-level part can easily be used by all tracers that would like to inspect syscalls.
>
> Just a change is needed: Steven requested that the part inside syscall_trace_enter become
> a tracepoint, making it totally shareable between tracers and easy to turn on and off.
>
> And perhaps the parts that set/clear the flag on all tasks can be shared too.
>
> So we can start with this low-level syscall tracing facility. If you want, I can adapt
> this low-level part and submit a patch this week or the next one to give you this base
> infrastructure.
>
>
> Once we have it, I think a syscall tracer can be fed with new syscalls events through
> several patch iterations, starting with the open and close one :-)
>
> Are you ok with that?

That sounds like a good start. one note of concern is that (for sreadahead) we're
really interested in the _absolute_ path, not the relative ones that are passed to
the syscall by userspace.

This may significantly complicate things when it comes to fs syscalls, and is one
of the reasons I wrote the quick-n-dirty alternative.


Auke

2009-01-29 00:46:17

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Em Wed, Jan 28, 2009 at 02:05:20PM -0800, Kok, Auke escreveu:
> Frederic Weisbecker wrote:
> > On Tue, Jan 27, 2009 at 06:51:37PM -0200, Arnaldo Carvalho de Melo wrote:
> >> Em Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke escreveu:
> >>> This tracer monitors regular file open() syscalls. This is a fast
> >>> and low-overhead alternative to strace, and does not allow or
> >>> require to be attached to every process.
> >>>
> >>> The tracer only logs succesfull calls, as those are the only ones we
> >>> are currently interested in, and we can determine the absolute path
> >>> of these files as we log.
> >>>
> >>> Signed-off-by: Auke Kok <[email protected]>
> >> <SNIP>
> >>
> >>> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> >>> index 4d3d381..24c17d2 100644
> >>> --- a/kernel/trace/trace.h
> >>> +++ b/kernel/trace/trace.h
> >>> @@ -30,6 +30,7 @@ enum trace_type {
> >>> TRACE_USER_STACK,
> >>> TRACE_HW_BRANCHES,
> >>> TRACE_POWER,
> >>> + TRACE_OPEN,
> >> Why not a TRACE_VFS or TRACE_SYSCALL and then multiplex there open,
> >> close, etc? trace_assign_type will get humongous in no time this way.
> >
> >
> > That's what I was about to answer too.
> > That would be sad to find one tracer for open, one for read, one for write...
> >
> >
> >> TRACE_BLK does multiplexing, as does TRACE_PROCESS that Frank just
> >> posted too, and I'm working on a plugin for the patch that Neil
> >> submitted for the socket layer that also will need an entry there for
> >> its tracepoints.
> >>
> >> Frederic, it seems that discussion about subtypes has to continue :-)
> >
> >
> > Indeed, that becomes serious :-)
> >
>
> I totally agree - this tracer was purely made quick-and-dirty to get sreadahead to
> the next level. The in-kernel syscall trace facility is extremely potent and I
> don't doubt that my patch may have been found slightly absurd by some :)

hey, hey, your patch just illustrated that we need a subtype facility.

I'm (as all the other people in some shape or form involved in pushing
some sort of common tracing infrastructure into the kernel, I guess)
excited about more developers using what is being put in place :-)

> In all it was not productive to write a giant trace facility just to get
> sreadahead one tracer type. Now that the code works and the benefit is proven, I'm
> happy to see if we can work on making a decent long-term non-specialistic solution.

That is the spirit!

- Arnaldo

2009-01-29 13:39:34

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Arnaldo Carvalho de Melo <[email protected]>:
> hey, hey, your patch just illustrated that we need a subtype facility.
>
> I'm (as all the other people in some shape or form involved in pushing
> some sort of common tracing infrastructure into the kernel, I guess)
> excited about more developers using what is being put in place :-)


Indeed :-)

About these subtypes. I've thought about what can be intuitive for
developers who
will use it.

I think the struct trace_event can be reused by just adding a subtype
event inside:

+ int (*find_subtype_func) (struct trace_iterator *);

struct trace_event {
struct hlist_node node;
int type;
trace_print_func trace;
trace_print_func latency_trace;
trace_print_func raw;
trace_print_func hex;
trace_print_func binary;
+ struct trace_event *subevent_array;
+ find_subtype_func find_subtype;
};

And then I shall implement an event internal hlist for each global event.
All a tracer developer just have to do is:

_ implement a general trace_event for his global event type
_ implement each trace subevent as an array, one will have to define
its own set of values for the types, they will
not conflict with those used by global event types since they are
isolated and depend on one global event type.
_ implement find_subtype callback. When a new trace has to be printed,
this callback is called to find the subtype
value corresponding to a trace. If the tracer returns a positive
value, the matching subevent is searched on the subevent_array
and the appropriate trace_print_func is called.
Otherwise if find_subtype is NULL or returns -EINVAL, then the
appropriate trace_print_func of the current global event is called, as
usual.

We can even make it recursive, letting one crazy tracer beeing able to
have subtypes of subtypes :-)

Hm? If you are ok, I shall start it.


>> In all it was not productive to write a giant trace facility just to get
>> sreadahead one tracer type. Now that the code works and the benefit is proven, I'm
>> happy to see if we can work on making a decent long-term non-specialistic solution.
>
> That is the spirit!
>
> - Arnaldo
>

2009-01-29 13:40:42

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Fr?d?ric Weisbecker <[email protected]>:
> 2009/1/29 Arnaldo Carvalho de Melo <[email protected]>:
>> hey, hey, your patch just illustrated that we need a subtype facility.
>>
>> I'm (as all the other people in some shape or form involved in pushing
>> some sort of common tracing infrastructure into the kernel, I guess)
>> excited about more developers using what is being put in place :-)
>
>
> Indeed :-)
>
> About these subtypes. I've thought about what can be intuitive for
> developers who
> will use it.
>
> I think the struct trace_event can be reused by just adding a subtype
> event inside:
>
> + int (*find_subtype_func) (struct trace_iterator *);


Sorry: forgot the typedef there...

>
> struct trace_event {
> struct hlist_node node;
> int type;
> trace_print_func trace;
> trace_print_func latency_trace;
> trace_print_func raw;
> trace_print_func hex;
> trace_print_func binary;
> + struct trace_event *subevent_array;
> + find_subtype_func find_subtype;
> };
>
> And then I shall implement an event internal hlist for each global event.
> All a tracer developer just have to do is:
>
> _ implement a general trace_event for his global event type
> _ implement each trace subevent as an array, one will have to define
> its own set of values for the types, they will
> not conflict with those used by global event types since they are
> isolated and depend on one global event type.
> _ implement find_subtype callback. When a new trace has to be printed,
> this callback is called to find the subtype
> value corresponding to a trace. If the tracer returns a positive
> value, the matching subevent is searched on the subevent_array
> and the appropriate trace_print_func is called.
> Otherwise if find_subtype is NULL or returns -EINVAL, then the
> appropriate trace_print_func of the current global event is called, as
> usual.
>
> We can even make it recursive, letting one crazy tracer beeing able to
> have subtypes of subtypes :-)
>
> Hm? If you are ok, I shall start it.
>
>
>>> In all it was not productive to write a giant trace facility just to get
>>> sreadahead one tracer type. Now that the code works and the benefit is proven, I'm
>>> happy to see if we can work on making a decent long-term non-specialistic solution.
>>
>> That is the spirit!
>>
>> - Arnaldo
>>
>

2009-01-29 14:05:36

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead


* Frederic Weisbecker <[email protected]> wrote:

> On Tue, Jan 27, 2009 at 11:43:03PM +0100, Frederic Weisbecker wrote:
> > On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
> > >
> > > This tracer monitors regular file open() syscalls. This is a fast
> > > and low-overhead alternative to strace, and does not allow or
> > > require to be attached to every process.
> > >
> > > The tracer only logs succesfull calls, as those are the only ones we
> > > are currently interested in, and we can determine the absolute path
> > > of these files as we log.
> > >
> > > Signed-off-by: Auke Kok <[email protected]>
> >
> >
> > Hi Auke,
> >
> > Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> > with the function-graph-tracer.
> >
> > http://lkml.org/lkml/2008/12/30/267
> >
> > Its approach and purpose is different than a tracer dedicated only to syscalls.
> > The function graph tracer traces execution graph of the functions and is more about
> > execution time spent and code flow whereas a syscall tracer can provide more specific
> > informations about syscalls.
> >
> > So both are not overlaping.
> >
> > But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
>
> s/_TIF_SYSCALL_TRACE/_TIF_SYSCALL_FTRACE

> > Once we have it, I think a syscall tracer can be fed with new syscalls
> > events through several patch iterations, starting with the open and
> > close one :-)
> >
> > Are you ok with that?
> >
> > Steven, Ingo, do you agree?

yes. We definitely need this on the asm syscall level, to not contaminate
hundreds of syscalls with tracepoints.

Auke's sys_open() plugin would be a nice prototype for that concept - but
in generally it would be useful to be able to augment kernel tracer output
with all syscall events that occur.

The output would be something like a slimmed-down strace, but for the
whole kernel and not tied to ptrace semantics (which are crippling).

Would you be interested in extending your syscall tracing concept with
those bits and would you be interested in integrating Auke's plugin into
that

Ingo

2009-01-29 14:29:27

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Ingo Molnar <[email protected]>:
>
> * Frederic Weisbecker <[email protected]> wrote:
>
>> On Tue, Jan 27, 2009 at 11:43:03PM +0100, Frederic Weisbecker wrote:
>> > On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
>> > >
>> > > This tracer monitors regular file open() syscalls. This is a fast
>> > > and low-overhead alternative to strace, and does not allow or
>> > > require to be attached to every process.
>> > >
>> > > The tracer only logs succesfull calls, as those are the only ones we
>> > > are currently interested in, and we can determine the absolute path
>> > > of these files as we log.
>> > >
>> > > Signed-off-by: Auke Kok <[email protected]>
>> >
>> >
>> > Hi Auke,
>> >
>> > Speaking about a global syscall tracer, I made a patch to trace only the syscalls
>> > with the function-graph-tracer.
>> >
>> > http://lkml.org/lkml/2008/12/30/267
>> >
>> > Its approach and purpose is different than a tracer dedicated only to syscalls.
>> > The function graph tracer traces execution graph of the functions and is more about
>> > execution time spent and code flow whereas a syscall tracer can provide more specific
>> > informations about syscalls.
>> >
>> > So both are not overlaping.
>> >
>> > But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
>>
>> s/_TIF_SYSCALL_TRACE/_TIF_SYSCALL_FTRACE
>
>> > Once we have it, I think a syscall tracer can be fed with new syscalls
>> > events through several patch iterations, starting with the open and
>> > close one :-)
>> >
>> > Are you ok with that?
>> >
>> > Steven, Ingo, do you agree?
>
> yes. We definitely need this on the asm syscall level, to not contaminate
> hundreds of syscalls with tracepoints.
>
> Auke's sys_open() plugin would be a nice prototype for that concept - but
> in generally it would be useful to be able to augment kernel tracer output
> with all syscall events that occur.
>
> The output would be something like a slimmed-down strace, but for the
> whole kernel and not tied to ptrace semantics (which are crippling).
>
> Would you be interested in extending your syscall tracing concept with
> those bits and would you be interested in integrating Auke's plugin into
> that
>
> Ingo


Several people talked me about utrace and gave some examples about it
in this discussion.
The Api is very convenient to fetch syscall numbers, arguments and
return values.
And the hooks are done in the generic core code, so it is arch independent.

The only drawback I can see is that it is not yet merged upstream, in
need of in-kernel users.
If it only depends on this condition, we could be these users...

What do you think?

2009-01-29 14:31:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead


* Fr?d?ric Weisbecker <[email protected]> wrote:

> 2009/1/29 Ingo Molnar <[email protected]>:
> >
> > * Frederic Weisbecker <[email protected]> wrote:
> >
> >> On Tue, Jan 27, 2009 at 11:43:03PM +0100, Frederic Weisbecker wrote:
> >> > On Tue, Jan 27, 2009 at 12:08:04PM -0800, Kok, Auke wrote:
> >> > >
> >> > > This tracer monitors regular file open() syscalls. This is a fast
> >> > > and low-overhead alternative to strace, and does not allow or
> >> > > require to be attached to every process.
> >> > >
> >> > > The tracer only logs succesfull calls, as those are the only ones we
> >> > > are currently interested in, and we can determine the absolute path
> >> > > of these files as we log.
> >> > >
> >> > > Signed-off-by: Auke Kok <[email protected]>
> >> >
> >> >
> >> > Hi Auke,
> >> >
> >> > Speaking about a global syscall tracer, I made a patch to trace only the syscalls
> >> > with the function-graph-tracer.
> >> >
> >> > http://lkml.org/lkml/2008/12/30/267
> >> >
> >> > Its approach and purpose is different than a tracer dedicated only to syscalls.
> >> > The function graph tracer traces execution graph of the functions and is more about
> >> > execution time spent and code flow whereas a syscall tracer can provide more specific
> >> > informations about syscalls.
> >> >
> >> > So both are not overlaping.
> >> >
> >> > But the low level part of my patch creates a thread flag _TIF_SYSCALL_TRACE which triggers
> >>
> >> s/_TIF_SYSCALL_TRACE/_TIF_SYSCALL_FTRACE
> >
> >> > Once we have it, I think a syscall tracer can be fed with new syscalls
> >> > events through several patch iterations, starting with the open and
> >> > close one :-)
> >> >
> >> > Are you ok with that?
> >> >
> >> > Steven, Ingo, do you agree?
> >
> > yes. We definitely need this on the asm syscall level, to not contaminate
> > hundreds of syscalls with tracepoints.
> >
> > Auke's sys_open() plugin would be a nice prototype for that concept - but
> > in generally it would be useful to be able to augment kernel tracer output
> > with all syscall events that occur.
> >
> > The output would be something like a slimmed-down strace, but for the
> > whole kernel and not tied to ptrace semantics (which are crippling).
> >
> > Would you be interested in extending your syscall tracing concept with
> > those bits and would you be interested in integrating Auke's plugin into
> > that
> >
> > Ingo
>
>
> Several people talked me about utrace and gave some examples about it in
> this discussion. The Api is very convenient to fetch syscall numbers,
> arguments and return values. And the hooks are done in the generic core
> code, so it is arch independent.
>
> The only drawback I can see is that it is not yet merged upstream, in
> need of in-kernel users. If it only depends on this condition, we could
> be these users...
>
> What do you think?

sure - how do the minimal bits/callbacks look like which enable syscall
tracing?

Ingo

2009-01-29 14:41:15

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Ingo Molnar <[email protected]>:
>>
>> Several people talked me about utrace and gave some examples about it in
>> this discussion. The Api is very convenient to fetch syscall numbers,
>> arguments and return values. And the hooks are done in the generic core
>> code, so it is arch independent.
>>
>> The only drawback I can see is that it is not yet merged upstream, in
>> need of in-kernel users. If it only depends on this condition, we could
>> be these users...
>>
>> What do you think?
>
> sure - how do the minimal bits/callbacks look like which enable syscall
> tracing?
>
> Ingo


There is a very straightforward example provided by Ananth in there:
http://lkml.org/lkml/2009/1/28/59

2009-01-29 14:48:55

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Fr?d?ric Weisbecker <[email protected]>:
> 2009/1/29 Ingo Molnar <[email protected]>:
>>>
>>> Several people talked me about utrace and gave some examples about it in
>>> this discussion. The Api is very convenient to fetch syscall numbers,
>>> arguments and return values. And the hooks are done in the generic core
>>> code, so it is arch independent.
>>>
>>> The only drawback I can see is that it is not yet merged upstream, in
>>> need of in-kernel users. If it only depends on this condition, we could
>>> be these users...
>>>
>>> What do you think?
>>
>> sure - how do the minimal bits/callbacks look like which enable syscall
>> tracing?
>>
>> Ingo
>
>
> There is a very straightforward example provided by Ananth in there:
> http://lkml.org/lkml/2009/1/28/59
>

One other drawback may be the fact that utrace will be traced by the
function tracers... adding some junk on their traces.
But I guess this is just a matter of some patches to make it not traced.

BTW, there is an interesting proof of concept there:
http://lkml.org/lkml/2009/1/27/294

2009-01-29 15:10:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead


* Fr?d?ric Weisbecker <[email protected]> wrote:

> 2009/1/29 Ingo Molnar <[email protected]>:
> >>
> >> Several people talked me about utrace and gave some examples about it in
> >> this discussion. The Api is very convenient to fetch syscall numbers,
> >> arguments and return values. And the hooks are done in the generic core
> >> code, so it is arch independent.
> >>
> >> The only drawback I can see is that it is not yet merged upstream, in
> >> need of in-kernel users. If it only depends on this condition, we could
> >> be these users...
> >>
> >> What do you think?
> >
> > sure - how do the minimal bits/callbacks look like which enable syscall
> > tracing?
> >
> > Ingo
>
>
> There is a very straightforward example provided by Ananth in there:
> http://lkml.org/lkml/2009/1/28/59

I mean, how does the infrastructure patch look like - what code does this
add to the kernel - just to get the syscall tracing bits. Lets get some
progress here - it's clear that tracing syscalls is good, we just need to
do it and look at actual patches.

Ingo

2009-01-29 15:18:08

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Ingo Molnar <[email protected]>:
>
> * Fr?d?ric Weisbecker <[email protected]> wrote:
>
>> 2009/1/29 Ingo Molnar <[email protected]>:
>> >>
>> >> Several people talked me about utrace and gave some examples about it in
>> >> this discussion. The Api is very convenient to fetch syscall numbers,
>> >> arguments and return values. And the hooks are done in the generic core
>> >> code, so it is arch independent.
>> >>
>> >> The only drawback I can see is that it is not yet merged upstream, in
>> >> need of in-kernel users. If it only depends on this condition, we could
>> >> be these users...
>> >>
>> >> What do you think?
>> >
>> > sure - how do the minimal bits/callbacks look like which enable syscall
>> > tracing?
>> >
>> > Ingo
>>
>>
>> There is a very straightforward example provided by Ananth in there:
>> http://lkml.org/lkml/2009/1/28/59
>
> I mean, how does the infrastructure patch look like - what code does this
> add to the kernel - just to get the syscall tracing bits. Lets get some
> progress here - it's clear that tracing syscalls is good, we just need to
> do it and look at actual patches.
>
> Ingo
>

The latest snapshot version I've found is here:
http://people.redhat.com/roland/utrace/2.6-current/utrace.patch
This is mostly independent core code and a good number of hooks inside ptrace.

But I don't know much about the overhead it potentially brings on ptrace.

2009-01-29 15:35:24

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

2009/1/29 Ingo Molnar <[email protected]>:
>
> * Fr?d?ric Weisbecker <[email protected]> wrote:
>
>> 2009/1/29 Ingo Molnar <[email protected]>:
>> >>
>> >> Several people talked me about utrace and gave some examples about it in
>> >> this discussion. The Api is very convenient to fetch syscall numbers,
>> >> arguments and return values. And the hooks are done in the generic core
>> >> code, so it is arch independent.
>> >>
>> >> The only drawback I can see is that it is not yet merged upstream, in
>> >> need of in-kernel users. If it only depends on this condition, we could
>> >> be these users...
>> >>
>> >> What do you think?
>> >
>> > sure - how do the minimal bits/callbacks look like which enable syscall
>> > tracing?


I know you are talking about the only necessary bits from utrace to
have the syscalls tracing.
But I can't answer you better than would the utrace people.

And actually I'm not sure the utrace bits for syscall tracing can be
isolated from the rest of its
core.

Anyway, I will let the utrace guy answer to it :-)


>> There is a very straightforward example provided by Ananth in there:
>> http://lkml.org/lkml/2009/1/28/59
>
> I mean, how does the infrastructure patch look like - what code does this
> add to the kernel - just to get the syscall tracing bits. Lets get some
> progress here - it's clear that tracing syscalls is good, we just need to
> do it and look at actual patches.
>
> Ingo
>

2009-01-29 15:54:18

by Frank Ch. Eigler

[permalink] [raw]
Subject: Re: [PATCH] tracer for sys_open() - sreadahead

Hi -

On Thu, Jan 29, 2009 at 04:34:46PM +0100, Fr?d?ric Weisbecker wrote:
> 2009/1/29 Ingo Molnar <[email protected]>:
> [...]
> >> > sure - how do the minimal bits/callbacks look like which enable syscall
> >> > tracing?

> I know you are talking about the only necessary bits from utrace to
> have the syscalls tracing. But I can't answer you better than would
> the utrace people. And actually I'm not sure the utrace bits for
> syscall tracing can be isolated from the rest of its core.

My understanding is that the parts of utrace that remain out-of-tree
are relatively integrated, and just present the programmatic callback
API to the already merged "tracehook" layer.

- FChE