2008-11-06 19:01:53

by Ken Chen

[permalink] [raw]
Subject: [patch] add /proc/pid/stack to dump task's stack trace

This patch adds the ability to query a task's stack trace via /proc/pid/stack.
It is considered to be more useful than /proc/pid/wchan as it provides full
stack trace instead of single depth.

Signed-off-by: Ken Chen <[email protected]>

index bcceb99..3202d5c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -139,6 +139,7 @@ Table 1-1: Process specific entries in /proc
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack If CONFIG_KALLSYMS is set, report full stack trace
smaps Extension based on maps, the rss size for each mapped file
..............................................................................

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3f..466e519 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -338,6 +339,35 @@ static int proc_pid_wchan
else
return sprintf(buffer, "%s", symname);
}
+
+#define MAX_STACK_TRACE_DEPTH 32
+static int proc_stack_trace(struct task_struct *task, char *buffer)
+{
+ int i, len = 0;
+ unsigned long *entries;
+ struct stack_trace trace;
+
+ entries = kmalloc(sizeof(*entries) * MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
+ if (!entries)
+ goto out;
+
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_TRACE_DEPTH;
+ trace.entries = entries;
+ trace.skip = 0;
+
+ read_lock(&tasklist_lock);
+ save_stack_trace_tsk(task, &trace);
+ read_unlock(&tasklist_lock);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ len += sprintf(buffer + len, "[<%p>] %pS\n",
+ (void *) entries[i], (void *) entries[i]);
+ }
+ kfree(entries);
+out:
+ return len;
+}
#endif /* CONFIG_KALLSYMS */

#ifdef CONFIG_SCHEDSTATS
@@ -2489,6 +2519,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
#endif
#ifdef CONFIG_KALLSYMS
+ INF("stack", S_IRUSR, stack_trace),
INF("wchan", S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_SCHEDSTATS


2008-11-06 19:48:16

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Thu, Nov 06, 2008 at 11:01:39AM -0800, Ken Chen wrote:
> This patch adds the ability to query a task's stack trace via /proc/pid/stack.
> It is considered to be more useful than /proc/pid/wchan as it provides full
> stack trace instead of single depth.

Please, name handler proc_pid_stack following current convention.
And drop space before casts.

> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -338,6 +339,35 @@ static int proc_pid_wchan
> else
> return sprintf(buffer, "%s", symname);
> }
> +
> +#define MAX_STACK_TRACE_DEPTH 32
> +static int proc_stack_trace(struct task_struct *task, char *buffer)
> +{
> + int i, len = 0;
> + unsigned long *entries;
> + struct stack_trace trace;
> +
> + entries = kmalloc(sizeof(*entries) * MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
> + if (!entries)
> + goto out;
> +
> + trace.nr_entries = 0;
> + trace.max_entries = MAX_STACK_TRACE_DEPTH;
> + trace.entries = entries;
> + trace.skip = 0;
> +
> + read_lock(&tasklist_lock);
> + save_stack_trace_tsk(task, &trace);
> + read_unlock(&tasklist_lock);
> +
> + for (i = 0; i < trace.nr_entries; i++) {
> + len += sprintf(buffer + len, "[<%p>] %pS\n",
> + (void *) entries[i], (void *) entries[i]);
> + }
> + kfree(entries);
> +out:
> + return len;
> +}
> #endif /* CONFIG_KALLSYMS */
>
> #ifdef CONFIG_SCHEDSTATS
> @@ -2489,6 +2519,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
> #endif
> #ifdef CONFIG_KALLSYMS
> + INF("stack", S_IRUSR, stack_trace),
> INF("wchan", S_IRUGO, pid_wchan),
> #endif
> #ifdef CONFIG_SCHEDSTATS

2008-11-06 20:12:50

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Thu, Nov 6, 2008 at 11:51 AM, Alexey Dobriyan wrote:
> Please, name handler proc_pid_stack following current convention.
> And drop space before casts.

OK. handler name changed. For the space between cast, it looks like
there are different styles in the code base, either with or without.
I dropped the space since I don't have strong opinion one way or the
other.

Also wrap proc_pid_stack() inside CONFIG_STACKTRACE to fix compile
time error when config option is not selected.

Signed-off-by: Ken Chen <[email protected]>

index bcceb99..3202d5c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -139,6 +139,7 @@ Table 1-1: Process specific entries in /proc
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack Report full stack trace, enable via CONFIG_STACKTRACE
smaps Extension based on maps, the rss size for each mapped file
..............................................................................

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3f..9026508 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -340,6 +341,37 @@ static int proc_pid_wchan
}
#endif /* CONFIG_KALLSYMS */

+#ifdef CONFIG_STACKTRACE
+#define MAX_STACK_TRACE_DEPTH 32
+static int proc_pid_stack(struct task_struct *task, char *buffer)
+{
+ int i, len = 0;
+ unsigned long *entries;
+ struct stack_trace trace;
+
+ entries = kmalloc(sizeof(*entries) * MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
+ if (!entries)
+ goto out;
+
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_TRACE_DEPTH;
+ trace.entries = entries;
+ trace.skip = 0;
+
+ read_lock(&tasklist_lock);
+ save_stack_trace_tsk(task, &trace);
+ read_unlock(&tasklist_lock);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ len += sprintf(buffer + len, "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
+ }
+ kfree(entries);
+out:
+ return len;
+}
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/*
* Provides /proc/PID/schedstat
@@ -2491,6 +2523,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_KALLSYMS
INF("wchan", S_IRUGO, pid_wchan),
#endif
+#ifdef CONFIG_STACKTRACE
+ INF("stack", S_IRUSR, pid_stack),
+#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif

2008-11-06 20:35:45

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Ken Chen <[email protected]> wrote:

> On Thu, Nov 6, 2008 at 11:51 AM, Alexey Dobriyan wrote:
> > Please, name handler proc_pid_stack following current convention.
> > And drop space before casts.
>
> OK. handler name changed. For the space between cast, it looks
> like there are different styles in the code base, either with or
> without. I dropped the space since I don't have strong opinion one
> way or the other.

best way is to run scripts/checkpatch.pl on your patch, that will
remind you of any potential style issues.

> Also wrap proc_pid_stack() inside CONFIG_STACKTRACE to fix compile
> time error when config option is not selected.

> +#ifdef CONFIG_STACKTRACE
> +#define MAX_STACK_TRACE_DEPTH 32

How about 64 instead? (it's such a nice round number)

> +static int proc_pid_stack(struct task_struct *task, char *buffer)
> +{
> + int i, len = 0;
> + unsigned long *entries;
> + struct stack_trace trace;
> +
> + entries = kmalloc(sizeof(*entries) * MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
> + if (!entries)
> + goto out;
> +
> + trace.nr_entries = 0;
> + trace.max_entries = MAX_STACK_TRACE_DEPTH;
> + trace.entries = entries;
> + trace.skip = 0;
> +
> + read_lock(&tasklist_lock);
> + save_stack_trace_tsk(task, &trace);
> + read_unlock(&tasklist_lock);
> +
> + for (i = 0; i < trace.nr_entries; i++) {
> + len += sprintf(buffer + len, "[<%p>] %pS\n",
> + (void *)entries[i], (void *)entries[i]);

hm, this looks like a potential buffer overflow - isnt 'buffer' here
only valid up to the next PAGE_SIZE boundary?

> + }

> + kfree(entries);
> +out:
> + return len;

Not sure about the error path convention here: in the !entries kmalloc
failure path, shouldnt we return -ENOMEM? Otherwise userspace will get
zero length and would retry again and again?

Also, please rename 'out:' to 'error:' - to make it clear that it's an
error path.

Ingo

2008-11-07 00:30:43

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Thu, Nov 6, 2008 at 12:35 PM, Ingo Molnar <[email protected]> wrote:
>> +static int proc_pid_stack(struct task_struct *task, char *buffer)
>> +{
>> + for (i = 0; i < trace.nr_entries; i++) {
>> + len += sprintf(buffer + len, "[<%p>] %pS\n",
>> + (void *)entries[i], (void
>> *)entries[i]);
>
> hm, this looks like a potential buffer overflow - isnt 'buffer' here
> only valid up to the next PAGE_SIZE boundary?

Yeah. the size of buffer allocated for printing is done at upper call
site in proc_info_read(). By the time we reach here, the size info is
lost. It would be too much churn to add a argument to the read method
of proc_op. Since these functions are all in one file, I moved
PROC_BLOCK_SIZE up so it can be used to check buffer length. Would
that be enough? Lots of other proc read methods don't check against
buffer overrun, I suppose those should be fixed as well.

updated patch that also fixed other comments.

Signed-off-by: Ken Chen <[email protected]>


index bcceb99..11f5b75 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -139,6 +139,7 @@ Table 1-1: Process specific entries in /proc
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack Report full stack trace, enable via CONFIG_STACKTRACE
smaps Extension based on maps, the rss size for each mapped file
..............................................................................

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3f..8fb293d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -130,6 +131,12 @@ struct pid_entry {
{ .proc_show = &proc_##OTYPE } )

/*
+ * buffer size used for proc read. See proc_info_read().
+ * 4K page size but our output routines use some slack for overruns
+ */
+#define PROC_BLOCK_SIZE (3*1024)
+
+/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
@@ -340,6 +347,37 @@ static int proc_pid_wchan
}
#endif /* CONFIG_KALLSYMS */

+#ifdef CONFIG_STACKTRACE
+#define MAX_STACK_TRACE_DEPTH 64
+static int proc_pid_stack(struct task_struct *task, char *buffer)
+{
+ int i, len = 0;
+ unsigned long *entries;
+ struct stack_trace trace;
+
+ entries = kmalloc(sizeof(*entries) * MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_TRACE_DEPTH;
+ trace.entries = entries;
+ trace.skip = 0;
+
+ read_lock(&tasklist_lock);
+ save_stack_trace_tsk(task, &trace);
+ read_unlock(&tasklist_lock);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ len += snprintf(buffer + len, PROC_BLOCK_SIZE - len,
+ "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
+ }
+ kfree(entries);
+ return len;
+}
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/*
* Provides /proc/PID/schedstat
@@ -688,8 +726,6 @@ static const struct file_operations proc_mountstats
.release = mounts_release,
};

-#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
-
static ssize_t proc_info_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2491,6 +2527,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_KALLSYMS
INF("wchan", S_IRUGO, pid_wchan),
#endif
+#ifdef CONFIG_STACKTRACE
+ INF("stack", S_IRUSR, pid_stack),
+#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif

2008-11-07 00:45:22

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Thu, Nov 06, 2008 at 04:30:23PM -0800, Ken Chen wrote:
> On Thu, Nov 6, 2008 at 12:35 PM, Ingo Molnar <[email protected]> wrote:
> >> +static int proc_pid_stack(struct task_struct *task, char *buffer)
> >> +{
> >> + for (i = 0; i < trace.nr_entries; i++) {
> >> + len += sprintf(buffer + len, "[<%p>] %pS\n",
> >> + (void *)entries[i], (void
> >> *)entries[i]);
> >
> > hm, this looks like a potential buffer overflow - isnt 'buffer' here
> > only valid up to the next PAGE_SIZE boundary?

So, make trace depth low enough, or even better use seqfiles, if you're
scared by buffer overflows.

> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -130,6 +131,12 @@ struct pid_entry {
> { .proc_show = &proc_##OTYPE } )
>
> /*
> + * buffer size used for proc read. See proc_info_read().
> + * 4K page size but our output routines use some slack for overruns
> + */
> +#define PROC_BLOCK_SIZE (3*1024)

2008-11-07 07:42:04

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Alexey Dobriyan <[email protected]> wrote:

> On Thu, Nov 06, 2008 at 04:30:23PM -0800, Ken Chen wrote:
> > On Thu, Nov 6, 2008 at 12:35 PM, Ingo Molnar <[email protected]> wrote:
> > >> +static int proc_pid_stack(struct task_struct *task, char *buffer)
> > >> +{
> > >> + for (i = 0; i < trace.nr_entries; i++) {
> > >> + len += sprintf(buffer + len, "[<%p>] %pS\n",
> > >> + (void *)entries[i], (void
> > >> *)entries[i]);
> > >
> > > hm, this looks like a potential buffer overflow - isnt 'buffer' here
> > > only valid up to the next PAGE_SIZE boundary?
>
> So, make trace depth low enough, or even better use seqfiles, if
> you're scared by buffer overflows.

it's not about being scared, it's about doing the math: kernel symbols
can be up to 128 bytes long, so the per line max becomes
2+2+16+2+1+2+16+128+1 == 170. 4096/170 ~== 24. So without checking
we've got guaranteed space for only 24 lines - that's too low.

_In practice_, we'd need a really long trace to trigger it, but i've
seen really long traces in the past and this is debug infrastructure,
so we cannot take chances here.

> > /*
> > + * buffer size used for proc read. See proc_info_read().
> > + * 4K page size but our output routines use some slack for overruns
> > + */
> > +#define PROC_BLOCK_SIZE (3*1024)

That sounds like a proper limit - the hard limit for this particular
printout function is 4096-170, so we are well within the
PROC_BLOCK_SIZE range.

Ingo

2008-11-07 07:59:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Ingo Molnar <[email protected]> wrote:

> > > + * buffer size used for proc read. See proc_info_read().
> > > + * 4K page size but our output routines use some slack for overruns
> > > + */
> > > +#define PROC_BLOCK_SIZE (3*1024)
>
> That sounds like a proper limit - the hard limit for this particular
> printout function is 4096-170, so we are well within the
> PROC_BLOCK_SIZE range.

ok, i've added Ken's patch to tip/core/stacktrace and started testing
it.

Alexey, i've added your Acked-by because you appeared to like the
patch - let me know if i should remove it.

i've done a few finishing touches to the patch as well - see the end
result below.

Andrew, i remember that you found some sort of problem (crashes?) with
stack-dumping tasks that are running on another CPU (or something like
that) - do you remember the specifics? Could we run into any of those
problems with the patch below, on some rare architecture?

Ingo

------------------->
>From d3d23f82adeb5ec8a9546747d1df058dcaad0589 Mon Sep 17 00:00:00 2001
From: Ken Chen <[email protected]>
Date: Thu, 6 Nov 2008 16:30:23 -0800
Subject: [PATCH] stacktrace: add /proc/<pid>/stack to dump task's stack trace

Impact: add the new (root-only) /proc/<pid>/stack debug facility

This patch adds the ability to query a task's current stack trace via
/proc/<pid>/stack.

It is considered to be more useful than /proc/pid/wchan as it provides
full stack trace instead of single depth.

It is also more useful than sysrq-t because it does not overflow the
dmesg like sysrq-t often does, can be read in a finegrained per-task
way and can be read programmatically as well.

It works on sleeping and running tasks as well.

Also, move up PROC_BLOCK_SIZE a bit so that proc_pid_stack() can use it.

[ [email protected]: small cleanups, comments ]

Signed-off-by: Ken Chen <[email protected]>
Acked-by: Alexey Dobriyan <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/filesystems/proc.txt | 1 +
fs/proc/base.c | 52 ++++++++++++++++++++++++++++++++++-
2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index bcceb99..11f5b75 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -139,6 +139,7 @@ Table 1-1: Process specific entries in /proc
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack Report full stack trace, enable via CONFIG_STACKTRACE
smaps Extension based on maps, the rss size for each mapped file
..............................................................................

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3f..805e514 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -130,6 +131,12 @@ struct pid_entry {
{ .proc_show = &proc_##OTYPE } )

/*
+ * buffer size used for proc read. See proc_info_read().
+ * 4K page size but our output routines use some slack for overruns
+ */
+#define PROC_BLOCK_SIZE (3*1024)
+
+/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
@@ -340,6 +347,46 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_KALLSYMS */

+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH 64
+
+static int proc_pid_stack(struct task_struct *task, char *buffer)
+{
+ struct stack_trace trace;
+ unsigned long *entries;
+ int i, len = 0;
+
+ entries = kmalloc(sizeof(*entries)*MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_TRACE_DEPTH;
+ trace.entries = entries;
+ trace.skip = 0;
+
+ /*
+ * Protect against the task exiting (and deallocating its
+ * stack, etc.) while we save its backtrace:
+ */
+ read_lock(&tasklist_lock);
+ save_stack_trace_tsk(task, &trace);
+ read_unlock(&tasklist_lock);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ len += snprintf(buffer + len, PROC_BLOCK_SIZE - len,
+ "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
+ if (!len)
+ break;
+ }
+ kfree(entries);
+
+ return len;
+}
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/*
* Provides /proc/PID/schedstat
@@ -688,8 +735,6 @@ static const struct file_operations proc_mountstats_operations = {
.release = mounts_release,
};

-#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
-
static ssize_t proc_info_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2491,6 +2536,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_KALLSYMS
INF("wchan", S_IRUGO, pid_wchan),
#endif
+#ifdef CONFIG_STACKTRACE
+ INF("stack", S_IRUSR, pid_stack),
+#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif

2008-11-07 08:17:35

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 07, 2008 at 08:59:25AM +0100, Ingo Molnar wrote:
>
> * Ingo Molnar <[email protected]> wrote:
>
> > > > + * buffer size used for proc read. See proc_info_read().
> > > > + * 4K page size but our output routines use some slack for overruns
> > > > + */
> > > > +#define PROC_BLOCK_SIZE (3*1024)
> >
> > That sounds like a proper limit - the hard limit for this particular
> > printout function is 4096-170, so we are well within the
> > PROC_BLOCK_SIZE range.
>
> ok, i've added Ken's patch to tip/core/stacktrace and started testing
> it.
>
> Alexey, i've added your Acked-by because you appeared to like the
> patch - let me know if i should remove it.

Of course, I don't like it!

Switch to seqfiles, add entry in TID table as well.

The idea is good, though.

2008-11-07 08:33:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Alexey Dobriyan <[email protected]> wrote:

> On Fri, Nov 07, 2008 at 08:59:25AM +0100, Ingo Molnar wrote:
> >
> > * Ingo Molnar <[email protected]> wrote:
> >
> > > > > + * buffer size used for proc read. See proc_info_read().
> > > > > + * 4K page size but our output routines use some slack for overruns
> > > > > + */
> > > > > +#define PROC_BLOCK_SIZE (3*1024)
> > >
> > > That sounds like a proper limit - the hard limit for this particular
> > > printout function is 4096-170, so we are well within the
> > > PROC_BLOCK_SIZE range.
> >
> > ok, i've added Ken's patch to tip/core/stacktrace and started testing
> > it.
> >
> > Alexey, i've added your Acked-by because you appeared to like the
> > patch - let me know if i should remove it.
>
> Of course, I don't like it!
>
> Switch to seqfiles, add entry in TID table as well.
>
> The idea is good, though.

oh well - Ken, could you please switch it to seqfiles?

It should be something like this to convert the currently sweet and
trivial function into a much more complex seqfile iterator splitup:

- the ::start method does the kmalloc of a loop state structure like
this:

{
struct stack_trace backtrace;
unsigned long entries[MAX_STACK_TRACE_DEPTH];

int iterator;
}

and saves the trace. (struct stack_trace trace can be on-stack, it's
only needed for the save_stack_trace() - and we keep the entries
after that.)

- the ::stop method does the kfree of the loop state.

- the ::show method prints a single line, based on ->entries[->iterator]

- the ::next method does ->iterator++, and returns NULL if iterator
reaches ->backtrace.nr_entries.

it will be more source code, larger kernel image, it will be more
fragile and will be harder to review, and it wont actually matter in
practice because 99.9999% of the backtraces we care about have a size
smaller than 3K. (and where they get larger clipping them to the first
3K is perfectly fine)

Ingo

2008-11-07 08:46:49

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 07, 2008 at 09:32:49AM +0100, Ingo Molnar wrote:
>
> * Alexey Dobriyan <[email protected]> wrote:
>
> > On Fri, Nov 07, 2008 at 08:59:25AM +0100, Ingo Molnar wrote:
> > >
> > > * Ingo Molnar <[email protected]> wrote:
> > >
> > > > > > + * buffer size used for proc read. See proc_info_read().
> > > > > > + * 4K page size but our output routines use some slack for overruns
> > > > > > + */
> > > > > > +#define PROC_BLOCK_SIZE (3*1024)
> > > >
> > > > That sounds like a proper limit - the hard limit for this particular
> > > > printout function is 4096-170, so we are well within the
> > > > PROC_BLOCK_SIZE range.
> > >
> > > ok, i've added Ken's patch to tip/core/stacktrace and started testing
> > > it.
> > >
> > > Alexey, i've added your Acked-by because you appeared to like the
> > > patch - let me know if i should remove it.
> >
> > Of course, I don't like it!
> >
> > Switch to seqfiles, add entry in TID table as well.
> >
> > The idea is good, though.
>
> oh well - Ken, could you please switch it to seqfiles?
>
> It should be something like this to convert the currently sweet and
> trivial function into a much more complex seqfile iterator splitup:
>
> - the ::start method does the kmalloc of a loop state structure like
> this:
>
> {
> struct stack_trace backtrace;
> unsigned long entries[MAX_STACK_TRACE_DEPTH];
>
> int iterator;
> }
>
> and saves the trace. (struct stack_trace trace can be on-stack, it's
> only needed for the save_stack_trace() - and we keep the entries
> after that.)
>
> - the ::stop method does the kfree of the loop state.
>
> - the ::show method prints a single line, based on ->entries[->iterator]
>
> - the ::next method does ->iterator++, and returns NULL if iterator
> reaches ->backtrace.nr_entries.
>
> it will be more source code, larger kernel image, it will be more
> fragile and will be harder to review, and it wont actually matter in
> practice because 99.9999% of the backtraces we care about have a size
> smaller than 3K. (and where they get larger clipping them to the first
> 3K is perfectly fine)

Or you can do all of this in ->show(), without start/next/stop:

for (i = 0; i < N; i++)
seq_printf(m, "[<%p>] %pS\n", x, x);

2008-11-07 08:54:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Alexey Dobriyan <[email protected]> wrote:

> On Fri, Nov 07, 2008 at 09:32:49AM +0100, Ingo Molnar wrote:
> >
> > * Alexey Dobriyan <[email protected]> wrote:
> >
> > > On Fri, Nov 07, 2008 at 08:59:25AM +0100, Ingo Molnar wrote:
> > > >
> > > > * Ingo Molnar <[email protected]> wrote:
> > > >
> > > > > > > + * buffer size used for proc read. See proc_info_read().
> > > > > > > + * 4K page size but our output routines use some slack for overruns
> > > > > > > + */
> > > > > > > +#define PROC_BLOCK_SIZE (3*1024)
> > > > >
> > > > > That sounds like a proper limit - the hard limit for this particular
> > > > > printout function is 4096-170, so we are well within the
> > > > > PROC_BLOCK_SIZE range.
> > > >
> > > > ok, i've added Ken's patch to tip/core/stacktrace and started testing
> > > > it.
> > > >
> > > > Alexey, i've added your Acked-by because you appeared to like the
> > > > patch - let me know if i should remove it.
> > >
> > > Of course, I don't like it!
> > >
> > > Switch to seqfiles, add entry in TID table as well.
> > >
> > > The idea is good, though.
> >
> > oh well - Ken, could you please switch it to seqfiles?
> >
> > It should be something like this to convert the currently sweet and
> > trivial function into a much more complex seqfile iterator splitup:
> >
> > - the ::start method does the kmalloc of a loop state structure like
> > this:
> >
> > {
> > struct stack_trace backtrace;
> > unsigned long entries[MAX_STACK_TRACE_DEPTH];
> >
> > int iterator;
> > }
> >
> > and saves the trace. (struct stack_trace trace can be on-stack, it's
> > only needed for the save_stack_trace() - and we keep the entries
> > after that.)
> >
> > - the ::stop method does the kfree of the loop state.
> >
> > - the ::show method prints a single line, based on ->entries[->iterator]
> >
> > - the ::next method does ->iterator++, and returns NULL if iterator
> > reaches ->backtrace.nr_entries.
> >
> > it will be more source code, larger kernel image, it will be more
> > fragile and will be harder to review, and it wont actually matter in
> > practice because 99.9999% of the backtraces we care about have a size
> > smaller than 3K. (and where they get larger clipping them to the first
> > 3K is perfectly fine)
>
> Or you can do all of this in ->show(), without start/next/stop:
>
> for (i = 0; i < N; i++)
> seq_printf(m, "[<%p>] %pS\n", x, x);

hm, is that preferred over the current fs/proc/base.c code?

If that's the preferred way of doing these things, why arent all the
single-page procfs functions converted over to seq_file:

#ifdef CONFIG_KALLSYMS
INF("wchan", S_IRUGO, pid_wchan),
#endif
+#ifdef CONFIG_STACKTRACE
+ INF("stack", S_IRUSR, pid_stack),
+#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif

?

Really, please realize what happened here. All this unnecessary work
comes from Ken just having done the _natural_ thing when extending the
existing upstream code: using the existing fs/proc/base.c methods as a
template to write new code. If those templates use outdated APIs, then
new code will be "outdated" too - and this confusion will go on
forever.

Ingo

2008-11-07 09:03:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


btw., the feature works beautifully:

task sleeping/blocked:

# cat /proc/1/stack
[<ffffffff802bfe75>] do_select+0x51a/0x582
[<ffffffff802c0059>] core_sys_select+0x17c/0x218
[<ffffffff802c0344>] sys_select+0x99/0xc1
[<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

task running on this CPU:

# cat /proc/self/stack
[<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
[<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
[<ffffffff802f6da3>] proc_info_read+0x68/0xba
[<ffffffff802b2f17>] vfs_read+0xa9/0xe3
[<ffffffff802b301f>] sys_read+0x4c/0x73
[<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

task running on another CPU in user-space:

# cat /proc/18579/stack
[<ffffffffffffffff>] 0xffffffffffffffff

task running on another CPU in kernel-space:

# cat /proc/18579/stack
[<ffffffff8026f111>] audit_syscall_exit+0x9c/0xed
[<ffffffff80214b49>] syscall_trace_leave+0x94/0xbb
[<ffffffffffffffff>] 0xffffffffffffffff

Ingo

2008-11-07 09:18:01

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, 7 Nov 2008 10:03:04 +0100 Ingo Molnar <[email protected]> wrote:

> task running on this CPU:
>
> # cat /proc/self/stack
> [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> [<ffffffff802b301f>] sys_read+0x4c/0x73
> [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

So we provide a means by which process A can sample process B's
instruction pointer? Even if it's in random.c or crypto code? There's
a little project for someone.

I guess the 0400 mode on that file will suffice...

2008-11-07 09:29:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Andrew Morton <[email protected]> wrote:

> On Fri, 7 Nov 2008 10:03:04 +0100 Ingo Molnar <[email protected]> wrote:
>
> > task running on this CPU:
> >
> > # cat /proc/self/stack
> > [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> > [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> > [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> > [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> > [<ffffffff802b301f>] sys_read+0x4c/0x73
> > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> So we provide a means by which process A can sample process B's
> instruction pointer? Even if it's in random.c or crypto code?
> There's a little project for someone.

yes - like "echo p > /proc/sysrq-trigger" and sysrq-t. Although unlike
sysrq-p, the IP itself isnt printed, just the stack trace - but
there's indeed a correlation.

> I guess the 0400 mode on that file will suffice...

correct, 0400 is used already in the present patch:

phoenix:~> cat /proc/1/stack
cat: /proc/1/stack: Permission denied

but that is _not_ enough, it should be narrowed even more, to the
boundaries that i pointed out in my first review feedback mail, and
which is not implemented yet:

1) only root should be allowed to do this - i.e. file needs to be
root-owned.

2) there also needs to be a .config entry for folks to be able to
turn it off altogether - just like folks can turn off sysrq-t
dumping via the .config.

Ingo

2008-11-07 17:48:53

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 07, 2008 at 10:29:02AM +0100, Ingo Molnar wrote:
>
> * Andrew Morton <[email protected]> wrote:
>
> > On Fri, 7 Nov 2008 10:03:04 +0100 Ingo Molnar <[email protected]> wrote:
> >
> > > task running on this CPU:
> > >
> > > # cat /proc/self/stack
> > > [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> > > [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> > > [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> > > [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> > > [<ffffffff802b301f>] sys_read+0x4c/0x73
> > > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > So we provide a means by which process A can sample process B's
> > instruction pointer? Even if it's in random.c or crypto code?
> > There's a little project for someone.
>
> yes - like "echo p > /proc/sysrq-trigger" and sysrq-t. Although unlike
> sysrq-p, the IP itself isnt printed, just the stack trace - but
> there's indeed a correlation.
>
> > I guess the 0400 mode on that file will suffice...
>
> correct, 0400 is used already in the present patch:
>
> phoenix:~> cat /proc/1/stack
> cat: /proc/1/stack: Permission denied
>
> but that is _not_ enough, it should be narrowed even more, to the
> boundaries that i pointed out in my first review feedback mail, and
> which is not implemented yet:
>
> 1) only root should be allowed to do this - i.e. file needs to be
> root-owned.
>
> 2) there also needs to be a .config entry for folks to be able to
> turn it off altogether - just like folks can turn off sysrq-t
> dumping via the .config.

In the name of everything holy, don't add another config option.
It's a _tiny_ piece of code and you have select STACKTRACE via
fault injection, latencytop or lockdep before that.

2008-11-07 18:38:27

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 7, 2008 at 12:32 AM, Ingo Molnar <[email protected]> wrote:
> oh well - Ken, could you please switch it to seqfiles?

On Fri, Nov 7, 2008 at 12:49 AM, Alexey Dobriyan wrote:
> Or you can do all of this in ->show(), without start/next/stop:
>
> for (i = 0; i < N; i++)
> seq_printf(m, "[<%p>] %pS\n", x, x);

Here is a patch convert to seq_file using proc_single_show() helper.
I don't see it to be any superior than what was before using
snprintf(). seq_read also allocate one page for printf buffer, so
functionally it is the same (perhaps a little bit better because it
can use the whole page, compare to PROC_BLOCK_SIZE currently). Ingo,
it's your call whether to merge this patch or not.

I also agree that using full blown seqfile start/show/stop won't add
value because most valuable stack at the top are already printed.


Signed-off-by: Ken Chen <[email protected]>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 805e514..6d294a4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -351,11 +351,12 @@ static int proc_pid_wchan

#define MAX_STACK_TRACE_DEPTH 64

-static int proc_pid_stack(struct task_struct *task, char *buffer)
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
struct stack_trace trace;
unsigned long *entries;
- int i, len = 0;
+ int i;

entries = kmalloc(sizeof(*entries)*MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
if (!entries)
@@ -375,15 +376,12 @@ static int proc_pid_stack
read_unlock(&tasklist_lock);

for (i = 0; i < trace.nr_entries; i++) {
- len += snprintf(buffer + len, PROC_BLOCK_SIZE - len,
- "[<%p>] %pS\n",
- (void *)entries[i], (void *)entries[i]);
- if (!len)
- break;
+ seq_printf(m, "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
}
kfree(entries);

- return len;
+ return 0;
}
#endif

@@ -2537,7 +2535,7 @@ static const struct pid_entry tgid_base_stuff[]
INF("wchan", S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- INF("stack", S_IRUSR, pid_stack),
+ ONE("stack", S_IRUSR, pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),

2008-11-07 18:46:32

by Paul Menage

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 7, 2008 at 10:38 AM, Ken Chen <[email protected]> wrote:
>
> Here is a patch convert to seq_file using proc_single_show() helper.
> I don't see it to be any superior than what was before using
> snprintf(). seq_read also allocate one page for printf buffer, so
> functionally it is the same (perhaps a little bit better because it
> can use the whole page, compare to PROC_BLOCK_SIZE currently).

No - if you exceed the original buffer size, the seq_file system will
double up the memory buffer and try again if necessary.

Paul

2008-11-08 12:06:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Alexey Dobriyan <[email protected]> wrote:

> > 1) only root should be allowed to do this - i.e. file needs to be
> > root-owned.
> >
> > 2) there also needs to be a .config entry for folks to be able to
> > turn it off altogether - just like folks can turn off sysrq-t
> > dumping via the .config.
>
> In the name of everything holy, don't add another config option.
> It's a _tiny_ piece of code and you have select STACKTRACE via fault
> injection, latencytop or lockdep before that.

We could make it depend on DEBUG_KERNEL or so, and always select
STACKTRACE infrastructure when DEBUG_KERNEL is enabled? There's no
reason not to get high quality backtraces via /proc when one debugs
the kernel.

Ingo

2008-11-08 12:11:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Ken Chen <[email protected]> wrote:

> On Fri, Nov 7, 2008 at 12:32 AM, Ingo Molnar <[email protected]> wrote:
> > oh well - Ken, could you please switch it to seqfiles?
>
> On Fri, Nov 7, 2008 at 12:49 AM, Alexey Dobriyan wrote:
> > Or you can do all of this in ->show(), without start/next/stop:
> >
> > for (i = 0; i < N; i++)
> > seq_printf(m, "[<%p>] %pS\n", x, x);
>
> Here is a patch convert to seq_file using proc_single_show() helper.
> I don't see it to be any superior than what was before using
> snprintf(). seq_read also allocate one page for printf buffer, so
> functionally it is the same (perhaps a little bit better because it
> can use the whole page, compare to PROC_BLOCK_SIZE currently).
> Ingo, it's your call whether to merge this patch or not.
>
> I also agree that using full blown seqfile start/show/stop won't add
> value because most valuable stack at the top are already printed.

Well, it removes 2 lines of code and makes the iteration slightly more
resilient (as there's no 'ret' value to be maintained anymore), so
i've applied your patch to tip/core/stacktrace - thanks Ken! (see the
commit below)

Sidenote: it would still be nice if the procfs folks converted the
old-style code there to the new seqfile APIs, before requiring
everyone _else_ to follow those guidelines?

Ingo

----------------->
>From 35f0b5fd7fab907a1119eaa614d9b24e5e225755 Mon Sep 17 00:00:00 2001
From: Ken Chen <[email protected]>
Date: Fri, 7 Nov 2008 10:38:04 -0800
Subject: [PATCH] stacktrace: convert /proc/<pid>/stack to seqfiles

Here is a patch convert to seq_file using proc_single_show() helper.

Signed-off-by: Ken Chen <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
fs/proc/base.c | 16 +++++++---------
1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 805e514..6d294a4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -351,11 +351,12 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)

#define MAX_STACK_TRACE_DEPTH 64

-static int proc_pid_stack(struct task_struct *task, char *buffer)
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
{
struct stack_trace trace;
unsigned long *entries;
- int i, len = 0;
+ int i;

entries = kmalloc(sizeof(*entries)*MAX_STACK_TRACE_DEPTH, GFP_KERNEL);
if (!entries)
@@ -375,15 +376,12 @@ static int proc_pid_stack(struct task_struct *task, char *buffer)
read_unlock(&tasklist_lock);

for (i = 0; i < trace.nr_entries; i++) {
- len += snprintf(buffer + len, PROC_BLOCK_SIZE - len,
- "[<%p>] %pS\n",
- (void *)entries[i], (void *)entries[i]);
- if (!len)
- break;
+ seq_printf(m, "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
}
kfree(entries);

- return len;
+ return 0;
}
#endif

@@ -2537,7 +2535,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("wchan", S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- INF("stack", S_IRUSR, pid_stack),
+ ONE("stack", S_IRUSR, pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),

2008-11-09 18:05:29

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Sat, Nov 08, 2008 at 01:10:54PM +0100, Ingo Molnar wrote:
> Sidenote: it would still be nice if the procfs folks converted the
> old-style code there to the new seqfile APIs, before requiring
> everyone _else_ to follow those guidelines?

For every existing non seqfile /proc file, there may be (and was demonstrated)
some userspace which is doing pread(2) on it and seqfiles don't support pread
currently. Obviously, no such userspace exist for /proc/*/stack.

2008-11-10 08:41:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Alexey Dobriyan <[email protected]> wrote:

> On Sat, Nov 08, 2008 at 01:10:54PM +0100, Ingo Molnar wrote:
> > Sidenote: it would still be nice if the procfs folks converted the
> > old-style code there to the new seqfile APIs, before requiring
> > everyone _else_ to follow those guidelines?
>
> For every existing non seqfile /proc file, there may be (and was
> demonstrated) some userspace which is doing pread(2) on it and
> seqfiles don't support pread currently. Obviously, no such userspace
> exist for /proc/*/stack.

ok, i then dont understand why we are advocating seqfile use, while
seqfiles are inferior replacements in certain aspects (no pread(2)
support). Adding pread(2) support would remove all doubt, and it could
be converted all across the spectrum, eliminating any confusion about
which facility to use, right?

Ingo

2008-11-10 23:55:29

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, 7 Nov 2008 10:29:02 +0100
Ingo Molnar <[email protected]> wrote:

> > I guess the 0400 mode on that file will suffice...
>
> correct, 0400 is used already in the present patch:
>
> phoenix:~> cat /proc/1/stack
> cat: /proc/1/stack: Permission denied
>
> but that is _not_ enough, it should be narrowed even more, to the
> boundaries that i pointed out in my first review feedback mail, and
> which is not implemented yet:
>
> 1) only root should be allowed to do this - i.e. file needs to be
> root-owned.
>
> 2) there also needs to be a .config entry for folks to be able to
> turn it off altogether - just like folks can turn off sysrq-t
> dumping via the .config.

Doing the above is desirable for another reason: given our rather
erratic history with the stack backtracer, this /proc file is possibly
a convenient way of oopsing the kernel, sending of off into la-la-land, etc.

2008-11-11 10:01:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Andrew Morton <[email protected]> wrote:

> On Fri, 7 Nov 2008 10:29:02 +0100
> Ingo Molnar <[email protected]> wrote:
>
> > > I guess the 0400 mode on that file will suffice...
> >
> > correct, 0400 is used already in the present patch:
> >
> > phoenix:~> cat /proc/1/stack
> > cat: /proc/1/stack: Permission denied
> >
> > but that is _not_ enough, it should be narrowed even more, to the
> > boundaries that i pointed out in my first review feedback mail, and
> > which is not implemented yet:
> >
> > 1) only root should be allowed to do this - i.e. file needs to be
> > root-owned.
> >
> > 2) there also needs to be a .config entry for folks to be able to
> > turn it off altogether - just like folks can turn off sysrq-t
> > dumping via the .config.
>
> Doing the above is desirable for another reason: given our rather
> erratic history with the stack backtracer, this /proc file is
> possibly a convenient way of oopsing the kernel, sending of off into
> la-la-land, etc.

the stack tracer is rock-solid on x86 since Arjan started cleaning up
the backtracing mess which we indeed had in x86 for years:

- adding frame-pointer support to 64-bit to improve the
quality of stack-traces

- eliminating the broken and fragile dwarf2-unwinder

- expanding the use of the generic stacktrace infrastructure to
lockdep, ftrace and other areas of code

if you know about any remaining fragility please holler, we havent had
a backtracer induced oops for a long time :-)

Ingo

2008-11-11 12:26:33

by Marcin Slusarz

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Fri, Nov 07, 2008 at 10:03:04AM +0100, Ingo Molnar wrote:
>
> btw., the feature works beautifully:
>
> task sleeping/blocked:
>
> # cat /proc/1/stack
> [<ffffffff802bfe75>] do_select+0x51a/0x582
> [<ffffffff802c0059>] core_sys_select+0x17c/0x218
> [<ffffffff802c0344>] sys_select+0x99/0xc1
> [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff
>
> task running on this CPU:
>
> # cat /proc/self/stack
> [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> [<ffffffff802b301f>] sys_read+0x4c/0x73
> [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff
>
> task running on another CPU in user-space:
>
> # cat /proc/18579/stack
> [<ffffffffffffffff>] 0xffffffffffffffff

so this file provides view of _kernel_ stack only?
shouldn't it be named kernel-stack then?

> task running on another CPU in kernel-space:
>
> # cat /proc/18579/stack
> [<ffffffff8026f111>] audit_syscall_exit+0x9c/0xed
> [<ffffffff80214b49>] syscall_trace_leave+0x94/0xbb
> [<ffffffffffffffff>] 0xffffffffffffffff
>
> Ingo
> --

2008-11-11 12:30:20

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

On Tue, Nov 11, 2008 at 01:25:36PM +0100, Marcin Slusarz wrote:
> On Fri, Nov 07, 2008 at 10:03:04AM +0100, Ingo Molnar wrote:
> >
> > btw., the feature works beautifully:
> >
> > task sleeping/blocked:
> >
> > # cat /proc/1/stack
> > [<ffffffff802bfe75>] do_select+0x51a/0x582
> > [<ffffffff802c0059>] core_sys_select+0x17c/0x218
> > [<ffffffff802c0344>] sys_select+0x99/0xc1
> > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > task running on this CPU:
> >
> > # cat /proc/self/stack
> > [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> > [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> > [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> > [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> > [<ffffffff802b301f>] sys_read+0x4c/0x73
> > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > task running on another CPU in user-space:
> >
> > # cat /proc/18579/stack
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> so this file provides view of _kernel_ stack only?

Of course!

> shouldn't it be named kernel-stack then?
>
> > task running on another CPU in kernel-space:
> >
> > # cat /proc/18579/stack
> > [<ffffffff8026f111>] audit_syscall_exit+0x9c/0xed
> > [<ffffffff80214b49>] syscall_trace_leave+0x94/0xbb
> > [<ffffffffffffffff>] 0xffffffffffffffff

2008-11-11 13:20:35

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Marcin Slusarz <[email protected]> wrote:

> On Fri, Nov 07, 2008 at 10:03:04AM +0100, Ingo Molnar wrote:
> >
> > btw., the feature works beautifully:
> >
> > task sleeping/blocked:
> >
> > # cat /proc/1/stack
> > [<ffffffff802bfe75>] do_select+0x51a/0x582
> > [<ffffffff802c0059>] core_sys_select+0x17c/0x218
> > [<ffffffff802c0344>] sys_select+0x99/0xc1
> > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > task running on this CPU:
> >
> > # cat /proc/self/stack
> > [<ffffffff80216f79>] save_stack_trace_tsk+0x26/0x44
> > [<ffffffff802f59a5>] proc_pid_stack+0x6e/0xd3
> > [<ffffffff802f6da3>] proc_info_read+0x68/0xba
> > [<ffffffff802b2f17>] vfs_read+0xa9/0xe3
> > [<ffffffff802b301f>] sys_read+0x4c/0x73
> > [<ffffffff8020c23b>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > task running on another CPU in user-space:
> >
> > # cat /proc/18579/stack
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> so this file provides view of _kernel_ stack only?
> shouldn't it be named kernel-stack then?

it prints the kernel stack right now, but i'd not restrict it to the
kernel stack conceptually: i think we could eventually expand it to
print the user-space portion of the stack as well. (in the case when
user-space is built with frame pointers) We've got code for that in
the kernel already. It would be an easy one-stop-shop for full-range.

Ingo

2008-11-11 14:04:15

by Mikael Pettersson

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace

Ingo Molnar writes:
> > > # cat /proc/18579/stack
> > > [<ffffffffffffffff>] 0xffffffffffffffff
> >
> > so this file provides view of _kernel_ stack only?
> > shouldn't it be named kernel-stack then?
>
> it prints the kernel stack right now, but i'd not restrict it to the
> kernel stack conceptually: i think we could eventually expand it to
> print the user-space portion of the stack as well. (in the case when
> user-space is built with frame pointers) We've got code for that in
> the kernel already. It would be an easy one-stop-shop for full-range.

That would be quite fragile given the fact that user-space
only has to follow standard ABIs at specific points like
calls to standard library functions. In between, anything
can, and does, happen.

2008-11-11 14:20:03

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] add /proc/pid/stack to dump task's stack trace


* Mikael Pettersson <[email protected]> wrote:

> Ingo Molnar writes:
> > > > # cat /proc/18579/stack
> > > > [<ffffffffffffffff>] 0xffffffffffffffff
> > >
> > > so this file provides view of _kernel_ stack only?
> > > shouldn't it be named kernel-stack then?
> >
> > it prints the kernel stack right now, but i'd not restrict it to
> > the kernel stack conceptually: i think we could eventually expand
> > it to print the user-space portion of the stack as well. (in the
> > case when user-space is built with frame pointers) We've got code
> > for that in the kernel already. It would be an easy one-stop-shop
> > for full-range.
>
> That would be quite fragile given the fact that user-space only has
> to follow standard ABIs at specific points like calls to standard
> library functions. In between, anything can, and does, happen.

it's not fragile to robustly walk the userspace stack. The result
might not always be meaningful of course.

Ingo