2008-11-06 05:58:29

by Ken Chen

[permalink] [raw]
Subject: [patch] sched: fix single-depth wchan output

To get a meaningful /proc/<pid>/wchan, one is required to turn on full
frame pointer when compile kernel/sched.c on x86 arch. The enabling of
frame pointer applies to entire kernel/sched.c and affects lots of other
core scheduler functions that aren't related to wchan's call stack
unwind. This causes unnecessary expansion of stack pointer push and pop
on the stack for scheduler functions. To cut down the cost of frame
pointer push/pop, one can use compile time config option 'single-depth
wchan'. However, the 'single-depth' option is broken on x86 due to lack
of stack frame marker and simple stack unwind doesn't work, i.e., wchan
always produces '0'.

This patch adds call site location explicitly in thread_struct for
schedule() function so that get_wchan() can reliably get the data and at
the same time not to overly burden the entire kernel/sched.c with frame
pointer generation. The remove of frame pointer dependency allows
compiler to generate better and faster core scheduler code on x86_64.

Signed-off-by: Ken Chen <[email protected]>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e60c59b..9951853 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select SCHED_NO_NO_OMIT_FRAME_POINTER

config ARCH_DEFCONFIG
string
@@ -367,7 +368,7 @@ config X86_RDC321X
config SCHED_NO_NO_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
- depends on X86_32
+ depends on X86
help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
index 5ca01e3..1d2ff70 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -401,6 +401,7 @@ struct thread_struct {
unsigned long ip;
unsigned long fs;
unsigned long gs;
+ unsigned long wchan;
/* Hardware debugging registers: */
unsigned long debugreg0;
unsigned long debugreg1;
@@ -603,6 +604,12 @@ extern void release_thread(struct task_struct *);
extern void prepare_to_copy(struct task_struct *tsk);

unsigned long get_wchan(struct task_struct *p);
+#define set_wchan(task, ip) do { (task)->thread.wchan = (ip); } while (0)
+#define set_wchan_cond(task, ip) do { \
+ unsigned long *__wchan = &(task)->thread.wchan; \
+ if (!__wchan) \
+ *__wchan = (ip); \
+} while (0)

/*
* Generic CPUID function
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302f..ba02359 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -697,26 +697,10 @@ out:

unsigned long get_wchan(struct task_struct *p)
{
- unsigned long bp, sp, ip;
- unsigned long stack_page;
- int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
- stack_page = (unsigned long)task_stack_page(p);
- sp = p->thread.sp;
- if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
- return 0;
- /* include/asm-i386/system.h:switch_to() pushes bp last. */
- bp = *(unsigned long *) sp;
- do {
- if (bp < stack_page || bp > top_ebp+stack_page)
- return 0;
- ip = *(unsigned long *) (bp+4);
- if (!in_sched_functions(ip))
- return ip;
- bp = *(unsigned long *) bp;
- } while (count++ < 16);
- return 0;
+
+ return p->thread.wchan;
}

unsigned long arch_align_stack(unsigned long sp)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120..222029b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -739,26 +739,10 @@ asmlinkage long sys_vfork(struct pt_regs *regs)

unsigned long get_wchan(struct task_struct *p)
{
- unsigned long stack;
- u64 fp, ip;
- int count = 0;
-
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
- stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
- return 0;
- fp = *(u64 *)(p->thread.sp);
- do {
- if (fp < (unsigned long)stack ||
- fp >= (unsigned long)stack+THREAD_SIZE)
- return 0;
- ip = *(u64 *)(fp+8);
- if (!in_sched_functions(ip))
- return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
- return 0;
+
+ return p->thread.wchan;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39..82f0b11 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -324,6 +324,11 @@ extern char __sched_text_start[], __sched_text_end[];
/* Is this address in the __sched functions? */
extern int in_sched_functions(unsigned long addr);

+#ifndef set_wchan
+#define set_wchan(task, ip)
+#define set_wchan_cond(task, ip)
+#endif
+
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern signed long schedule_timeout(signed long timeout);
extern signed long schedule_timeout_interruptible(signed long timeout);
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc..48b0965 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4477,6 +4477,7 @@ need_resched_nonpreemptible:
rq->curr = next;
++*switch_count;

+ set_wchan_cond(prev, _RET_IP_);
context_switch(rq, prev, next); /* unlocks the rq */
/*
* the context switch might have flipped the stack from under
@@ -4487,6 +4488,7 @@ need_resched_nonpreemptible:
} else
spin_unlock_irq(&rq->lock);

+ set_wchan(current, 0);
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;

@@ -4514,6 +4516,7 @@ asmlinkage void __sched preempt_schedule(void)
return;

do {
+ set_wchan(current, _RET_IP_);
add_preempt_count(PREEMPT_ACTIVE);
schedule();
sub_preempt_count(PREEMPT_ACTIVE);
@@ -4541,6 +4544,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
BUG_ON(ti->preempt_count || !irqs_disabled());

do {
+ set_wchan(current, _RET_IP_);
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
schedule();
@@ -5547,6 +5551,7 @@ asmlinkage long sys_sched_yield(void)
_raw_spin_unlock(&rq->lock);
preempt_enable_no_resched();

+ set_wchan(current, _RET_IP_);
schedule();

return 0;
@@ -5563,6 +5568,7 @@ static void __cond_resched(void)
* cond_resched() call.
*/
do {
+ set_wchan(current, _RET_IP_);
add_preempt_count(PREEMPT_ACTIVE);
schedule();
sub_preempt_count(PREEMPT_ACTIVE);
@@ -5646,6 +5652,7 @@ void __sched io_schedule(void)

delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ set_wchan(current, _RET_IP_);
schedule();
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf3..72def2f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1182,6 +1182,7 @@ signed long __sched schedule_timeout
struct timer_list timer;
unsigned long expire;

+ set_wchan(current, _RET_IP_);
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:


2008-11-06 06:11:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output


* Ken Chen <[email protected]> wrote:

> To get a meaningful /proc/<pid>/wchan, one is required to turn on
> full frame pointer when compile kernel/sched.c on x86 arch. The
> enabling of frame pointer applies to entire kernel/sched.c and
> affects lots of other core scheduler functions that aren't related
> to wchan's call stack unwind. This causes unnecessary expansion of
> stack pointer push and pop on the stack for scheduler functions. To
> cut down the cost of frame pointer push/pop, one can use compile
> time config option 'single-depth wchan'. However, the
> 'single-depth' option is broken on x86 due to lack of stack frame
> marker and simple stack unwind doesn't work, i.e., wchan always
> produces '0'.
>
> This patch adds call site location explicitly in thread_struct for
> schedule() function so that get_wchan() can reliably get the data
> and at the same time not to overly burden the entire kernel/sched.c
> with frame pointer generation. The remove of frame pointer
> dependency allows compiler to generate better and faster core
> scheduler code on x86_64.

hm, this adds overhead - and the thing is that WCHAN is rather
uninformative to begin with (because it's a single dimension), so we
should phase it out, not expand it.

How about adding a /proc/<PID>/stacktrace file that gives us the stack
trace of any task in the system? That would be useful for a number of
other purposes as well, and about 100 times more useful than wchan.
(often it would be more useful than sysrq-t dumps)

Hm?

Ingo

2008-11-06 06:17:18

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

On Wed, Nov 5, 2008 at 10:11 PM, Ingo Molnar <[email protected]> wrote:
> hm, this adds overhead - and the thing is that WCHAN is rather
> uninformative to begin with (because it's a single dimension), so we
> should phase it out, not expand it.
>
> How about adding a /proc/<PID>/stacktrace file that gives us the stack
> trace of any task in the system? That would be useful for a number of
> other purposes as well, and about 100 times more useful than wchan.
> (often it would be more useful than sysrq-t dumps)

Sure, my main motivation is to remove frame pointer generation. x86_64
unconditionally adds fp for kernel/sched.c right now. I'm all for
phasing out wchan if people don't think there is value in it.

- Ken

2008-11-06 06:30:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output


* Ken Chen <[email protected]> wrote:

> On Wed, Nov 5, 2008 at 10:11 PM, Ingo Molnar <[email protected]> wrote:
>
> > hm, this adds overhead - and the thing is that WCHAN is rather
> > uninformative to begin with (because it's a single dimension), so
> > we should phase it out, not expand it.
> >
> > How about adding a /proc/<PID>/stacktrace file that gives us the
> > stack trace of any task in the system? That would be useful for a
> > number of other purposes as well, and about 100 times more useful
> > than wchan. (often it would be more useful than sysrq-t dumps)
>
> Sure, my main motivation is to remove frame pointer generation.
> x86_64 unconditionally adds fp for kernel/sched.c right now. I'm
> all for phasing out wchan if people don't think there is value in
> it.

are you interested in adding /proc/<PID>/stacktrace? If yes then we
could remove fp generation for 64-bit right now and add your
stacktrace patch when you are done with it.

Generally we want frame pointers for high quality backtraces and
trouble-shooting. The small cost is almost always worth paying and
most distros enable framepointers for that reason. On 32-bit a
no-framepointers kernel image has less register pressure, but on
64-bit there's little reason to not enable them.

Ingo

2008-11-06 06:42:39

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

On Wed, Nov 5, 2008 at 10:30 PM, Ingo Molnar <[email protected]> wrote:
> are you interested in adding /proc/<PID>/stacktrace? If yes then we
> could remove fp generation for 64-bit right now and add your
> stacktrace patch when you are done with it.

Yes, we had a patch internally at google to dump task stack trace via
procfs just like what you suggested. I will get that patch out.

- Ken

2008-11-06 06:59:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output


* Ken Chen <[email protected]> wrote:

> On Wed, Nov 5, 2008 at 10:30 PM, Ingo Molnar <[email protected]> wrote:
> > are you interested in adding /proc/<PID>/stacktrace? If yes then we
> > could remove fp generation for 64-bit right now and add your
> > stacktrace patch when you are done with it.
>
> Yes, we had a patch internally at google to dump task stack trace
> via procfs just like what you suggested. I will get that patch out.

very nice!

If it does stack walking manually then please update it to use
save_stack_trace() instead - that is the standard API that will
utilize the best possible stack walking machinery on the architecture
level.

For security reasons we want it to be admin-only readable, and we also
want a .config for it for the extra paranoid and for CONFIG_EMBEDDED.

Ingo

2008-11-06 07:28:47

by Ken Chen

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

On Wed, Nov 5, 2008 at 10:59 PM, Ingo Molnar <[email protected]> wrote:
> If it does stack walking manually then please update it to use
> save_stack_trace() instead - that is the standard API that will
> utilize the best possible stack walking machinery on the architecture
> level.

OK, I pulled the patch out of our code that export stack trace via
/proc/pid/trace. I didn't write this patch, but I think a better choice
would be to override struct stacktrace_ops print_trace_ops with a memory
buffer pointer to dump the stack into. If you have any comments, please
let me know. I will polish this patch up and rebase to git-head.

--- linux/arch/x86/Kconfig 2008-11-05 22:30:06.000000000 -0800
+++ linux/arch/x86/Kconfig 2008-11-05 22:30:06.000000000 -0800
@@ -213,6 +213,11 @@

config KTIME_SCALAR
def_bool X86_32
+
+config ARCH_HAS_BUF_SHOW_STACK
+ bool
+ default y
+
source "init/Kconfig"

menu "Processor type and features"
--- linux/arch/x86/kernel/traps_64.c 2008-11-05 22:30:06.000000000 -0800
+++ linux/arch/x86/kernel/traps_64.c 2008-11-05 22:30:06.000000000 -0800
@@ -78,6 +78,11 @@
asmlinkage void machine_check(void);
asmlinkage void spurious_interrupt_bug(void);

+struct arch_unwind {
+ char **posp;
+ char *end;
+};
+
static unsigned int code_bytes = 64;

static inline void conditional_sti(struct pt_regs *regs)
@@ -104,22 +109,23 @@

int kstack_depth_to_print = 12;

-void printk_address(unsigned long address, int reliable)
-{
#ifdef CONFIG_KALLSYMS
+static void buf_printk_address(char **posp, char *end, unsigned long address,
+ int reliable)
+{
unsigned long offset = 0, symsize;
const char *symname;
char *modname;
char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
+ char namebuf[128];
char reliab[4] = "";

symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
+ &modname, namebuf);
if (!offset)
- return; /* We don't want to print function ptrs */
+ return; /* We don't want to print function ptrs */
if (!symname) {
- printk(" [<%016lx>]\n", address);
+ buf_printk(posp, end, " [<%016lx>]\n", address);
return;
}
if (!reliable)
@@ -127,11 +133,21 @@

if (!modname)
modname = delim = "";
- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
+ buf_printk(posp, end, " [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+ address, reliab, delim, modname, delim, symname, offset,
+ symsize);
+}
#else
- printk(" [<%016lx>]\n", address);
+static void buf_printk_address(char **posp, char *end, unsigned long address,
+ int reliable)
+{
+ buf_printk(posp, end, " [<%016lx>]\n", address);
+}
#endif
+
+void printk_address(unsigned long address, int reliable)
+{
+ buf_printk_address(NULL, NULL, address, reliable);
}

static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -357,25 +373,31 @@
static void
print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
- print_symbol(msg, symbol);
- printk("\n");
+ struct arch_unwind *unw = (struct arch_unwind *)data;
+
+ buf_print_symbol(unw->posp, unw->end, msg, symbol);
+ buf_printk(unw->posp, unw->end, "\n");
}

static void print_trace_warning(void *data, char *msg)
{
- printk("%s\n", msg);
+ struct arch_unwind *unw = (struct arch_unwind *)data;
+
+ buf_printk(unw->posp, unw->end, "%s\n", msg);
}

static int print_trace_stack(void *data, char *name)
{
- printk(" <%s> ", name);
+ struct arch_unwind *unw = (struct arch_unwind *)data;
+ buf_printk(unw->posp, unw->end, " <%s> ", name);
return 0;
}

static void print_trace_address(void *data, unsigned long addr, int reliable)
{
+ struct arch_unwind *unw = (struct arch_unwind *)data;
touch_nmi_watchdog();
- printk_address(addr, reliable);
+ buf_printk_address(unw->posp, unw->end, addr, reliable);
}

static const struct stacktrace_ops print_trace_ops = {
@@ -385,18 +407,29 @@
.address = print_trace_address,
};

+static void
+buf_show_trace(char **posp, char *end,
+ struct task_struct *tsk, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp) {
+ struct arch_unwind unw;
+ unw.posp = posp;
+ unw.end = end;
+
+ buf_printk(posp, end, "\nCall Trace:\n");
+ dump_trace(tsk, regs, stack, bp, &print_trace_ops, &unw);
+ buf_printk(posp, end, "\n");
+}
+
void
show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
unsigned long bp)
{
- printk("\nCall Trace:\n");
- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
- printk("\n");
+ buf_show_trace(NULL, NULL, tsk, regs, stack, bp);
}

static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
- unsigned long bp)
+_buf_show_stack(char **posp, char *end, struct task_struct *tsk,
+ struct pt_regs *regs, unsigned long *sp, unsigned long bp)
{
unsigned long *stack;
int i;
@@ -419,23 +452,29 @@
if (stack >= irqstack && stack <= irqstack_end) {
if (stack == irqstack_end) {
stack = (unsigned long *) (irqstack_end[-1]);
- printk(" <EOI> ");
+ buf_printk(posp, end, " <EOI> ");
}
} else {
if (((long) stack & (THREAD_SIZE-1)) == 0)
break;
}
if (i && ((i % 4) == 0))
- printk("\n");
- printk(" %016lx", *stack++);
+ buf_printk(posp, end, "\n");
+ buf_printk(posp, end, " %016lx", *stack++);
touch_nmi_watchdog();
}
- show_trace(tsk, regs, sp, bp);
+ buf_show_trace(posp, end, tsk, regs, sp, bp);
}

void show_stack(struct task_struct *tsk, unsigned long * sp)
{
- _show_stack(tsk, NULL, sp, 0);
+ _buf_show_stack(NULL, NULL, tsk, NULL, sp, 0);
+}
+
+void buf_show_stack(char **posp, char *end, struct task_struct *tsk,
+ unsigned long * sp)
+{
+ _buf_show_stack(posp, end, tsk, NULL, sp, 0);
}

/*
@@ -485,7 +524,8 @@
if (!user_mode(regs)) {
unsigned char c;
printk("Stack: ");
- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+ _buf_show_stack(NULL, NULL,
+ NULL, regs, (unsigned long *)sp, regs->bp);
printk("\n");

printk(KERN_EMERG "Code: ");
diff -u linux/fs/proc/base.c 2.6.26/fs/proc/base.c
--- linux/fs/proc/base.c 2008-11-05 22:27:42.000000000 -0800
+++ linux/fs/proc/base.c 2008-11-05 22:27:42.000000000 -0800
@@ -2271,6 +2271,31 @@

#endif

+static int proc_pid_trace(struct task_struct *task, char * buffer)
+{
+ char *pos, *end, **posp;
+
+ posp = &pos;
+ pos = buffer;
+ end = buffer + PAGE_SIZE;
+ memset(buffer, 0, PAGE_SIZE);
+
+ /*
+ * Add some safety checking
+ */
+
+ buffer[PAGE_SIZE - 1] = 1;
+ end--;
+
+ read_lock(&tasklist_lock);
+ buf_show_task(posp, end, task);
+ read_unlock(&tasklist_lock);
+
+ WARN_ON(buffer[PAGE_SIZE - 1] != 1);
+ WARN_ON(buffer[PAGE_SIZE - 2] != 0);
+ return (pos - buffer);
+}
+
#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -2574,6 +2599,7 @@
REG("smaps", S_IRUGO, smaps),
REG("pagemap", S_IRUSR, pagemap),
#endif
+ INF("trace", S_IFREG|S_IRUGO, pid_trace),
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
#endif
--- linux/include/linux/kallsyms.h 2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/kallsyms.h 2008-11-05 22:30:06.000000000 -0800
@@ -31,6 +31,8 @@

/* Look up a kernel symbol and print it to the kernel messages. */
extern void __print_symbol(const char *fmt, unsigned long address);
+extern void __buf_print_symbol(char **posp, char *end,
+ const char *fmt, unsigned long address);

int lookup_symbol_name(unsigned long addr, char *symname);
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name);
@@ -75,6 +77,7 @@

/* Stupid that this does nothing, but I didn't create this mess. */
#define __print_symbol(fmt, addr)
+#define __buf_print_symbol(posp, end, fmt, addr)
#endif /*CONFIG_KALLSYMS*/

/* This macro allows us to keep printk typechecking */
@@ -105,6 +108,14 @@
print_symbol(fmt, (unsigned long)addr);
}

+static inline void buf_print_symbol(char **posp, char *end,
+ const char *fmt, unsigned long addr)
+{
+ __check_printsym_format(fmt, "");
+ __buf_print_symbol(posp, end, fmt, (unsigned long)
+ __builtin_extract_return_addr((void *)addr));
+}
+
#ifndef CONFIG_64BIT
#define print_ip_sym(ip) \
do { \
--- linux/include/linux/kernel.h 2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/kernel.h 2008-11-05 22:30:06.000000000 -0800
@@ -224,6 +224,9 @@
extern void __attribute__((format(printf, 1, 2)))
early_printk(const char *fmt, ...);

+extern void buf_printk(char **posp, char *end, const char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4)));
+
void redisplay_console_messages(void);

unsigned long int_sqrt(unsigned long);
--- linux/include/linux/sched.h 2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/sched.h 2008-11-05 22:30:06.000000000 -0800
@@ -267,6 +267,7 @@
*/
extern void show_state_filter(unsigned long state_filter);

+extern void buf_show_task(char **posp, char *end, struct task_struct *p);
static inline void show_state(void)
{
show_state_filter(0);
@@ -280,7 +281,17 @@
* trace (or NULL if the entire call-chain of the task should be shown).
*/
extern void show_stack(struct task_struct *task, unsigned long *sp);
-
+#ifdef CONFIG_ARCH_HAS_BUF_SHOW_STACK
+extern void buf_show_stack(char **posp, char *end,
+ struct task_struct *task, unsigned long *sp);
+#else
+static inline void buf_show_stack(char **posp, char *end,
+ struct task_struct *task, unsigned long *sp)
+{
+ if (posp == NULL)
+ show_stack(task, sp);
+}
+#endif
void io_schedule(void);
long io_schedule_timeout(long timeout);

--- linux/kernel/kallsyms.c 2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/kallsyms.c 2008-11-05 22:30:06.000000000 -0800
@@ -319,13 +319,19 @@
}

/* Look up a kernel symbol and print it to the kernel messages. */
+void __buf_print_symbol(char **posp, char *end,
+ const char *fmt, unsigned long address)
+{
+ char buffer[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(buffer, address);
+
+ buf_printk(posp, end, fmt, buffer);
+}
+
void __print_symbol(const char *fmt, unsigned long address)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
-
- printk(fmt, buffer);
+ __buf_print_symbol(NULL, NULL, fmt, address);
}

/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
--- linux/kernel/printk.c 2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/printk.c 2008-11-05 22:30:06.000000000 -0800
@@ -655,6 +655,20 @@
return r;
}

+void buf_printk(char **posp, char *end, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+
+ if (posp != NULL)
+ *posp += vscnprintf(*posp, end - (*posp), fmt, args);
+ else
+ (void)vprintk(fmt, args);
+
+ va_end(args);
+}
+
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;

--- linux/kernel/sched.c 2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/sched.c 2008-11-05 22:30:06.000000000 -0800
@@ -5398,24 +5398,24 @@

static const char stat_nam[] = "RSDTtZX";

-void sched_show_task(struct task_struct *p)
+void buf_show_task(char **posp, char *end, struct task_struct *p)
{
unsigned long free = 0;
unsigned state;

state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ buf_printk(posp, end, KERN_INFO "%-13.13s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
+ buf_printk(posp, end, KERN_CONT " running ");
else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+ buf_printk(posp, end, KERN_CONT " %08lx ", thread_saved_pc(p));
#else
if (state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ buf_printk(posp, end, KERN_CONT " running task ");
else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+ buf_printk(posp, end, KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
@@ -5425,10 +5425,15 @@
free = (unsigned long)n - (unsigned long)end_of_stack(p);
}
#endif
- printk(KERN_CONT "%5lu %5d %6d\n", free,
+ buf_printk(posp, end, KERN_CONT "%5lu %5d %6d\n", free,
task_pid_nr(p), task_pid_nr(p->real_parent));

- show_stack(p, NULL);
+ buf_show_stack(posp, end, p, NULL);
+}
+
+void sched_show_task(struct task_struct *p)
+{
+ buf_show_task(NULL, NULL, p);
}

void show_state_filter(unsigned long state_filter)

2008-11-06 07:44:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output


* Ken Chen <[email protected]> wrote:

> On Wed, Nov 5, 2008 at 10:59 PM, Ingo Molnar <[email protected]> wrote:
> > If it does stack walking manually then please update it to use
> > save_stack_trace() instead - that is the standard API that will
> > utilize the best possible stack walking machinery on the architecture
> > level.
>
> OK, I pulled the patch out of our code that export stack trace via
> /proc/pid/trace. I didn't write this patch, but I think a better
> choice would be to override struct stacktrace_ops print_trace_ops
> with a memory buffer pointer to dump the stack into. If you have
> any comments, please let me know. I will polish this patch up and
> rebase to git-head.

hm, instead of modifying the lowlevel arch dump code, why not just use
the existing save_stack_trace(), and render the output yourself via a
trivial sprintf, just like kernel/lockdep.c does?

See kernel/stacktrace.c's print_stack_trace() - that could be extended
with a sprintf_stack_trace() method. Allocate a large enough buffer
dynamically, with a max of 128 stacktrace entries or so. (the output
buffer is limited to 4K-ish anyway, right?)

As a bonus this will work on every architecture, not just x86.

a few other details:

> - char namebuf[KSYM_NAME_LEN];
> + char namebuf[128];

... time machine back to old crappy code ;-)

> symname = kallsyms_lookup(address, &symsize, &offset,
> - &modname, namebuf);
> + &modname, namebuf);

ditto. But none of this has to be modified so you can just drop these
bits.

> + read_lock(&tasklist_lock);
> + buf_show_task(posp, end, task);
> + read_unlock(&tasklist_lock);

to get a stable trace we'll need more locking than that (tasklist_lock
does not exclude scheduling, etc.) - but it takes care of the most
important detail: tasks exiting from under us. So this should be OK.

> #endif
> + INF("trace", S_IFREG|S_IRUGO, pid_trace),

that needs to be r-------- instead of r--r--r--, for security reasons.

Ingo

2008-11-06 21:56:51

by Matt Mackall

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

On Thu, 2008-11-06 at 07:11 +0100, Ingo Molnar wrote:
> * Ken Chen <[email protected]> wrote:
>
> > To get a meaningful /proc/<pid>/wchan, one is required to turn on
> > full frame pointer when compile kernel/sched.c on x86 arch. The
> > enabling of frame pointer applies to entire kernel/sched.c and
> > affects lots of other core scheduler functions that aren't related
> > to wchan's call stack unwind. This causes unnecessary expansion of
> > stack pointer push and pop on the stack for scheduler functions. To
> > cut down the cost of frame pointer push/pop, one can use compile
> > time config option 'single-depth wchan'. However, the
> > 'single-depth' option is broken on x86 due to lack of stack frame
> > marker and simple stack unwind doesn't work, i.e., wchan always
> > produces '0'.
> >
> > This patch adds call site location explicitly in thread_struct for
> > schedule() function so that get_wchan() can reliably get the data
> > and at the same time not to overly burden the entire kernel/sched.c
> > with frame pointer generation. The remove of frame pointer
> > dependency allows compiler to generate better and faster core
> > scheduler code on x86_64.
>
> hm, this adds overhead - and the thing is that WCHAN is rather
> uninformative to begin with (because it's a single dimension), so we
> should phase it out, not expand it.

WCHAN is a long-standing public interface and isn't a Linuxism. I don't
think phasing it out is a good idea.

> How about adding a /proc/<PID>/stacktrace file that gives us the stack
> trace of any task in the system? That would be useful for a number of
> other purposes as well, and about 100 times more useful than wchan.
> (often it would be more useful than sysrq-t dumps)

But this is a great idea, of course.

--
Mathematics is the supreme nostalgia of our time.

2008-11-06 22:30:58

by Chris Friesen

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

Matt Mackall wrote:
> On Thu, 2008-11-06 at 07:11 +0100, Ingo Molnar wrote:

>>How about adding a /proc/<PID>/stacktrace file that gives us the stack
>>trace of any task in the system? That would be useful for a number of
>>other purposes as well, and about 100 times more useful than wchan.
>>(often it would be more useful than sysrq-t dumps)
>
>
> But this is a great idea, of course.

Agreed. I know some apps designers that would love to get their hands
on something like that...



Chris

2008-11-06 22:33:47

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch] sched: fix single-depth wchan output

Ingo Molnar <[email protected]> writes:
>
> are you interested in adding /proc/<PID>/stacktrace? If yes then we
> could remove fp generation for 64-bit right now and add your
> stacktrace patch when you are done with it.

FWIW there used to be a patch floating around doing that a couple of
years ago and even used in some distros. But the big problem was if
someone read all of proc it took a lot of CPU time to do all the stack
tracing. This also was triggerable from non root.

So yes it's useful, but it can be costly.

-Andi

--
[email protected]