LinuxLists.cc - [patch] larger kernel stack (8k->16k) per task

2002-02-08 15:17:53

Subject: [patch] larger kernel stack (8k->16k) per task

Hi,

In the light of some talks here about increasing kernel stack, here is my
patch for i386 architecture that some may find useful. It also has a nice
extra (/proc/stack) implemented by Hugh Dickins which helps to find major
offenders.

It is against 2.4.9 but should be easy to port in any direction. (One way
the patch could be improved is by making the size CONFIG_ option instead
of hardcoding). Oh btw, please don't tell me "but now you'd need _four_
physically-contiguous pages to create a task instead of two!" because I
know it (and think it's not too bad).

Regards,
Tigran

diff -urN 249-vx/arch/i386/kernel/entry.S 249-vx-bigstack/arch/i386/kernel/entry.S
--- 249-vx/arch/i386/kernel/entry.S Tue Jun 12 19:47:28 2001
+++ 249-vx-bigstack/arch/i386/kernel/entry.S Sun Jan 13 00:14:30 2002
@@ -130,7 +130,7 @@
.previous

#define GET_CURRENT(reg) \
- movl $-8192, reg; \
+ movl $-16384, reg; \
andl %esp, reg

ENTRY(lcall7)
@@ -145,7 +145,7 @@
movl %ecx,CS(%esp) #
movl %esp,%ebx
pushl %ebx
- andl $-8192,%ebx # GET_CURRENT
+ andl $-16384,%ebx # GET_CURRENT
movl exec_domain(%ebx),%edx # Get the execution domain
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x7
@@ -166,7 +166,7 @@
movl %ecx,CS(%esp) #
movl %esp,%ebx
pushl %ebx
- andl $-8192,%ebx # GET_CURRENT
+ andl $-16384,%ebx # GET_CURRENT
movl exec_domain(%ebx),%edx # Get the execution domain
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x27
diff -urN 249-vx/arch/i386/kernel/head.S 249-vx-bigstack/arch/i386/kernel/head.S
--- 249-vx/arch/i386/kernel/head.S Wed Jun 20 19:00:53 2001
+++ 249-vx-bigstack/arch/i386/kernel/head.S Sun Jan 13 00:14:30 2002
@@ -320,7 +320,7 @@
ret

ENTRY(stack_start)
- .long SYMBOL_NAME(init_task_union)+8192
+ .long SYMBOL_NAME(init_task_union)+16384
.long __KERNEL_DS

/* This is the default interrupt "handler" :-) */
diff -urN 249-vx/arch/i386/kernel/process.c 249-vx-bigstack/arch/i386/kernel/process.c
--- 249-vx/arch/i386/kernel/process.c Thu Jul 26 02:19:11 2001
+++ 249-vx-bigstack/arch/i386/kernel/process.c Sun Jan 13 00:14:30 2002
@@ -757,12 +757,12 @@
return 0;
stack_page = (unsigned long)p;
esp = p->thread.esp;
- if (!stack_page || esp < stack_page || esp > 8188+stack_page)
+ if (!stack_page || esp < stack_page || esp > 16380+stack_page)
return 0;
/* include/asm-i386/system.h:switch_to() pushes ebp last. */
ebp = *(unsigned long *) esp;
do {
- if (ebp < stack_page || ebp > 8184+stack_page)
+ if (ebp < stack_page || ebp > 16376+stack_page)
return 0;
eip = *(unsigned long *) (ebp+4);
if (eip < first_sched || eip >= last_sched)
diff -urN 249-vx/arch/i386/kernel/smpboot.c 249-vx-bigstack/arch/i386/kernel/smpboot.c
--- 249-vx/arch/i386/kernel/smpboot.c Tue Feb 13 22:13:43 2001
+++ 249-vx-bigstack/arch/i386/kernel/smpboot.c Sun Jan 13 00:14:30 2002
@@ -577,7 +577,7 @@

/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
+ stack_start.esp = (void *) idle->thread.esp;

/*
* This grunge runs the startup process for
diff -urN 249-vx/arch/i386/kernel/traps.c 249-vx-bigstack/arch/i386/kernel/traps.c
--- 249-vx/arch/i386/kernel/traps.c Sun Aug 12 19:13:59 2001
+++ 249-vx-bigstack/arch/i386/kernel/traps.c Sun Jan 13 00:14:30 2002
@@ -160,7 +160,7 @@
unsigned long esp = tsk->thread.esp;

/* User space on another CPU? */
- if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1))
+ if ((esp ^ (unsigned long)tsk) & ~(THREAD_SIZE-1))
return;
show_trace((unsigned long *)esp);
}
diff -urN 249-vx/arch/i386/lib/getuser.S 249-vx-bigstack/arch/i386/lib/getuser.S
--- 249-vx/arch/i386/lib/getuser.S Mon Jan 12 21:42:52 1998
+++ 249-vx-bigstack/arch/i386/lib/getuser.S Sun Jan 13 00:14:30 2002
@@ -28,7 +28,7 @@
.globl __get_user_1
__get_user_1:
movl %esp,%edx
- andl $0xffffe000,%edx
+ andl $-16384,%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
1: movzbl (%eax),%edx
@@ -41,7 +41,7 @@
addl $1,%eax
movl %esp,%edx
jc bad_get_user
- andl $0xffffe000,%edx
+ andl $-16384,%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
2: movzwl -1(%eax),%edx
@@ -54,7 +54,7 @@
addl $3,%eax
movl %esp,%edx
jc bad_get_user
- andl $0xffffe000,%edx
+ andl $-16384,%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
3: movl -3(%eax),%edx
diff -urN 249-vx/arch/i386/vmlinux.lds 249-vx-bigstack/arch/i386/vmlinux.lds
--- 249-vx/arch/i386/vmlinux.lds Sat Jan 12 21:31:54 2002
+++ 249-vx-bigstack/arch/i386/vmlinux.lds Sun Jan 13 00:14:30 2002
@@ -36,7 +36,7 @@

_edata = .; /* End of data section */

- . = ALIGN(8192); /* init_task */
+ . = ALIGN(16384); /* init_task */
.data.init_task : { *(.data.init_task) }

. = ALIGN(4096); /* Init code and data */
diff -urN 249-vx/include/asm-i386/current.h 249-vx-bigstack/include/asm-i386/current.h
--- 249-vx/include/asm-i386/current.h Sat Aug 15 00:35:22 1998
+++ 249-vx-bigstack/include/asm-i386/current.h Sun Jan 13 00:14:30 2002
@@ -6,7 +6,7 @@
static inline struct task_struct * get_current(void)
{
struct task_struct *current;
- __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
+ __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~16383UL));
return current;
}

diff -urN 249-vx/include/asm-i386/hw_irq.h 249-vx-bigstack/include/asm-i386/hw_irq.h
--- 249-vx/include/asm-i386/hw_irq.h Wed Aug 15 22:21:11 2001
+++ 249-vx-bigstack/include/asm-i386/hw_irq.h Sun Jan 13 00:14:30 2002
@@ -115,7 +115,7 @@

#define GET_CURRENT \
"movl %esp, %ebx\n\t" \
- "andl $-8192, %ebx\n\t"
+ "andl $-16384, %ebx\n\t"

/*
* SMP has a few special interrupts for IPI messages
diff -urN 249-vx/include/asm-i386/processor.h 249-vx-bigstack/include/asm-i386/processor.h
--- 249-vx/include/asm-i386/processor.h Wed Aug 15 22:21:11 2001
+++ 249-vx-bigstack/include/asm-i386/processor.h Sun Jan 13 00:14:30 2002
@@ -445,13 +445,27 @@
}

unsigned long get_wchan(struct task_struct *p);
-#define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-#define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])

-#define THREAD_SIZE (2*PAGE_SIZE)
-#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
-#define free_task_struct(p) free_pages((unsigned long) (p), 1)
-#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count)
+#define THREAD_ORDER 2
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define THREAD_LONGS (THREAD_SIZE / sizeof(unsigned long))
+
+#define KSTK_EIP(tsk) (((unsigned long *)(tsk))[THREAD_LONGS-5])
+#define KSTK_ESP(tsk) (((unsigned long *)(tsk))[THREAD_LONGS-2])
+
+#define alloc_task_struct() \
+ ({ \
+ unsigned long tsk = __get_free_pages(GFP_KERNEL, THREAD_ORDER); \
+ if (tsk != 0) { \
+ /* Alt-SysRq-T show_task() show if stack went deep */ \
+ memset((char*)tsk+sizeof(struct task_struct), 0, \
+ THREAD_SIZE-sizeof(struct task_struct)); \
+ } \
+ (struct task_struct *) tsk; \
+ })
+
+#define free_task_struct(tsk) free_pages((unsigned long) (tsk), THREAD_ORDER)
+#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count)

#define init_task (init_task_union.task)
#define init_stack (init_task_union.stack)
diff -urN 249-vx/include/linux/sched.h 249-vx-bigstack/include/linux/sched.h
--- 249-vx/include/linux/sched.h Sat Jan 12 23:53:58 2002
+++ 249-vx-bigstack/include/linux/sched.h Sun Jan 13 00:14:30 2002
@@ -484,7 +484,7 @@

#ifndef INIT_TASK_SIZE
-# define INIT_TASK_SIZE 2048*sizeof(long)
+# define INIT_TASK_SIZE 4096*sizeof(long)
#endif

union task_union {
diff -urN 249-vx/kernel/exit.c 249-vx-bigstack/kernel/exit.c
--- 249-vx/kernel/exit.c Sun Aug 12 18:52:29 2001
+++ 249-vx-bigstack/kernel/exit.c Sun Jan 13 00:14:30 2002
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/tty.h>
+#include <linux/proc_fs.h>
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/acct.h>
#endif
@@ -24,6 +25,74 @@

int getrusage(struct task_struct *, int, struct rusage *);

+#ifdef CONFIG_PROC_FS
+static int max_stack_depth;
+static pid_t max_stack_pid;
+static struct task_struct *max_stack_task; /* kept for kdb */
+static spinlock_t max_stack_lock = SPIN_LOCK_UNLOCKED;
+static void check_stack_depth(struct task_struct *);
+
+static int stack_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct task_struct *p;
+ int len;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ check_stack_depth(p);
+ read_unlock(&tasklist_lock);
+
+ spin_lock(&max_stack_lock);
+ len = sprintf(page,
+ "Deepest stack is 0x%04x (+ task 0x%04x = 0x%04x) for pid %d %.16s\n",
+ max_stack_depth, sizeof(*p), sizeof(*p) + max_stack_depth,
+ max_stack_pid, max_stack_task->comm);
+ spin_unlock(&max_stack_lock);
+
+ *eof = (len <= off + count);
+ *start = page + off;
+ len -= off;
+ if (len > count)
+ len = count;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+static void check_stack_depth(struct task_struct *p)
+{
+ int depth;
+ struct task_struct *old_stack_task;
+ unsigned long *sp = (unsigned long *)(p + 1);
+
+ while (!*sp)
+ sp++;
+ depth = (int)p + THREAD_SIZE - (int)sp;
+ if (depth <= max_stack_depth)
+ return;
+
+ old_stack_task = NULL;
+ spin_lock(&max_stack_lock);
+ if (depth > max_stack_depth) {
+ old_stack_task = max_stack_task;
+ max_stack_depth = depth;
+ max_stack_pid = p->pid;
+ max_stack_task = p;
+ get_task_struct(p);
+ }
+ spin_unlock(&max_stack_lock);
+
+ if (old_stack_task)
+ free_task_struct(old_stack_task);
+ else
+ create_proc_read_entry("stack",
+ 0444, NULL, stack_read_proc, NULL);
+}
+#else /* ndef CONFIG_PROC_FS */
+#define check_stack_depth(p) do { } while (0)
+#endif /* ndef CONFIG_PROC_FS */
+
static void release_task(struct task_struct * p)
{
if (p != current) {
@@ -63,6 +132,7 @@
current->counter += p->counter;
if (current->counter >= MAX_COUNTER)
current->counter = MAX_COUNTER;
+ check_stack_depth(p);
p->pid = 0;
free_task_struct(p);
} else {

2002-02-08 15:37:03

by Arjan van de Ven

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Tigran Aivazian wrote:
>
> Hi,
>
> In the light of some talks here about increasing kernel stack, here is my
> patch for i386 architecture that some may find useful. It also has a nice
> extra (/proc/stack) implemented by Hugh Dickins which helps to find major
> offenders.
>
> It is against 2.4.9 but should be easy to port in any direction. (One way
> the patch could be improved is by making the size CONFIG_ option instead
> of hardcoding). Oh btw, please don't tell me "but now you'd need _four_
> physically-contiguous pages to create a task instead of two!" because I
> know it (and think it's not too bad).

I do think it is too bad. Sorry.
Also it's the wrong approach. The right approach (as done by Manfred and
David) is
to put "current" no longer on this stack just a pointer to current.
And .... if you need a 16Kb stack your kernel code is VERY VERY sick.

Greetings,
Arjan van de Ven

2002-02-08 15:39:13

by Alan

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

> It is against 2.4.9 but should be easy to port in any direction. (One way
> the patch could be improved is by making the size CONFIG_ option instead
> of hardcoding). Oh btw, please don't tell me "but now you'd need _four_
> physically-contiguous pages to create a task instead of two!" because I
> know it (and think it's not too bad).

As a debugging tool its ok. Adding 8K to the kernel stack size
is a wonderful way to eat huge amounts of RAM though.

2002-02-08 15:44:43

by Rik van Riel

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, 8 Feb 2002, Tigran Aivazian wrote:

> It also has a nice extra (/proc/stack) implemented by Hugh Dickins
> which helps to find major offenders.

That's really nice because it gives us the opportunity to
look at the major offenders and see if we can fix those,
instead of bloating up the kernel further.

> Oh btw, please don't tell me "but now you'd need _four_
> physically-contiguous pages to create a task instead of two!" because
> I know it (and think it's not too bad).

On large machines ZONE_NORMAL is in a big squeeze, so
growing a kernel data structure without any justification
is a big no-no.

I take it you've used /proc/stack to find out what the
major offenders are ?

If so, could you share the list of major offenders with us
so we have an idea which functions to fix ?

(I take it you must have run into some problems, otherwise
you wouldn't have posted the patch)

regards,

Rik
--
"Linux holds advantages over the single-vendor commercial OS"
-- Microsoft's "Competing with Linux" document

http://www.surriel.com/ http://distro.conectiva.com/

2002-02-08 16:03:44

by Tigran Aivazian

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Hi Arjan,

> Also it's the wrong approach. The right approach (as done by Manfred and
> David) is
> to put "current" no longer on this stack just a pointer to current.

Can you clarify this, please (because by default your statement sounds
like nonsense, sorry).

You are saying that the right approach is to move "current" off the stack.
The right approach to what? Surely not to saving kernel stack because
"current" (being merely a struct task_struct) is not a major eater of the
stack. Those functions which declare 5-6k of local variables are (if
there are still any left). Speaking of which, I will also answer Rik --
the offenders (that "VERY VERY sick code" Arjan refers to) we found were
in LKCD so it's been fixed ages ago.

So, moving struct task_struct is irrelevant, really. Unless you meant
something completely different and if so I look forward to your
clarifications.

Regards,
Tigran

2002-02-08 16:09:54

by Arjan van de Ven

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, Feb 08, 2002 at 04:06:35PM +0000, Tigran Aivazian wrote:
> Hi Arjan,
>
> > Also it's the wrong approach. The right approach (as done by Manfred and
> > David) is
> > to put "current" no longer on this stack just a pointer to current.
>

> You are saying that the right approach is to move "current" off the stack.
> The right approach to what? Surely not to saving kernel stack because
> "current" (being merely a struct task_struct) is not a major eater of the
> stack.

1.5Kb... that's quite a lot on 8Kb

> Those functions which declare 5-6k of local variables are (if
> there are still any left).

There are none. And if there are they are very easy to find and fix.

> Speaking of which, I will also answer Rik --
> the offenders (that "VERY VERY sick code" Arjan refers to) we found were
> in LKCD so it's been fixed ages ago.

LKCD is not part of the normal kernel, and in some parts could fall under
"VER VERY sick"; esp if they indeed use 6Kb stack.

> So, moving struct task_struct is irrelevant, really. Unless you meant
> something completely different and if so I look forward to your

Apparently you see stackoverflows with some code. Well, 1.5Kb (approx)
is some win there (although most of that is reserved for stack coloring).

If you need even more in your code (I assume you do otherwise you wouldn't
have done the work) then I really suggest you take a long hard look and fix
the obvious bugs or the design....

Greetings,
Arjan van de Ven

2002-02-08 16:58:20

by Arjan van de Ven

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, Feb 08, 2002 at 04:59:47PM +0000, Tigran Aivazian wrote:
> On Fri, 8 Feb 2002, Arjan van de Ven wrote:
> > If you need even more in your code (I assume you do otherwise you wouldn't
> > have done the work) then I really suggest you take a long hard look and fix
> > the obvious bugs or the design....
>
> Arjan, I completely agree with you, but please do not overlook one obvious
> thing -- sometimes (well, most of the time) in order to fix those stack
> corruption issues you _first_ need to apply this patch and then it becomes
> obvious that the reason for this "random" corruption is the stack
> overflow. A kernel panic is not shouting like "I am a stack overflow!"

Stack redzoning makes a lot of sense. And checking isn't that expensive...

2002-02-08 16:57:00

by Tigran Aivazian

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, 8 Feb 2002, Arjan van de Ven wrote:
> If you need even more in your code (I assume you do otherwise you wouldn't
> have done the work) then I really suggest you take a long hard look and fix
> the obvious bugs or the design....

Arjan, I completely agree with you, but please do not overlook one obvious
thing -- sometimes (well, most of the time) in order to fix those stack
corruption issues you _first_ need to apply this patch and then it becomes
obvious that the reason for this "random" corruption is the stack
overflow. A kernel panic is not shouting like "I am a stack overflow!"
(yes, I know of Andrea's IKD of course, but sometimes it is preferrable to
apply a small non-intrusive patch instead)

So, I found this patch useful at least for debugging. Moreover, I think it
would be very useful to have it in Linus' kernel as a CONFIG_ option so
that if people complain about random memory corruption then they can try
to reproduce it with larger stack and then (with aid of /proc/stack) the
offender is found and fixed. I cc'd Alan; if he thinks this is a bad idea
I would be interested to know why.

Regards,
Tigran

2002-02-08 17:24:01

by Arjan van de Ven

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Tigran Aivazian wrote:
>
> On Fri, 8 Feb 2002, Arjan van de Ven wrote:
> > If you need even more in your code (I assume you do otherwise you wouldn't
> > have done the work) then I really suggest you take a long hard look and fix
> > the obvious bugs or the design....
>
> Arjan, I completely agree with you, but please do not overlook one obvious
> thing -- sometimes (well, most of the time) in order to fix those stack
> corruption issues you _first_ need to apply this patch and then it becomes
> obvious that the reason for this "random" corruption is the stack

Well, there's also a simple script you can run on the vmlinux to catch
big offenders....
I can even see the point of running that at the end of "make bzImage"
and abort or
at least seriously warn if there are offenders that, say, allocate more
than 2Kb of stackspace.

2002-02-08 18:02:25

by Alan

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

> to reproduce it with larger stack and then (with aid of /proc/stack) the
> offender is found and fixed. I cc'd Alan; if he thinks this is a bad idea
> I would be interested to know why.

Personal feeling: it would be better to vmalloc the stack in this case
and use the existing vmalloc red zones. For 2.5 that should work out ok
unlike 2.4 where the scsi and usb will break

2002-02-08 18:03:36

by Alan

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

> Well, there's also a simple script you can run on the vmlinux to catch
> big offenders....
> I can even see the point of running that at the end of "make bzImage"
> and abort or
> at least seriously warn if there are offenders that, say, allocate more
> than 2Kb of stackspace.

Thats worth doing too if its reliable

2002-02-08 18:28:54

by Richard B. Johnson

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, 8 Feb 2002, Arjan van de Ven wrote:

> Tigran Aivazian wrote:
> >
> > On Fri, 8 Feb 2002, Arjan van de Ven wrote:
> > > If you need even more in your code (I assume you do otherwise you wouldn't
> > > have done the work) then I really suggest you take a long hard look and fix
> > > the obvious bugs or the design....
> >
> > Arjan, I completely agree with you, but please do not overlook one obvious
> > thing -- sometimes (well, most of the time) in order to fix those stack
> > corruption issues you _first_ need to apply this patch and then it becomes
> > obvious that the reason for this "random" corruption is the stack
>
> Well, there's also a simple script you can run on the vmlinux to catch
> big offenders....
> I can even see the point of running that at the end of "make bzImage"
> and abort or
> at least seriously warn if there are offenders that, say, allocate more
> than 2Kb of stackspace.

Could someone please tell me why you should make a function call to
allocate, and then later on free, some temporary space for variables
when allocation on the stack involves simply subtracting a value,
calculated by the compiler at compile-time, from the stack-pointer?

I think it is entirely inefficient to call an external procedure
for temporary variable space when the actual math is done by the
compiler at compile time, and the code is a simple subtraction, then
later-on a simple addition to a single register!

This is an Intel example. The same applies for other machines, but
the syntax is different. The simplified clock calculation assumes
that a long-word operation on memory uses 4 clocks and a register
operation uses 2 clocks. This is about 90 percent accurate in spite
of the many exceptions with Intel Processors.

Local variable allocation, 4 clocks.

funct: pushl %ebx # Save registers that must be preserved.
pushl %esi
pushl %edi
subl $LOCAL_SPACE, %esp # Allocation for local variables
# 2 clocks.
.......
....... # Do stuff.
.......

addl $LOCAL_SPACE, %esp # 2 clocks
popl %edi # Restore registers that you saved
popl %esi
popl %ebx
ret

Remote allocation:

funct: pushl %ebx
pushl %esi
pushl %edi
subl $SPACE_FOR_POINTER, %esp # 2 clocks
pushl DATA_LENGTH # 4 clocks
pushl DATA_TYPE # 4 clocks
call kmalloc # 8 + kmalloc clocks
addl $8, %esp # Level stack # 2 clocks
orl %eax,%eax # Check for NULL # 2 clocks
jz failed # kmalloc failed # Not taken
movl %eax, (%esp) # Save pointer # 4 clocks
......
...... # Do stuff
......

pushl (%esp) # kmalloc pointer 4 clocks
call kfree # Free memory 8 + kfree clocks
addl $4, %esp # Level stack 2 clocks

failed:
addl $SPACE_FOR_POINTER, %esp 2 clocks
popl %edi
popl %esi
popl %ebx
ret

This takes 42 clocks, not including the thousands used up by kmalloc
and kfree.

If the kernel does not provide sufficient stack-space for small
buffers and structures, it is a kernel problem, not a driver-coding
problem. Competent driver (module) writers are acutely aware of the
necessity of obtaining speed and certainly would never call a
remote allocator for local variables. Global variables are often
allocated at initialization time using the appropriate kernel
allocator for the memory type required.

If you are attempting to require that good design practice be
abandoned according to this new whim, you are playing a microsoft-ism.
Please leave the drivers that work, alone.

Stack corruption has nothing to do with using the kernel stack for
local variables. It has everything to do with somebody or some thing
writing somewhere they don't own. This is what must be fixed.

That said, I certainly don't know how much stack-space should be available
and, probably, nobody else does either. After the stack corruption is
fixed, the stack should be profiled. This can be easily done from a
module if the stack gets zeroed before it's used, during the boot process.

Cheers,
Dick Johnson

Penguin : Linux version 2.4.1 on an i686 machine (797.90 BogoMips).

I was going to compile a list of innovations that could be
attributed to Microsoft. Once I realized that Ctrl-Alt-Del
was handled in the BIOS, I found that there aren't any.

2002-02-08 18:38:37

by Arjan van de Ven

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, Feb 08, 2002 at 01:29:16PM -0500, Richard B. Johnson wrote:
> I think it is entirely inefficient to call an external procedure
> for temporary variable space when the actual math is done by the
> compiler at compile time, and the code is a simple subtraction, then
> later-on a simple addition to a single register!

Depends. If you need a few bytes (and upto 1Kb I'd call a few bytes if
you're careful), then stack usage is fine. If you need more, well, kmalloc
is some 100 cycles...

> If the kernel does not provide sufficient stack-space for small
> buffers and structures, it is a kernel problem,

notice the *small*

The alternative is to double the amount of PER PROCESS overhead in terms of
unswappable memory... Even 1 disk IO will hurt more than your kmalloc of 4Kb
of "small buffers and structures" will in a year.

2002-02-08 18:56:54

by Ingo Molnar

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, 8 Feb 2002, Richard B. Johnson wrote:

> Could someone please tell me why you should make a function call to
> allocate, and then later on free, some temporary space for variables
> when allocation on the stack involves simply subtracting a value,
> calculated by the compiler at compile-time, from the stack-pointer?

because if you need that much stack space then you shouldnt care about
kmalloc() overhead anymore.

> This takes 42 clocks, not including the thousands used up by kmalloc
> and kfree.

kmalloc()+kfree() is not thousands. At least on SMP it has a nice frontend
cache and batch allocator component, which makes it much cheaper.

32 bytes you can allocate on the stack no problem. If you need to allocat
and *fill* 2K then kmalloc() overhead will not matter to you.

> If the kernel does not provide sufficient stack-space for small
> buffers and structures, it is a kernel problem, not a driver-coding
> problem. [...]

we will not add significant overhead to *every part* of the kernel just to
keep a ridiculously large stack around for those few drivers who should
use kmalloc() to begin with. And believe me, some people will then want to
do recursion on the stack and would complain even about a 1MB stack. There
is a limit to draw, and on Linux it's 'no bigger than a few hundred bytes,
allocate dynamically otherwise'.

Ingo

2002-02-08 20:09:59

by Andrew Morton

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Tigran Aivazian wrote:
>
> ...
> + memset((char*)tsk+sizeof(struct task_struct), 0, \
> + THREAD_SIZE-sizeof(struct task_struct)); \
> + } \

This seems to be permanently enabled? If so, I'd suggest that it
be made conditional on CONFIG_SLAB_DEBUG, or such.

> ...
> -# define INIT_TASK_SIZE 2048*sizeof(long)
> +# define INIT_TASK_SIZE 4096*sizeof(long)

Personally, I'd rather we make the stack 4k for a while,
and fix the resulting mess.

-

2002-02-08 20:24:00

by Jes Sorensen

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Tigran Aivazian <[email protected]> writes:

> So, I found this patch useful at least for debugging. Moreover, I think it
> would be very useful to have it in Linus' kernel as a CONFIG_ option so
> that if people complain about random memory corruption then they can try
> to reproduce it with larger stack and then (with aid of /proc/stack) the
> offender is found and fixed. I cc'd Alan; if he thinks this is a bad idea
> I would be interested to know why.

Well as someone suggested, stick it under CONFIG_SLAB_DEBUG then, it
surely shouldn't be an option to be enabled in normal production
kernels but for debugging it's fine.

Jes

2002-02-08 21:12:45

by Mike Fedyk

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

On Fri, Feb 08, 2002 at 09:22:05PM +0100, Jes Sorensen wrote:
> Tigran Aivazian <[email protected]> writes:
>
> > So, I found this patch useful at least for debugging. Moreover, I think it
> > would be very useful to have it in Linus' kernel as a CONFIG_ option so
> > that if people complain about random memory corruption then they can try
> > to reproduce it with larger stack and then (with aid of /proc/stack) the
> > offender is found and fixed. I cc'd Alan; if he thinks this is a bad idea
> > I would be interested to know why.
>
> Well as someone suggested, stick it under CONFIG_SLAB_DEBUG then, it

That was suggested by Andrew Morton...

> surely shouldn't be an option to be enabled in normal production
> kernels but for debugging it's fine.
>

Ahh, but if you are overflowing the stack by a few bytes, and then enable
stack debugging the error will go away because of the alrger stack.

If this goes in, it should have its own config option.

Mike

2002-02-09 07:18:09

by George Anzinger

[permalink] [raw]

Subject: Re: [patch] larger kernel stack (8k->16k) per task

Mike Fedyk wrote:
>
> On Fri, Feb 08, 2002 at 09:22:05PM +0100, Jes Sorensen wrote:
> > Tigran Aivazian <[email protected]> writes:
> >
> > > So, I found this patch useful at least for debugging. Moreover, I think it
> > > would be very useful to have it in Linus' kernel as a CONFIG_ option so
> > > that if people complain about random memory corruption then they can try
> > > to reproduce it with larger stack and then (with aid of /proc/stack) the
> > > offender is found and fixed. I cc'd Alan; if he thinks this is a bad idea
> > > I would be interested to know why.
> >
> > Well as someone suggested, stick it under CONFIG_SLAB_DEBUG then, it
>
> That was suggested by Andrew Morton...
>
> > surely shouldn't be an option to be enabled in normal production
> > kernels but for debugging it's fine.
> >
>
> Ahh, but if you are overflowing the stack by a few bytes, and then enable
> stack debugging the error will go away because of the alrger stack.

It really isn't that hard (or expensive) to put a stack overflow test in
entry.S. Of course this should be a debug config option.

George
>
> If this goes in, it should have its own config option.
>
> Mike
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
George [email protected]
High-res-timers: http://sourceforge.net/projects/high-res-timers/
Real time sched: http://sourceforge.net/projects/rtsched/