2002-06-05 02:55:43

by Benjamin LaHaise

[permalink] [raw]
Subject: [RFC] 4KB stack + irq stack for x86

Hey folks,

Below is a patch against 2.5.20 that implements 4KB stacks for tasks,
plus a separate 4KB irq stack for use by interrupts. There are a couple
of reasons for doing this: 4KB stacks put less pressure on the VM
subsystem, reduce the overall memory usage for systems with large
numbers of tasks, and increase the reliability of the system when
under heavy irq load by providing a fixed stack size for interrupt
handlers that other kernel code will not eat into.

The interrupt stacks are stackable, so we could use multiple
4KB irq stacks. The thread_info structure is included in each
interrupt stack, and has the current pointer copied into it upon
entry.

Things can be made a bit more efficient by moving thread_info from the
bottom of the stack to the top. This would simplify the irq entry code
a bit as the same pointer can be used for the thread_info code and top
of stack. The 2.4 version of the patch does this. In addition, moving
thread_info to the top of the stack results in better cache line
sharing: thread_info is in the same cacheline as the first data pushed
onto the irq stack. Ditto for regular tasks. I'll do this if there is
interest in merging it.

I had been playing with 2.5KB stacks (4KB page minus 1.5K for task_struct),
and it is possible given a few fixes for massive (>1KB) stack allocation
in the pci layer and a few others. So far 4KB hasn't overflowed on any
of the tasks I normally run (checked using a stack overflow checker that
follows).

Comments?

-ben
--
"You will be reincarnated as a toad; and you will be much happier."


:r ~/patches/v2.5.20/v2.5.20-smallstack-A0.diff
diff -urN v2.5.20/arch/i386/config.in smallstack-2.5.20.diff/arch/i386/config.in
--- v2.5.20/arch/i386/config.in Tue Jun 4 18:00:02 2002
+++ smallstack-2.5.20.diff/arch/i386/config.in Tue Jun 4 19:30:54 2002
@@ -34,6 +34,7 @@
#
# Define implied options from the CPU selection here
#
+define_bool CONFIG_X86_HAVE_CMOV n

if [ "$CONFIG_M386" = "y" ]; then
define_bool CONFIG_X86_CMPXCHG n
@@ -90,18 +91,21 @@
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
define_bool CONFIG_X86_PPRO_FENCE y
+ define_bool CONFIG_X86_HAVE_CMOV y
fi
if [ "$CONFIG_MPENTIUMIII" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 5
define_bool CONFIG_X86_TSC y
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+ define_bool CONFIG_X86_HAVE_CMOV y
fi
if [ "$CONFIG_MPENTIUM4" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 7
define_bool CONFIG_X86_TSC y
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+ define_bool CONFIG_X86_HAVE_CMOV y
fi
if [ "$CONFIG_MK6" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 5
@@ -115,6 +119,7 @@
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_USE_3DNOW y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+ define_bool CONFIG_X86_HAVE_CMOV y
fi
if [ "$CONFIG_MELAN" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 4
@@ -131,6 +136,7 @@
if [ "$CONFIG_MCRUSOE" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 5
define_bool CONFIG_X86_TSC y
+ define_bool CONFIG_X86_HAVE_CMOV y
fi
if [ "$CONFIG_MWINCHIPC6" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 5
diff -urN v2.5.20/arch/i386/kernel/entry.S smallstack-2.5.20.diff/arch/i386/kernel/entry.S
--- v2.5.20/arch/i386/kernel/entry.S Tue Jun 4 18:00:16 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/entry.S Tue Jun 4 20:24:51 2002
@@ -163,7 +163,7 @@
movl %ecx,CS(%esp) #
movl %esp, %ebx
pushl %ebx
- andl $-8192, %ebx # GET_THREAD_INFO
+ GET_THREAD_INFO_WITH_ESP(%ebx)
movl TI_EXEC_DOMAIN(%ebx), %edx # Get the execution domain
movl 4(%edx), %edx # Get the lcall7 handler for the domain
pushl $0x7
@@ -185,7 +185,7 @@
movl %ecx,CS(%esp) #
movl %esp, %ebx
pushl %ebx
- andl $-8192, %ebx # GET_THREAD_INFO
+ GET_THREAD_INFO_WITH_ESP(%ebx)
movl TI_EXEC_DOMAIN(%ebx), %edx # Get the execution domain
movl 4(%edx), %edx # Get the lcall7 handler for the domain
pushl $0x27
@@ -361,7 +361,30 @@
SAVE_ALL
GET_THREAD_INFO(%ebx)
INC_PRE_COUNT(%ebx)
+
+ movl TI_IRQ_STACK(%ebx),%ecx
+ movl TI_TASK(%ebx),%edx
+ movl %esp,%eax
+ leal (THREAD_SIZE-4)(%ecx),%ebx
+ testl %ecx,%ecx # is there a valid irq_stack?
+
+ # switch to the irq stack
+#ifdef CONFIG_X86_HAVE_CMOV
+ cmovnz %ebx,%esp
+#warning using cmov
+#else
+#warning cannot use cmov
+ jnz 1f
+ mov %ebx,%esp
+1:
+#endif
+
+ # update the task pointer in the irq stack
+ GET_THREAD_INFO(%ebx)
+ movl %edx,TI_TASK(%ebx)
+
call do_IRQ
+ movl %eax,%esp # potentially restore non-irq stack
jmp ret_from_intr

#define BUILD_INTERRUPT(name, nr) \
diff -urN v2.5.20/arch/i386/kernel/head.S smallstack-2.5.20.diff/arch/i386/kernel/head.S
--- v2.5.20/arch/i386/kernel/head.S Tue Jun 4 18:00:16 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/head.S Mon Jun 3 22:25:16 2002
@@ -15,6 +15,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
+#include <asm/thread_info.h>

#define OLD_CL_MAGIC_ADDR 0x90020
#define OLD_CL_MAGIC 0xA33F
@@ -310,7 +311,7 @@
ret

ENTRY(stack_start)
- .long init_thread_union+8192
+ .long init_thread_union+THREAD_SIZE
.long __KERNEL_DS

/* This is the default interrupt "handler" :-) */
diff -urN v2.5.20/arch/i386/kernel/init_task.c smallstack-2.5.20.diff/arch/i386/kernel/init_task.c
--- v2.5.20/arch/i386/kernel/init_task.c Tue Jun 4 17:59:24 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/init_task.c Tue Jun 4 20:25:47 2002
@@ -13,6 +13,9 @@
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);

+union thread_union init_irq_union
+ __attribute__((__section__(".data.init_task")));
+
/*
* Initial thread structure.
*
@@ -22,7 +25,15 @@
*/
union thread_union init_thread_union
__attribute__((__section__(".data.init_task"))) =
- { INIT_THREAD_INFO(init_task) };
+ { {
+ task: &init_task,
+ exec_domain: &default_exec_domain,
+ flags: 0,
+ cpu: 0,
+ addr_limit: KERNEL_DS,
+ irq_stack: &init_irq_union,
+ } };
+

/*
* Initial task structure.
diff -urN v2.5.20/arch/i386/kernel/irq.c smallstack-2.5.20.diff/arch/i386/kernel/irq.c
--- v2.5.20/arch/i386/kernel/irq.c Tue Jun 4 17:59:47 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/irq.c Tue Jun 4 21:01:16 2002
@@ -557,7 +557,8 @@
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-asmlinkage unsigned int do_IRQ(struct pt_regs regs)
+struct pt_regs *do_IRQ(struct pt_regs *regs) __attribute__((regparm(1)));
+struct pt_regs *do_IRQ(struct pt_regs *regs)
{
/*
* We ack quickly, we don't want the irq controller
@@ -569,7 +570,7 @@
* 0 return value means that this irq is already being
* handled by some other CPU. (or is disabled)
*/
- int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */
+ int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */
int cpu = smp_processor_id();
irq_desc_t *desc = irq_desc + irq;
struct irqaction * action;
@@ -618,7 +619,7 @@
*/
for (;;) {
spin_unlock(&desc->lock);
- handle_IRQ_event(irq, &regs, action);
+ handle_IRQ_event(irq, regs, action);
spin_lock(&desc->lock);

if (!(desc->status & IRQ_PENDING))
@@ -636,7 +637,7 @@

if (softirq_pending(cpu))
do_softirq();
- return 1;
+ return regs;
}

/**
diff -urN v2.5.20/arch/i386/kernel/process.c smallstack-2.5.20.diff/arch/i386/kernel/process.c
--- v2.5.20/arch/i386/kernel/process.c Tue Jun 4 18:00:00 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/process.c Tue Jun 4 20:47:40 2002
@@ -650,6 +650,7 @@

/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

+ next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack;
unlazy_fpu(prev_p);

/*
diff -urN v2.5.20/arch/i386/kernel/smpboot.c smallstack-2.5.20.diff/arch/i386/kernel/smpboot.c
--- v2.5.20/arch/i386/kernel/smpboot.c Tue Jun 4 18:00:18 2002
+++ smallstack-2.5.20.diff/arch/i386/kernel/smpboot.c Tue Jun 4 20:54:12 2002
@@ -72,6 +72,10 @@
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;

+extern union thread_union init_irq_union;
+union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned =
+ { &init_irq_union, };
+
/* Set when the idlers are all forked */
int smp_threads_ready;

@@ -808,6 +812,27 @@
return (send_status | accept_status);
}

+static void __init setup_irq_stack(struct task_struct *p, int cpu)
+{
+ unsigned long stk;
+
+ stk = __get_free_pages(GFP_KERNEL, THREAD_ORDER);
+ if (!stk)
+ panic("I can't seem to allocate my irq stack. Oh well, giving up.");
+
+ irq_stacks[cpu] = (void *)stk;
+ memset(irq_stacks[cpu], 0, THREAD_SIZE);
+ irq_stacks[cpu]->thread_info.cpu = cpu;
+ irq_stacks[cpu]->thread_info.preempt_count = 1;
+ /* interrupts are not preemptable */
+ p->thread_info->irq_stack = irq_stacks[cpu];
+
+ /* If we want to make the irq stack more than one unit
+ * deep, we can chain then off of the irq_stack pointer
+ * here.
+ */
+}
+
extern unsigned long cpu_initialized;

static void __init do_boot_cpu (int apicid)
@@ -831,6 +856,8 @@
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);

+ setup_irq_stack(idle, cpu);
+
/*
* We remove it from the pidhash and the runqueue
* once we got the process:
@@ -848,7 +875,7 @@

/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle->thread_info);
+ stack_start.esp = (void *) (THREAD_SIZE + (char *)idle->thread_info);

/*
* This grunge runs the startup process for
diff -urN v2.5.20/include/asm-i386/page.h smallstack-2.5.20.diff/include/asm-i386/page.h
--- v2.5.20/include/asm-i386/page.h Tue Jun 4 18:00:18 2002
+++ smallstack-2.5.20.diff/include/asm-i386/page.h Mon Jun 3 22:43:11 2002
@@ -3,7 +3,11 @@

/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT 12
+#ifndef __ASSEMBLY__
#define PAGE_SIZE (1UL << PAGE_SHIFT)
+#else
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#endif
#define PAGE_MASK (~(PAGE_SIZE-1))

#ifdef __KERNEL__
diff -urN v2.5.20/include/asm-i386/thread_info.h smallstack-2.5.20.diff/include/asm-i386/thread_info.h
--- v2.5.20/include/asm-i386/thread_info.h Tue Jun 4 17:59:46 2002
+++ smallstack-2.5.20.diff/include/asm-i386/thread_info.h Tue Jun 4 20:58:55 2002
@@ -9,6 +9,7 @@

#ifdef __KERNEL__

+#include <asm/page.h>
#ifndef __ASSEMBLY__
#include <asm/processor.h>
#endif
@@ -28,9 +29,11 @@
__s32 preempt_count; /* 0 => preemptable, <0 => BUG */

mm_segment_t addr_limit; /* thread address space:
+ 0 for interrupts: illegal
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
+ struct thread_info *irq_stack; /* pointer to cpu irq stack */

__u8 supervisor_stack[0];
};
@@ -44,6 +47,7 @@
#define TI_CPU 0x0000000C
#define TI_PRE_COUNT 0x00000010
#define TI_ADDR_LIMIT 0x00000014
+#define TI_IRQ_STACK 0x00000018

#endif

@@ -52,41 +56,40 @@
/*
* macros/functions for gaining access to the thread information structure
*/
-#ifndef __ASSEMBLY__
-#define INIT_THREAD_INFO(tsk) \
-{ \
- task: &tsk, \
- exec_domain: &default_exec_domain, \
- flags: 0, \
- cpu: 0, \
- addr_limit: KERNEL_DS, \
-}
+#define THREAD_ORDER 0

+#ifndef __ASSEMBLY__
#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)

+/* thread information allocation */
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
+#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
+#define get_thread_info(ti) get_task_struct((ti)->task)
+#define put_thread_info(ti) put_task_struct((ti)->task)
+
/* how to get the thread information struct from C */
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
- __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL));
+ __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
return ti;
}

-/* thread information allocation */
-#define THREAD_SIZE (2*PAGE_SIZE)
-#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1))
-#define free_thread_info(ti) free_pages((unsigned long) (ti), 1)
-#define get_thread_info(ti) get_task_struct((ti)->task)
-#define put_thread_info(ti) put_task_struct((ti)->task)
-
#else /* !__ASSEMBLY__ */

+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+
/* how to get the thread information struct from ASM */
#define GET_THREAD_INFO(reg) \
- movl $-8192, reg; \
+ movl $-THREAD_SIZE, reg; \
andl %esp, reg

+/* use this one if reg already contains %esp */
+#define GET_THREAD_INFO_WITH_ESP(reg) \
+ andl $-THREAD_SIZE, reg
+
#endif

/*


2002-06-05 15:33:01

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86



On Tue, 4 Jun 2002, Benjamin LaHaise wrote:
>
> Below is a patch against 2.5.20 that implements 4KB stacks for tasks,
> plus a seperate 4KB irq stack for use by interrupts.

Hmm.. Interesting.

However, you seem to be moving only the task structure pointer into the
new interrupt stack thread info, which seems to ignore all the "flags"
things.

So, as far as I can tell, we now get a nasty aliasing issue on
"current_thread_info()->flags", and information like NEED_RESCHED and
SIGPENDING end up being set in the wrong place. They get set on the
_interrupt_ thread_info, not the "process native" thread_info.

Or did I miss some subtlety?

Note that some of this may be hidden by the fact that not everybody uses
the "current_thread_info()" thing, most people still use the old format
"tsk->thread_info".

For example: "set_need_resched()" -> "set_thread_flag(TIF_NEED_RESCHED)"
-> "set_bit(fTIF_NEED_RESCHED,&current_thread_info()->flags)".

So any interrupt causing a "set_need_resched()" would appear to not do the
right thing now.

Comments? We can deprecate "current_thread_info()", but that would make
some things slightly less efficient.

Linus

2002-06-05 18:44:00

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Wed, Jun 05, 2002 at 08:33:13AM -0700, Linus Torvalds wrote:
> So, as far as I can tell, we now get a nasty aliasing issue on
> "current_thread_info()->flags", and information like NEED_RESCHED and
> SIGPENDING end up being set in the wrong place. They get set on the
> _interrupt_ thread_info, not the "process native" thread_info.
>
> Or did I miss some subtlety?

Ah, you're right. If anyone uses current_thread_info from IRQ context
it will set the flags in the wrong structure. However, it actually
works because nobody does that currently: all of the _thread_flag users
appear to be coming in from task context. Mostly that's luck as I
didn't change the smp ipis to switch stacks, so the only place that
is an interrupt and needs to access the actual thread data, does.

> Comments? We can deprecate "current_thread_info()", but that would make
> some things slightly less efficient.

I think we can keep it and flush out any misuse by a couple of
carefully placed BUG() checks (ie anyone using current_thread_info
directly from IRQ context really needs to go via current->thread_info,
so that could be made a BUG()). The only bit I'm not certain about
is the preempt_count handling. I rather like having addr_limit set
to 0 as it will prevent copy*user from working in IRQ context.

Another alternative is to move the task structure back into the same
page as the stack, but that would require a commitment to fix the
large stack users (there aren't many, and I'm certainly willing). This
approach would work well with a guard page and the task struct at the
top. Also, there is at least 348 bytes of task_struct which should be
moved out: io_bitmaps should only be allocated for the tasks that use
it (it's 132 bytes, and we copy it into the per cpu tss nowadays, so
making it a pointer should be fine), and the credentials (groups +
rlim + user ids: >216) need to move to a struct cred for use by NFS,
AIO and pthreads. That alone would get task_struct down to 1072 bytes
on x86 UP (1328 bytes on SMP), which leaves almost 3KB for the task
context stack.

Thoughts?

-ben
--
"You will be reincarnated as a toad; and you will be much happier."

2002-06-05 18:54:33

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86


On Wed, 5 Jun 2002, Benjamin LaHaise wrote:
>
> Ah, you're right. If anyone uses current_thread_info from IRQ context
> it will set the flags in the wrong structure. However, it actually
> works because nobody does that currently: all of the _thread_flag users
> appear to be coming in from task context. Mostly that's luck as I
> didn't change the smp ipis to switch stacks, so the only place that
> is an interrupt and needs to access the actual thread data, does.

Hmm..

How about just making the interrupt code (ie do_IRQ()) OR the flags
into the "parent" flags?

All of the flags should be "sticky one-bits", so just oring them should do
the right thing.

That way we don't have to add nasty BUG checks to the code, and since
we're already dirtying both cache-lines the extra overhead should
literally be just the cost of doing one locked "orl".

Linus

2002-06-05 20:40:42

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

Benjamin LaHaise <[email protected]> writes:

> On Wed, Jun 05, 2002 at 08:33:13AM -0700, Linus Torvalds wrote:
> > So, as far as I can tell, we now get a nasty aliasing issue on
> > "current_thread_info()->flags", and information like NEED_RESCHED and
> > SIGPENDING end up being set in the wrong place. They get set on the
> > _interrupt_ thread_info, not the "process native" thread_info.
> >
> > Or did I miss some subtlety?
>
> Ah, you're right. If anyone uses current_thread_info from IRQ context
> it will set the flags in the wrong structure. However, it actually
> works because nobody does that currently: all of the _thread_flag users

preemptive kernels do use current_thread_info() for every spinlock.
this required me to change its implementation on x86-64 from stack
arithmetic to access the base register.

-Andi

2002-06-05 20:56:43

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86


On 5 Jun 2002, Andi Kleen wrote:
> >
> > Ah, you're right. If anyone uses current_thread_info from IRQ context
> > it will set the flags in the wrong structure. However, it actually
> > works because nobody does that currently: all of the _thread_flag users
>
> preemptive kernels do use current_thread_info() for every spinlock.
> this required me to change its implementation on x86-64 from stack
> arithmetic to access the base register.

Note that this part is ok, as long as we make sure that the irq stack gets
initialized with a preempt_count > 0 (we must not preempt an interrupt
handler anyway, it wouldn't work), _and_ we make sure that taking the
interrupt also increments the "process native" preempt_count (so that
anybody looking at that preempt_count to determine whether it could be
preempted will also get a "nope, don't preempt me").

So that part doesn't look like a fundamental problem to me. It's just a
"need to be careful" thing.

Linus

2002-06-05 21:07:09

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Wed, Jun 05, 2002 at 11:53:10AM -0700, Linus Torvalds wrote:
> All of the flags should be "sticky one-bits", so just oring them should do
> the right thing.
>
> That way we don't have to add nasty BUG checks to the code, and since
> we're already dirtying both cache-lines the extra overhead should
> literally be just the cost of doing one locked "orl".

This patch on top of the others should do that. I've placed a full
diff from 2.5.20 to the current code at
http://www.kvack.org/~blah/stack-v2.5.20-A2.diff .

-ben
--
"You will be reincarnated as a toad; and you will be much happier."

:r ~/patches/v2.5.20/v2.5.20-smallstack-A0-A1.diff
diff -urN stackcheck-v2.5.20-A1/arch/i386/kernel/entry.S stack-2.5.20.diff/arch/i386/kernel/entry.S
--- stackcheck-v2.5.20-A1/arch/i386/kernel/entry.S Wed Jun 5 15:59:12 2002
+++ stack-2.5.20.diff/arch/i386/kernel/entry.S Wed Jun 5 15:55:09 2002
@@ -365,26 +365,34 @@
movl TI_IRQ_STACK(%ebx),%ecx
movl TI_TASK(%ebx),%edx
movl %esp,%eax
- leal (THREAD_SIZE-4)(%ecx),%ebx
+ leal (THREAD_SIZE-4)(%ecx),%esi
testl %ecx,%ecx # is there a valid irq_stack?

# switch to the irq stack
#ifdef CONFIG_X86_HAVE_CMOV
- cmovnz %ebx,%esp
-#warning using cmov
+ cmovnz %esi,%esp
#else
-#warning cannot use cmov
jnz 1f
- mov %ebx,%esp
+ mov %esi,%esp
1:
#endif

# update the task pointer in the irq stack
- GET_THREAD_INFO(%ebx)
- movl %edx,TI_TASK(%ebx)
+ GET_THREAD_INFO(%esi)
+ movl %edx,TI_TASK(%esi)

call do_IRQ
+
movl %eax,%esp # potentially restore non-irq stack
+
+ # copy flags from the irq stack back into the task's thread_info
+ # %esi is saved over the do_IRQ call and contains the irq stack
+ # thread_info pointer
+ # %ebx contains the original thread_info pointer
+ movl TI_FLAGS(%esi),%eax
+ movl $0,TI_FLAGS(%esi)
+ LOCK orl %eax,TI_FLAGS(%ebx)
+
jmp ret_from_intr

#define BUILD_INTERRUPT(name, nr) \
diff -urN stackcheck-v2.5.20-A1/arch/i386/kernel/smpboot.c stack-2.5.20.diff/arch/i386/kernel/smpboot.c
--- stackcheck-v2.5.20-A1/arch/i386/kernel/smpboot.c Wed Jun 5 15:59:08 2002
+++ stack-2.5.20.diff/arch/i386/kernel/smpboot.c Wed Jun 5 15:12:36 2002
@@ -875,7 +875,13 @@

/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- stack_start.esp = (void *) (THREAD_SIZE + (char *)idle->thread_info);
+
+ /* The -4 is to correct for the fact that the stack pointer
+ * is used to find the location of the thread_info structure
+ * by masking off several of the LSBs. Without the -4, esp
+ * is pointing to the page after the one the stack is on.
+ */
+ stack_start.esp = (void *)(THREAD_SIZE - 4 + (char *)idle->thread_info);

/*
* This grunge runs the startup process for
diff -urN stackcheck-v2.5.20-A1/include/asm-i386/thread_info.h stack-2.5.20.diff/include/asm-i386/thread_info.h
--- stackcheck-v2.5.20-A1/include/asm-i386/thread_info.h Wed Jun 5 15:59:08 2002
+++ stack-2.5.20.diff/include/asm-i386/thread_info.h Wed Jun 5 14:55:04 2002
@@ -57,6 +57,7 @@
* macros/functions for gaining access to the thread information structure
*/
#define THREAD_ORDER 0
+#define INIT_THREAD_SIZE THREAD_SIZE

#ifndef __ASSEMBLY__
#define init_thread_info (init_thread_union.thread_info)

2002-06-05 22:20:03

by Steve Lord

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Tue, 2002-06-04 at 21:55, Benjamin LaHaise wrote:
> Hey folks,
>
>
>
> I had been playing with 2.5KB stacks (4KB page minus 1.5K for task_struct),
> and it is possible given a few fixes for massive (>1KB) stack allocation
> in the pci layer and a few others. So far 4KB hasn't overflowed on any
> of the tasks I normally run (checked using a stack overflow checker that
> follows).
>

Ben,

Just what are the tasks you normally run - and how many code
paths do you think there are out there which you do not run. XFS
might get a bit stack hungry in places, we try to keep it down,
but when you get into file system land things can stack up quickly:

NFS server -> file system -> block layer -> device driver

With possibly some form of volume management out there too.

I am pounding away on xfs with your code in there including the
checker, and so far it is surviving. But I only have a plain old
scsi drive underneath, and no NFS on top.

Steve

--

Steve Lord voice: +1-651-683-3511
Principal Engineer, Filesystem Software email: [email protected]

2002-06-05 22:31:55

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Wed, Jun 05, 2002 at 05:15:23PM -0500, Steve Lord wrote:
> Just what are the tasks you normally run - and how many code
> paths do you think there are out there which you do not run. XFS
> might get a bit stack hungry in places, we try to keep it down,
> but when you get into file system land things can stack up quickly:

You already lose in that case today, as multiple irqs may come in
from devices and eat up the stack. The whole thing that led me down
this line is seeing it happen in real life. What remains to be done
is to write an automated stack depth checker based on possible call
chains that will calculate the maximum possible stack depth. I've
already got scripts for dumping the top stack users, it's a matter
of writing code that can show us the possible call chains.

-ben

2002-06-05 23:19:45

by David Miller

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

From: Benjamin LaHaise <[email protected]>
Date: Wed, 5 Jun 2002 18:31:52 -0400

On Wed, Jun 05, 2002 at 05:15:23PM -0500, Steve Lord wrote:
> Just what are the tasks you normally run - and how many code
> paths do you think there are out there which you do not run. XFS
> might get a bit stack hungry in places, we try to keep it down,
> but when you get into file system land things can stack up quickly:

You already lose in that case today, as multiple irqs may come in
from devices and eat up the stack.

I agree with Ben, if things explode due to stack overflow with his
changes they are almost certain to explode before his changes.

2002-06-06 00:24:51

by Larry McVoy

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Wed, Jun 05, 2002 at 04:13:42PM -0700, David S. Miller wrote:
> From: Benjamin LaHaise <[email protected]>
> Date: Wed, 5 Jun 2002 18:31:52 -0400
>
> On Wed, Jun 05, 2002 at 05:15:23PM -0500, Steve Lord wrote:
> > Just what are the tasks you normally run - and how many code
> > paths do you think there are out there which you do not run. XFS
> > might get a bit stack hungry in places, we try to keep it down,
> > but when you get into file system land things can stack up quickly:
>
> You already lose in that case today, as multiple irqs may come in
> from devices and eat up the stack.
>
> I agree with Ben, if things explode due to stack overflow with his
> changes they are almost certain to explode before his changes.

Just a "me too". I like Ben's patch, it seems like it is a sort of
"bloat meter", if you overflow the stack that suggests something is
wrong, and it isn't stack size.
--
---
Larry McVoy lm at bitmover.com http://www.bitmover.com/lm

2002-06-06 01:15:44

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

"David S. Miller" <[email protected]> writes:

> I agree with Ben, if things explode due to stack overflow with his
> changes they are almost certain to explode before his changes.

Things can explode anyways as handle_irq_event does __sti even when you have
zero stack left. And there can always be another interrupt. But it doesn't
happen that often in practice as you suggest.

So either fix it completely which would be adding stack checking to
do_IRQ (and not __sti when you have less than a few hundred bytes left) or
ignore the case as it doesn't seem to happen in practice regularly
(at least I've never seen it and I see quite a lot of crashes)

Interrupt stack doesn't fix the fundamental problem. It only makes it less
likely to hit. Crippling user context code for this doesn't seem to be helpful
at least for i386 (although I would prefer it for x86-64 because the 8K
stack even without interrupts is rather tight for 64bit). But still I would
likely prefer to enlarge the x86-64 stack than to see useful software
not working because of this.

The scenario Steve outlined was rather optimistic - more pessimistic
case would be e.g:
you run NBD which calls the network stack with a complex file system on top
of it called by something else complex that does a GFP_KERNEL alloc and VM
wants to flush a page via the NBD file system - I don't see how you'll ever
manage to fit that into 4K.

-Andi

2002-06-06 01:42:39

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Thu, Jun 06, 2002 at 03:15:17AM +0200, Andi Kleen wrote:
> The scenario Steve outlined was rather optimistic - more pessimistic
> case would be e.g:
> you run NBD which calls the network stack with an complex file system on top
> of it called by something else complex that does a GFP_KERNEL alloc and VM
> wants to flush a page via the NBD file system - I don't see how you'll ever
> manage to fit that into 4K.

Which is, honestly, a bug. The IO subsystem should not be capable of
engaging in such deep recursion. ext2/ext3 barely allocate anything
on the stack (0x90 bytes at most in only a couple of calls), the vm
is in a similar state, and even the network stack's largest allocations
are in syscalls and timer code. Face it, the majority of code that is
or could cause problems are things that probably need fixing anyways.

-ben
--
"You will be reincarnated as a toad; and you will be much happier."

2002-06-06 02:35:35

by Steve Lord

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Wed, 2002-06-05 at 20:42, Benjamin LaHaise wrote:
> On Thu, Jun 06, 2002 at 03:15:17AM +0200, Andi Kleen wrote:
> > The scenario Steve outlined was rather optimistic - more pessimistic
> > case would be e.g:
> > you run NBD which calls the network stack with an complex file system on top
> > of it called by something else complex that does a GFP_KERNEL alloc and VM
> > wants to flush a page via the NBD file system - I don't see how you'll ever
> > manage to fit that into 4K.
>
> Which is, honestly, a bug. The IO subsystem should not be capable of
> engaging in such deep recursion. ext2/ext3 barely allocate anything
> on the stack (0x90 bytes at most in only a couple of calls), the vm
> is in a similar state, and even the network stack's largest allocations
> are in syscalls and timer code. Face it, the majority of code that is
> or could cause problems are things that probably need fixing anyways.
>

Well, reclaiming memory within the same thread which is allocating
memory is surely the root cause of this, not the I/O system.


Steve


2002-06-06 13:34:09

by Ulrich Weigand

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

Benjamin LaHaise wrote:

>Which is, honestly, a bug. The IO subsystem should not be capable of
>engaging in such deep recursion. ext2/ext3 barely allocate anything
>on the stack (0x90 bytes at most in only a couple of calls), the vm
>is in a similar state, and even the network stack's largest allocations
>are in syscalls and timer code. Face it, the majority of code that is
>or could cause problems are things that probably need fixing anyways.

Just FYI here's a stack trace of a real-world stack overflow
situation we caught on s390 (before we switched to separate
interrupt stacks).

What goes on is this: system call does copy_from_user, which causes
a page fault, which goes through mm, filesystem, and buffer cache code,
calls alloc_pages there, which is out of memory and goes through
try_to_free_pages and the whole swap-out stack. At this time an
interrupt hits, followed by softirq processing. This runs the complete
TCP/IP stack and network device driver. This finally calls into kfree,
which is the last straw ... (Of course another interrupt could still
have occurred at this point.)

Of course, the situation is particularly bad on s390, because every
function call needs at least 96 bytes on the stack (due to the register
save areas required by our ABI).

48 kfree [0x3bfae] SP=0x31ae7c8, FP=0x31ae828, SIZE=112
47 skb_release_data+170 [0x1ad632] SP=0x31ae838, FP=0x31ae898, SIZE=96
46 kfree_skbmem+54 [0x1ad672] SP=0x31ae898, FP=0x31ae8f8, SIZE=104
45 __kfree_skb+262 [0x1ad7ea] SP=0x31ae900, FP=0x31ae960, SIZE=96
44 lcs_txpacket+712 [0x18b63c] SP=0x31ae960, FP=0x31ae9c0, SIZE=120
43 qdisc_restart+184 [0x1b9ec0] SP=0x31ae9d8, FP=0x31aea38, SIZE=104
42 dev_queue_xmit+466 [0x1b18fe] SP=0x31aea40, FP=0x31aeaa0, SIZE=96
41 ip_output+340 [0x1c3cd8] SP=0x31aeaa0, FP=0x31aeb00, SIZE=96
40 ip_build_and_send_pkt+542 [0x1c38fe] SP=0x31aeb00, FP=0x31aeb60, SIZE=104
39 tcp_v4_send_synack+196 [0x1dbde8] SP=0x31aeb68, FP=0x31aebc8, SIZE=96
38 tcp_v4_conn_request+1030 [0x1dc25e] SP=0x31aebc8, FP=0x31aec28, SIZE=536
37 tcp_rcv_state_process+308 [0x1d44c4] SP=0x31aede0, FP=0x31aee40, SIZE=104
36 tcp_v4_do_rcv+292 [0x1dcb0c] SP=0x31aee48, FP=0x31aeea8, SIZE=96
35 tcp_v4_rcv+1436 [0x1dd124] SP=0x31aeea8, FP=0x31aef08, SIZE=96
34 ip_local_deliver+370 [0x1c090e] SP=0x31aef08, FP=0x31aef68, SIZE=96
33 ip_rcv+1122 [0x1c0e26] SP=0x31aef68, FP=0x31aefc8, SIZE=112
32 net_rx_action+634 [0x1b241a] SP=0x31aefd8, FP=0x31af038, SIZE=144
31 do_softirq+168 [0x23ac8] SP=0x31af068, FP=0x31af0c8, SIZE=112
30 io_return_bh [0x13f96] SP=0x31af0d8, FP=0x31af138, SIZE=248
29 try_to_swap_out [0x3dac8] SP=0x31af1d0, FP=0x31af230, SIZE=104
28 swap_out_pmd+320 [0x3de9c] SP=0x31af238, FP=0x31af298, SIZE=104
27 swap_out_vma+208 [0x3df8c] SP=0x31af2a0, FP=0x31af300, SIZE=144
26 swap_out_mm+130 [0x3e05a] SP=0x31af330, FP=0x31af390, SIZE=96
25 swap_out+268 [0x3e19c] SP=0x31af390, FP=0x31af3f0, SIZE=96
24 refill_inactive+158 [0x3fbf2] SP=0x31af3f0, FP=0x31af450, SIZE=96
23 do_try_to_free_pages+132 [0x3fcb4] SP=0x31af450, FP=0x31af4b0, SIZE=96
22 try_to_free_pages+66 [0x3feaa] SP=0x31af4b0, FP=0x31af510, SIZE=96
21 __alloc_pages+580 [0x40f3c] SP=0x31af510, FP=0x31af570, SIZE=104
20 _alloc_pages+54 [0x40cee] SP=0x31af578, FP=0x31af5d8, SIZE=96
19 grow_buffers+140 [0x4dcb8] SP=0x31af5d8, FP=0x31af638, SIZE=96
18 refill_freelist+96 [0x4a7f4] SP=0x31af638, FP=0x31af698, SIZE=96
17 getblk+644 [0x4b1d0] SP=0x31af698, FP=0x31af6f8, SIZE=96
16 bread+56 [0x4b69c] SP=0x31af6f8, FP=0x31af758, SIZE=104
15 ext2_get_branch+156 [0x74fa0] SP=0x31af760, FP=0x31af7c0, SIZE=104
14 ext2_get_block+202 [0x753d2] SP=0x31af7c8, FP=0x31af828, SIZE=208
13 block_read_full_page+320 [0x4c788] SP=0x31af898, FP=0x31af8f8, SIZE=144
12 ext2_readpage+46 [0x758c6] SP=0x31af928, FP=0x31af988, SIZE=96
11 read_cluster_nonblocking+346 [0x34d52] SP=0x31af988, FP=0x31af9e8, SIZE=96
10 filemap_nopage+576 [0x36600] SP=0x31af9e8, FP=0x31afa48, SIZE=112
9 do_no_page+142 [0x3198a] SP=0x31afa58, FP=0x31afab8, SIZE=96
8 handle_mm_fault+262 [0x31bd6] SP=0x31afab8, FP=0x31afb18, SIZE=120
7 do_page_fault+722 [0x12d5a] SP=0x31afb30, FP=0x31afb90, SIZE=104
6 pgm_dn [0x13ef8] SP=0x31afb98, FP=0x31afbf8, SIZE=248
5 tcp_sendmsg [0x1c9ed0] SP=0x31afc90, FP=0x31afcf0, SIZE=176
4 inet_sendmsg+84 [0x1ea538] SP=0x31afd40, FP=0x31afda0, SIZE=96
3 sock_sendmsg+132 [0x1a97c8] SP=0x31afda0, FP=0x31afe00, SIZE=120
2 sock_write+186 [0x1a9a1e] SP=0x31afe18, FP=0x31afe78, SIZE=136
1 sys_write+214 [0x47d2e] SP=0x31afea0, FP=0x31aff00, SIZE=104
0 pgm_system_call+34 [0x138c0] SP=0x31aff08, FP=0x31aff68, SIZE=248


Mit freundlichen Gruessen / Best Regards

Ulrich Weigand

--
Dr. Ulrich Weigand
Linux for S/390 Design & Development
IBM Deutschland Entwicklung GmbH, Schoenaicher Str. 220, 71032 Boeblingen
Phone: +49-7031/16-3727 --- Email: [email protected]

2002-06-06 17:38:00

by Pete Zaitcev

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

>[...]
> Of course, the situation is particularly bad on s390, because every
> function call needs at least 96 bytes on the stack (due to the register
> save areas required by our ABI).

How is this different from sparc64?

-- Pete

2002-06-06 19:24:59

by Ulrich Weigand

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86


Pete Zaitcev wrote:

>> Of course, the situation is particularly bad on s390, because every
>> function call needs at least 96 bytes on the stack (due to the register
>> save areas required by our ABI).
>
>How is this different from sparc64?

Well, I guess it's similar on sparc. I'm not sure about the size of the
save areas needed on sparc, though. In any case both sparc and s390 are
certainly much worse w.r.t. stack space usage than i386 ...

(*Really* ugly is s390x, because we need about twice as much stack on
average than on s390, but page size is still only 4K -- most other 64-bit
platforms have 8K page size ...)


Mit freundlichen Gruessen / Best Regards

Ulrich Weigand

--
Dr. Ulrich Weigand
Linux for S/390 Design & Development
IBM Deutschland Entwicklung GmbH, Schoenaicher Str. 220, 71032 Boeblingen
Phone: +49-7031/16-3727 --- Email: [email protected]

2002-06-06 19:49:16

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

"Ulrich Weigand" <[email protected]> writes:

> (*Really* ugly is s390x, because we need about twice as much stack on
> average than on s390, but page size is still only 4K -- most other 64-bit
> platforms have 8K page size ...)

<minor detail, but perhaps still interesting>

Seems to be an old myth. Actually the 4K paged 64bit platforms are in the majority.

64bit linux platforms:

4K page: x86-64, ppc64, s390x, mips64, parisc64(?)
8K: alpha, sparc64
8-64K: ia64

-Andi

2002-06-06 20:28:16

by David Mosberger

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

>>>>> On 06 Jun 2002 21:49:15 +0200, Andi Kleen <[email protected]> said:

Andi> "Ulrich Weigand" <[email protected]> writes:
>> (*Really* ugly is s390x, because we need about twice as much
>> stack on average than on s390, but page size is still only 4K --
>> most other 64-bit platforms have 8K page size ...)

Andi> <minor detail, but perhaps still interesting>

Andi> Seems to be an old myth. Actually the 4K paged 64bit platforms
Andi> are in the majority.

Andi> 64bit linux platforms:

Andi> 4K page: x86-64, ppc64, s390x, mips64, parisc64(?)
Andi> 8K: alpha, sparc64
Andi> 8-64K: ia64

Just a minor nit: for ia64 it's either 32KB (for page sizes up to
16KB) or 64KB (for 64KB page size). The 32KB is conservative and
based on the assumption that there can be up to 16 nested interrupts
plus some other nested traps (such as unaligned faults). A separate
irq stack should let us reduce the per-task stack size.

--david

2002-06-06 20:55:24

by Ulrich Weigand

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86


David Mosberger wrote:

>Just a minor nit: for ia64 it's either 32KB (for page sizes up to
>16KB) or 64KB (for 64KB page size). The 32KB is conservative and
>based on the assumption that there can be up to 16 nested interrupts
>plus some other nested traps (such as unaligned faults). A separate
>irq stack should let us reduce the per-task stack size.

So in the case of 8K page size, you need an order-2 allocation
for the stack, right? How do you handle failures due to fragmentation?


Mit freundlichen Gruessen / Best Regards

Ulrich Weigand

--
Dr. Ulrich Weigand
Linux for S/390 Design & Development
IBM Deutschland Entwicklung GmbH, Schoenaicher Str. 220, 71032 Boeblingen
Phone: +49-7031/16-3727 --- Email: [email protected]


2002-06-06 21:19:06

by David Mosberger

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

>>>>> On Thu, 6 Jun 2002 22:55:12 +0200, "Ulrich Weigand" <[email protected]> said:

Uli> So in the case of 8K page size, you need an order-2 allocation
Uli> for the stack, right? How do you handle failures due to
Uli> fragmentation?

We don't do anything special. I'm not sure what the fragmentation
statistics look like on machines with 1+GB memory; it's something I
have been wondering about and hoping to look into at some point (if
someone has done that already, I'd love to see the results). In
practice, every ia64 linux distro as of today ships with 16KB page
size, so you only get order-1 allocations for stacks.

--david

2002-06-06 22:11:17

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

> We don't do anything special. I'm not sure what the fragmentation
> statistics look like on machines with 1+GB memory; it's something I
> have been wondering about and hoping to look into at some point (if
> someone has done that already, I'd love to see the results). In
> practice, every ia64 linux distro as of today ships with 16KB page
> size, so you only get order-1 allocations for stacks.

I mailed out a patch that creates /proc/buddyinfo, which should give
you frag stats very easily .... would be interested to know if that works
on your machine ... slightly updated patch against 2.4.19-pre10 below:

diff -urN virgin-2.4.19-pre10/fs/proc/proc_misc.c linux-2.4.19-pre10-buddyinfo/fs/proc/proc_misc.c
--- virgin-2.4.19-pre10/fs/proc/proc_misc.c Wed Jun 5 16:32:15 2002
+++ linux-2.4.19-pre10-buddyinfo/fs/proc/proc_misc.c Wed Jun 5 16:56:05 2002
@@ -213,6 +213,21 @@
#undef K
}

+extern int buddyinfo(char *buf, int node_id);
+
+int buddyinfo_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int node_id;
+ int len = 0;
+
+ for (node_id = 0; node_id < numnodes; node_id++) {
+ len += buddyinfo(page+len, node_id);
+ }
+
+ return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
static int version_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -589,6 +604,8 @@
entry->proc_fops = &proc_kmsg_operations;
create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+ create_proc_read_entry("buddyinfo", S_IWUSR | S_IRUGO, NULL,
+ buddyinfo_read_proc, NULL);
#ifdef CONFIG_MODULES
create_seq_entry("ksyms", 0, &proc_ksyms_operations);
#endif
diff -urN virgin-2.4.19-pre10/mm/page_alloc.c linux-2.4.19-pre10-buddyinfo/mm/page_alloc.c
--- virgin-2.4.19-pre10/mm/page_alloc.c Wed Jun 5 16:32:33 2002
+++ linux-2.4.19-pre10-buddyinfo/mm/page_alloc.c Wed Jun 5 16:57:17 2002
@@ -853,3 +853,39 @@
}

__setup("memfrac=", setup_mem_frac);
+
+
+/*
+ * This walks the freelist for each zone. Whilst this is slow, I'd rather
+ * be slow here than slow down the fast path by keeping stats - mjbligh
+ */
+int buddyinfo(char *buf, int node_id)
+{
+ int zone_id, order, free, len = 0;
+ unsigned long flags;
+ zone_t *zone;
+ free_area_t * area;
+ struct list_head *head, *curr;
+
+ for (zone_id = 0; zone_id < MAX_NR_ZONES; ++zone_id) {
+ zone = &(NODE_DATA(node_id)->node_zones[zone_id]);
+ if (zone->size == 0)
+ continue;
+ spin_lock_irqsave(&zone->lock, flags);
+ len += sprintf(buf+len, "Node %d, Zone %8s, ",
+ node_id, zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ area = zone->free_area + order;
+ head = &area->free_list;
+ free = 0;
+ for (curr = head->next; curr != head; curr = curr->next)
+ ++free;
+ len += sprintf(buf+len, "%d ", free);
+ }
+ len += sprintf(buf+len, "\n");
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return len;
+}
+

2002-06-07 00:59:31

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

<[email protected]> said:
Uli> So in the case of 8K page size, you need an order-2 allocation
Uli> for the stack, right? How do you handle failures due to
Uli> fragmentation?

On Thu, Jun 06, 2002 at 02:19:02PM -0700, David Mosberger wrote:
> We don't do anything special. I'm not sure what the fragmentation
> statistics look like on machines with 1+GB memory; it's something I
> have been wondering about and hoping to look into at some point (if
> someone has done that already, I'd love to see the results). In
> practice, every ia64 linux distro as of today ships with 16KB page
> size, so you only get order-1 allocations for stacks.

I've been collecting information on this as well, as I've been
maintaining a patch to support deferred coalescing in the page-level
allocator. Martin Bligh contributed the code to collect some
fragmentation statistics for that patch, which was originally written
for mainline 2.4. It's slightly less expensive in lazy_buddy, as the
algorithm requires some accounting anyway, but some other statistics
besides population counts for various block sizes might also be useful
to track here. I'm holding off until I read up on seq_file() and convert
the /proc/ reporting over to it before the next release of it, though I
do have more current diffs than I've announced. I wouldn't mind at all
hearing from those who have more stringent fragmentation requirements.

Cheers,
Bill

2002-06-07 01:10:05

by David Miller

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

From: Pete Zaitcev <[email protected]>
Date: Thu, 6 Jun 2002 13:37:58 -0400

>[...]
> Of course, the situation is particularly bad on s390, because every
> function call needs at least 96 bytes on the stack (due to the register
> save areas required by our ABI).

How is this different from sparc64?

Sparc64 even eats 192 bytes per function call, minimum. Sibling call
optimization in current GCC helps, but it is still an issue.

2002-06-07 11:27:48

by Maciej W. Rozycki

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On 6 Jun 2002, Andi Kleen wrote:

> Seems to be an old myth. Actually the 4K paged 64bit platforms are in the majority.
>
> 64bit linux platforms:
>
> 4K page: x86-64, ppc64, s390x, mips64, parisc64(?)
> 8K: alpha, sparc64
> 8-64K: ia64

MIPS64 is 4K-16M (per page), currently fixed at 4K, but it can be changed
if desirable.

--
+ Maciej W. Rozycki, Technical University of Gdansk, Poland +
+--------------------------------------------------------------+
+ e-mail: [email protected], PGP key available +

2002-06-09 17:59:30

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

Hi!

> The scenario Steve outlined was rather optimistic - more pessimistic
> case would be e.g:
> you run NBD which calls the network stack with an complex file system on top
> of it called by something else complex that does a GFP_KERNEL alloc and VM
> wants to flush a page via the NBD file system -

Actually, at this point we are dead anyway because of locks in NBD. NBD should
be careful to use GFP_NOIO.
Pavel
--
Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt,
details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html.

2002-06-09 18:52:36

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC] 4KB stack + irq stack for x86

On Sun, Jun 02, 2002 at 05:52:02PM +0200, Pavel Machek wrote:
> Hi!
>
> > The scenario Steve outlined was rather optimistic - more pessimistic
> > case would be e.g:
> > you run NBD which calls the network stack with an complex file system on top
> > of it called by something else complex that does a GFP_KERNEL alloc and VM
> > wants to flush a page via the NBD file system -
>
> Actually, at this point we are dead anyway because of locks in NBD. NBD should
> be careful to use GFP_NOIO.

Reread what I wrote. It does not involve NBD recursing.

-Andi