This patch implements task_struct plus kernel stack colouring for the x86
CPU family, and it can be easily adapted to other architectures.
The slab allocator for task_structs (Manfred) is cacheline aligned and
gives us free colouring without any other intervention.
The kernel stack frame jittering is done inside arch/??/kernel/process.c
and right now uses 8 colours (STACK_COLOUR_BITS == 3).
The task_struct pointer is kept at the stack base for a fast seek.
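To illustrate the jitter arithmetic, here is a stand-alone sketch (user
space, not kernel code; PAGE_SHIFT == 12 and 32-byte L1 lines are assumed
for i386):

#include <stdio.h>

#define PAGE_SHIFT        12
#define L1_CACHE_SHIFT    5                    /* 32-byte lines assumed */
#define STACK_COLOUR_BITS 3
#define STACK_COLOUR_MASK ((1UL << STACK_COLOUR_BITS) - 1)
#define STACK_SHIFT_BITS  (PAGE_SHIFT + 1)     /* stacks are 8KB aligned */

int main(void)
{
	unsigned long base, colour;

	/* eight consecutive 8KB-aligned stack bases cycle through the 8 colours */
	for (base = 0x100000; base < 0x100000 + 8 * 8192; base += 8192) {
		colour = (base >> STACK_SHIFT_BITS) & STACK_COLOUR_MASK;
		printf("stack base %#lx -> colour %lu, top shifted down %lu bytes\n",
		       base, colour, colour << L1_CACHE_SHIFT);
	}
	return 0;
}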
The patch is briefly described here:
http://www.xmailserver.org/linux-patches/misc.html#TskStackCol
- Davide
[diffstat]
arch/i386/kernel/entry.S | 7 ++---
arch/i386/kernel/head.S | 2 -
arch/i386/kernel/init_task.c | 14 ++++++++--
arch/i386/kernel/irq.c | 2 -
arch/i386/kernel/process.c | 58 +++++++++++++++++++++++++++++++++++++++++--
arch/i386/kernel/smpboot.c | 2 -
arch/i386/kernel/traps.c | 2 -
arch/i386/lib/getuser.S | 3 ++
include/asm-i386/current.h | 6 ++--
include/asm-i386/hw_irq.h | 3 +-
include/asm-i386/processor.h | 21 ++++++++++-----
include/linux/sched.h | 16 ++++++++++-
init/main.c | 6 ++++
kernel/ksyms.c | 2 -
14 files changed, 118 insertions, 26 deletions
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/entry.S linux-2.5.0.sstc/arch/i386/kernel/entry.S
--- linux-2.5.0.vanilla/arch/i386/kernel/entry.S Fri Nov 2 17:18:49 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/entry.S Sun Dec 2 16:34:15 2001
@@ -130,7 +130,8 @@
#define GET_CURRENT(reg) \
movl $-8192, reg; \
- andl %esp, reg
+ andl %esp, reg; \
+ movl (reg), reg;
ENTRY(lcall7)
pushfl # We get a different stack layout with call gates,
@@ -144,7 +145,7 @@
movl %ecx,CS(%esp) #
movl %esp,%ebx
pushl %ebx
- andl $-8192,%ebx # GET_CURRENT
+ GET_CURRENT(%ebx)
movl exec_domain(%ebx),%edx # Get the execution domain
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x7
@@ -165,7 +166,7 @@
movl %ecx,CS(%esp) #
movl %esp,%ebx
pushl %ebx
- andl $-8192,%ebx # GET_CURRENT
+ GET_CURRENT(%ebx)
movl exec_domain(%ebx),%edx # Get the execution domain
movl 4(%edx),%edx # Get the lcall7 handler for the domain
pushl $0x27
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/head.S linux-2.5.0.sstc/arch/i386/kernel/head.S
--- linux-2.5.0.vanilla/arch/i386/kernel/head.S Wed Jun 20 11:00:53 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/head.S Sun Dec 2 18:17:15 2001
@@ -320,7 +320,7 @@
ret
ENTRY(stack_start)
- .long SYMBOL_NAME(init_task_union)+8192
+ .long SYMBOL_NAME(init_ts)+8192
.long __KERNEL_DS
/* This is the default interrupt "handler" :-) */
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/init_task.c linux-2.5.0.sstc/arch/i386/kernel/init_task.c
--- linux-2.5.0.vanilla/arch/i386/kernel/init_task.c Mon Sep 17 15:29:09 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/init_task.c Sun Dec 2 21:49:14 2001
@@ -18,9 +18,17 @@
* way process stacks are handled. This is done by having a special
* "init_task" linker map entry..
*/
-union task_union init_task_union
- __attribute__((__section__(".data.init_task"))) =
- { INIT_TASK(init_task_union.task) };
+struct init_task_struct init_ts __attribute__((__section__(".data.init_task"))) =
+{
+ stack: {(unsigned long) &init_ts.ftsk.task, 0, },
+ ftsk: {
+ task: INIT_TASK(init_ts.ftsk.task),
+ count: ATOMIC_INIT(1),
+ stack: (unsigned long) init_ts.stack,
+ stack_top: (unsigned long) init_ts.stack + INIT_TASK_SIZE
+ }
+};
+
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/irq.c linux-2.5.0.sstc/arch/i386/kernel/irq.c
--- linux-2.5.0.vanilla/arch/i386/kernel/irq.c Thu Oct 25 13:53:46 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/irq.c Sun Dec 2 17:50:26 2001
@@ -223,7 +223,7 @@
continue;
}
esp &= ~(THREAD_SIZE-1);
- esp += sizeof(struct task_struct);
+ esp += sizeof(long);
show_stack((void*)esp);
}
printk("\nCPU %d:",cpu);
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/process.c linux-2.5.0.sstc/arch/i386/kernel/process.c
--- linux-2.5.0.vanilla/arch/i386/kernel/process.c Thu Oct 4 18:42:54 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/process.c Sun Dec 2 21:50:44 2001
@@ -575,18 +575,27 @@
#define savesegment(seg,value) \
asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
+#define STACK_COLOUR_BITS 3
+#define STACK_COLOUR_MASK ((1 << STACK_COLOUR_BITS) - 1)
+#define STACK_SHIFT_BITS (PAGE_SHIFT + 1)
+
+static inline unsigned long get_stack_jitter(struct task_struct *p)
+{
+ return ((TSK_TO_KSTACK(p) >> STACK_SHIFT_BITS) & STACK_COLOUR_MASK) << L1_CACHE_SHIFT;
+}
+
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
struct pt_regs * childregs;
- childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
+ childregs = ((struct pt_regs *) (THREAD_SIZE + TSK_TO_KSTACK(p) - get_stack_jitter(p))) - 1;
struct_cpy(childregs, regs);
childregs->eax = 0;
childregs->esp = esp;
- p->thread.esp = (unsigned long) childregs;
+ p->thread.esp = TSK_KSTACK_TOP(p) = (unsigned long) childregs;
p->thread.esp0 = (unsigned long) (childregs+1);
p->thread.eip = (unsigned long) ret_from_fork;
@@ -821,3 +830,48 @@
}
#undef last_sched
#undef first_sched
+
+
+/*
+ * this part should be moved in a system independent section.
+ */
+static kmem_cache_t * tsk_cache;
+
+void __init init_tsk_allocator(void)
+{
+ tsk_cache = kmem_cache_create("task_struct_cache",
+ sizeof(struct full_task_struct),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!tsk_cache)
+ panic("Cannot create task struct cache");
+
+}
+
+struct task_struct *alloc_task_struct(void)
+{
+ struct full_task_struct *f = (struct full_task_struct *) kmem_cache_alloc(tsk_cache, GFP_KERNEL);
+
+ if (!f)
+ return NULL;
+ f->stack = __get_free_pages(GFP_KERNEL, 1);
+ if (!f->stack) {
+ kmem_cache_free(tsk_cache, f);
+ return NULL;
+ }
+ atomic_set(&f->count, 1);
+ *((struct task_struct **) f->stack) = (struct task_struct *) f;
+ return (struct task_struct *) f;
+}
+
+void free_task_struct(struct task_struct *p)
+{
+ struct full_task_struct *f = (struct full_task_struct *) p;
+
+ if (atomic_dec_and_test(&f->count)) {
+ free_pages(f->stack, 1);
+ kmem_cache_free(tsk_cache, f);
+ }
+}
+
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/smpboot.c linux-2.5.0.sstc/arch/i386/kernel/smpboot.c
--- linux-2.5.0.vanilla/arch/i386/kernel/smpboot.c Wed Nov 21 10:35:48 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/smpboot.c Sun Dec 2 19:31:23 2001
@@ -815,7 +815,7 @@
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
- stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
+ stack_start.esp = (void *) (THREAD_SIZE + (char *) TSK_TO_KSTACK(idle));
/*
* This grunge runs the startup process for
diff -Nru linux-2.5.0.vanilla/arch/i386/kernel/traps.c linux-2.5.0.sstc/arch/i386/kernel/traps.c
--- linux-2.5.0.vanilla/arch/i386/kernel/traps.c Sun Sep 30 12:26:08 2001
+++ linux-2.5.0.sstc/arch/i386/kernel/traps.c Sun Dec 2 19:37:15 2001
@@ -209,7 +209,7 @@
printk("ds: %04x es: %04x ss: %04x\n",
regs->xds & 0xffff, regs->xes & 0xffff, ss);
printk("Process %s (pid: %d, stackpage=%08lx)",
- current->comm, current->pid, 4096+(unsigned long)current);
+ current->comm, current->pid, TSK_TO_KSTACK(current));
/*
* When in-kernel, we also print out the stack and code at the
* time of the fault..
diff -Nru linux-2.5.0.vanilla/arch/i386/lib/getuser.S linux-2.5.0.sstc/arch/i386/lib/getuser.S
--- linux-2.5.0.vanilla/arch/i386/lib/getuser.S Mon Jan 12 13:42:52 1998
+++ linux-2.5.0.sstc/arch/i386/lib/getuser.S Sun Dec 2 16:34:15 2001
@@ -29,6 +29,7 @@
__get_user_1:
movl %esp,%edx
andl $0xffffe000,%edx
+ movl (%edx),%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
1: movzbl (%eax),%edx
@@ -42,6 +43,7 @@
movl %esp,%edx
jc bad_get_user
andl $0xffffe000,%edx
+ movl (%edx),%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
2: movzwl -1(%eax),%edx
@@ -55,6 +57,7 @@
movl %esp,%edx
jc bad_get_user
andl $0xffffe000,%edx
+ movl (%edx),%edx
cmpl addr_limit(%edx),%eax
jae bad_get_user
3: movl -3(%eax),%edx
diff -Nru linux-2.5.0.vanilla/include/asm-i386/current.h linux-2.5.0.sstc/include/asm-i386/current.h
--- linux-2.5.0.vanilla/include/asm-i386/current.h Fri Aug 14 16:35:22 1998
+++ linux-2.5.0.sstc/include/asm-i386/current.h Sun Dec 2 16:34:15 2001
@@ -5,9 +5,9 @@
static inline struct task_struct * get_current(void)
{
- struct task_struct *current;
- __asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
- return current;
+ unsigned long *tskptr;
+ __asm__("andl %%esp,%0; ":"=r" (tskptr) : "0" (~8191UL));
+ return (struct task_struct *) *tskptr;
}
#define current get_current()
diff -Nru linux-2.5.0.vanilla/include/asm-i386/hw_irq.h linux-2.5.0.sstc/include/asm-i386/hw_irq.h
--- linux-2.5.0.vanilla/include/asm-i386/hw_irq.h Thu Nov 22 11:46:18 2001
+++ linux-2.5.0.sstc/include/asm-i386/hw_irq.h Sun Dec 2 22:22:06 2001
@@ -115,7 +115,8 @@
#define GET_CURRENT \
"movl %esp, %ebx\n\t" \
- "andl $-8192, %ebx\n\t"
+ "andl $-8192, %ebx\n\t" \
+ "movl (%ebx), %ebx\n\t"
/*
* SMP has a few special interrupts for IPI messages
diff -Nru linux-2.5.0.vanilla/include/asm-i386/processor.h linux-2.5.0.sstc/include/asm-i386/processor.h
--- linux-2.5.0.vanilla/include/asm-i386/processor.h Thu Nov 22 11:46:19 2001
+++ linux-2.5.0.sstc/include/asm-i386/processor.h Sun Dec 2 22:22:06 2001
@@ -14,6 +14,8 @@
#include <asm/types.h>
#include <asm/sigcontext.h>
#include <asm/cpufeature.h>
+#include <asm/ptrace.h>
+#include <linux/kernel.h>
#include <linux/cache.h>
#include <linux/config.h>
#include <linux/threads.h>
@@ -444,16 +446,21 @@
}
unsigned long get_wchan(struct task_struct *p);
-#define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-#define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+#define KSTK_EIP(tsk) ((unsigned long *) TSK_KSTACK_TOP(tsk))[EIP]
+#define KSTK_ESP(tsk) ((unsigned long *) TSK_KSTACK_TOP(tsk))[UESP]
+
#define THREAD_SIZE (2*PAGE_SIZE)
-#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
-#define free_task_struct(p) free_pages((unsigned long) (p), 1)
-#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count)
-#define init_task (init_task_union.task)
-#define init_stack (init_task_union.stack)
+extern void init_tsk_allocator(void);
+extern struct task_struct *alloc_task_struct(void);
+extern void free_task_struct(struct task_struct *p);
+
+#define get_task_struct(tsk) atomic_inc(&TSK_COUNT(tsk))
+
+#define init_task (init_ts.ftsk.task)
+#define init_stack (init_ts.stack)
+
struct microcode {
unsigned int hdrver;
diff -Nru linux-2.5.0.vanilla/include/linux/sched.h linux-2.5.0.sstc/include/linux/sched.h
--- linux-2.5.0.vanilla/include/linux/sched.h Thu Nov 22 11:46:19 2001
+++ linux-2.5.0.sstc/include/linux/sched.h Sun Dec 2 22:22:06 2001
@@ -507,12 +507,24 @@
# define INIT_TASK_SIZE 2048*sizeof(long)
#endif
-union task_union {
+#define TSK_TO_KSTACK(p) (((struct full_task_struct *) (p))->stack)
+#define TSK_KSTACK_TOP(p) (((struct full_task_struct *) (p))->stack_top)
+#define TSK_COUNT(p) (((struct full_task_struct *) (p))->count)
+
+struct full_task_struct {
struct task_struct task;
+ atomic_t count;
+ unsigned long stack;
+ unsigned long stack_top;
+};
+
+struct init_task_struct {
unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+ struct full_task_struct ftsk;
};
-extern union task_union init_task_union;
+extern struct init_task_struct init_ts;
+
extern struct mm_struct init_mm;
extern struct task_struct *init_tasks[NR_CPUS];
diff -Nru linux-2.5.0.vanilla/init/main.c linux-2.5.0.sstc/init/main.c
--- linux-2.5.0.vanilla/init/main.c Fri Nov 9 14:15:00 2001
+++ linux-2.5.0.sstc/init/main.c Sun Dec 2 18:04:08 2001
@@ -594,6 +594,12 @@
mempages = num_physpages;
fork_init(mempages);
+ /*
+ * task_struct/stack colouring is implemented only for x86 right now.
+ */
+#ifdef __i386__
+ init_tsk_allocator();
+#endif
proc_caches_init();
vfs_caches_init(mempages);
buffer_init(mempages);
diff -Nru linux-2.5.0.vanilla/kernel/ksyms.c linux-2.5.0.sstc/kernel/ksyms.c
--- linux-2.5.0.vanilla/kernel/ksyms.c Wed Nov 21 14:07:25 2001
+++ linux-2.5.0.sstc/kernel/ksyms.c Sun Dec 2 18:11:17 2001
@@ -548,7 +548,7 @@
/* init task, for moving kthread roots - ought to export a function ?? */
-EXPORT_SYMBOL(init_task_union);
+EXPORT_SYMBOL(init_ts);
EXPORT_SYMBOL(tasklist_lock);
EXPORT_SYMBOL(pidhash);
Hi,
Your patch achieved very good performance in our benchmark test, but
I would like to make the following two points about your implementation
of kernel stack colouring.
1. Your patch implements coloured kernel stacks, but it does not
reduce cache line conflicts.
2. I couldn't see any performance difference whether stack colouring
was done or not.
[1] get_stack_jitter() in arch/i386/kernel/process.c selects 3 bits
from the cache line index bits, so cache conflicts still occur, just at
shifted lines.
Suppose the cache is 256KB 4-way: the distance between data in one block
and data in another block of the same set is a multiple of 64KB. This
means the lower 16 bits of such addresses are always the same.
The patch uses 3 bits, from bit position 13 to 15, so data in the same
set always gets the same colour, as described above. From the viewpoint
of cache miss reduction this colouring has no effect.
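A quick stand-alone check of this point (the 256KB 4-way, 32-byte line
geometry is assumed):

#include <stdio.h>

#define LINE_SHIFT 5
#define NSETS      2048			/* 256KB / 4 ways / 32-byte lines */

static unsigned long set_index(unsigned long addr)
{
	return (addr >> LINE_SHIFT) % NSETS;	/* index bits 5..15 */
}

int main(void)
{
	/* two 8KB stacks whose bases are 64KB apart */
	unsigned long a = 0x210000, b = 0x220000, off = 0x1f00;

	/* the same hot offset maps to the same set in both stacks ... */
	printf("same set: %d\n", set_index(a + off) == set_index(b + off));
	/* ... and bits 13..15 of the bases give them the same colour,
	   so the jittered stack tops still collide */
	printf("same colour: %d\n", ((a >> 13) & 7) == ((b >> 13) & 7));
	return 0;
}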
The patch which I posted before uses 3 bits, from bit position 18 to 20
(for a 1MB 4-way L2 cache), for task_struct colouring.
I suggest the following two ways of doing stack colouring.
(a) Use bits above the cache index bits (e.g. on a 256KB L2-cache
system, STACK_SHIFT_BITS should be 16: 11 index bits + 5 offset
bits); see the sketch after (b) below.
(b) Use a modulo operation for colouring:
in get_stack_jitter() (arch/i386/kernel/process.c)
+#define NUM_COLOUR 9 /* the number of colouring (an odd number) */
static inline unsigned long get_stack_jitter(struct task_struct *p)
{
- return ((TSK_TO_KSTACK(p) >> STACK_SHIFT_BITS) & STACK_COLOUR_MASK) << L1_CACHE_SHIFT;
+ return ((TSK_TO_KSTACK(p) >> STACK_SHIFT_BITS) % NUM_COLOUR) << L1_CACHE_SHIFT;
}
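For (a) the change is just the shift define, e.g. (a sketch only, assuming
the 256KB geometry above, so that the colour comes from bits 16 to 18 of
the stack address):

 in arch/i386/kernel/process.c
-#define STACK_SHIFT_BITS (PAGE_SHIFT + 1)
+#define STACK_SHIFT_BITS 16	/* 11 index bits + 5 offset bits on a 256KB L2 */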
[2] I measured the effects of your patch on a 4-way PIII-Xeon system with
1MB L2 cache using web-bench (apache 1.3.19), and also measured the
performance of a modified version (*), which uses 3 bits, from bit
position 16 to 18, to avoid cache conflicts.
[Benchmarking Result] Request processing performance improvement,
compared to the original kernel (2.5.0):
2.5.0 + Davide's Patch                              ... +11.8% up
2.5.0 + Davide's Patch (STACK_COLOUR_BITS = 0)      ... +11.8% up
2.5.0 + Davide's Patch* (alternate version, [1](a)) ... +11.9% up
Considering these results, the effect of stack colouring is very slight
(+0.1%). This patch's major effect is task_struct colouring, and we saw
no performance gain from the stack colouring, at least in our
experiments.
-----
Computer Systems Laboratories, Fujitsu Labs.
Shuji YAMAMURA ([email protected])
On Wed, 5 Dec 2001, Shuji YAMAMURA wrote:
> Hi,
>
> Your patch achieved very good performance in our benchmark test, but
> I would like to make the following two points about your implementation
> of kernel stack colouring.
>
> 1. Your patch implements coloured kernel stacks, but it does not
> reduce cache line conflicts.
> 2. I couldn't see any performance difference whether stack colouring
> was done or not.
>
> [1] get_stack_jitter() in arch/i386/kernel/process.c selects 3 bits
> from the cache line index bits, so cache conflicts still occur, just
> at shifted lines.
>
> Suppose the cache is 256KB 4-way: the distance between data in one
> block and data in another block of the same set is a multiple of 64KB.
> This means the lower 16 bits of such addresses are always the same.
> The patch uses 3 bits, from bit position 13 to 15, so data in the
> same set always gets the same colour, as described above. From the
> viewpoint of cache miss reduction this colouring has no effect.
>
> The patch which I posted before uses 3 bits, from bit position 18 to
> 20 (for a 1MB 4-way L2 cache), for task_struct colouring.
>
> I suggest the following two ways of doing stack colouring.
>
> (a) Use bits above the cache index bits (e.g. on a 256KB L2-cache
> system, STACK_SHIFT_BITS should be 16: 11 index bits + 5 offset
> bits).
>
> (b) Use a modulo operation for colouring:
>
> in get_stack_jitter() (arch/i386/kernel/process.c)
> +#define NUM_COLOUR 9 /* the number of colouring (an odd number) */
> static inline unsigned long get_stack_jitter(struct task_struct *p)
> {
> - return ((TSK_TO_KSTACK(p) >> STACK_SHIFT_BITS) & STACK_COLOUR_MASK) << L1_CACHE_SHIFT;
> + return ((TSK_TO_KSTACK(p) >> STACK_SHIFT_BITS) % NUM_COLOUR) << L1_CACHE_SHIFT;
> }
Whatever bits you take, it's a random move within a limited memory address
space and does not change the picture.
Stack colouring becomes evident when you have a quite big number of
processes waiting for the same event inside the kernel (accept, ...),
which means they're going to walk the same path on their way in and out
of the kernel.
Simplifying the pattern, with the current implementation and your example
of a 256KB 4-way associative cache and 8KB kernel stacks you get
(statistically):
W0 W1 W2 W3
64Kb -------- -------- -------- --------
| | | | | | | |
| | | | | | | |
| | | | | | | |
sp ->| |->| |->| |->| |
| | | | | | | |
| | | | | | | |
| | | | | | | |
56Kb ........ ........ ........ ........
| | | | | | | |
| | | | | | | |
| | | | | | | |
sp ->| |->| |->| |->| |
| | | | | | | |
| | | | | | | |
| | | | | | | |
48Kb ........ ........ ........ ........
! ! ! ! ! ! ! !
8Kb ........ ........ ........ ........
| | | | | | | |
| | | | | | | |
| | | | | | | |
sp ->| |->| |->| |->| |
| | | | | | | |
| | | | | | | |
| | | | | | | |
0Kb ........ ........ ........ ........
By adding three bits of colouring you're going to cut the collisions to
about 1/8.
- Davide
Davide Libenzi wrote:
>
>
>By adding three bits of colouring you're going to cut the collisions to
>about 1/8.
>
No, Shuji is right:
You have just shifted the problem, without reducing collisions.
256 kB, 4 way cache with 32 byte linesize.
cacheline == bits 15..5
offset within cacheline: bits 4..0
The colouring must depend on more than just bits 13 to 15 - if these
bits are different, then the access goes into a different line even
without colouring, so there won't be a collision.
Shuji, I don't understand why you need both a shift and a modulo:
address % odd_number should generate a random distribution (i.e. all
bits affect the result), even without the shift.
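For instance, colouring a set of 64KB-spaced stack bases (the conflicting
case in the 256KB example) both ways, as a stand-alone sketch:

#include <stdio.h>

int main(void)
{
	unsigned long base;

	/* stack bases 64KB apart all map to the same cache sets;
	   bits 13..15 give them all the same colour, while a plain
	   modulo by an odd number spreads them out */
	for (base = 0x200000; base < 0x200000 + 8 * 0x10000; base += 0x10000)
		printf("base %#lx: bits 13..15 -> %lu, base %% 9 -> %lu\n",
		       base, (base >> 13) & 7, base % 9);
	return 0;
}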
--
Manfred
On Wed, 5 Dec 2001, Manfred Spraul wrote:
> Davide Libenzi wrote:
>
> >
> >
> >By adding three bits of colouring you're going to cut the collisions to
> >about 1/8.
> >
> No, Shuji is right:
> You have just shifted the problem, without reducing collisions.
> 256 kB, 4 way cache with 32 byte linesize.
>
> cacheline == bits 15..5
> offset within cacheline: bits 4..0
>
> The colouring must depend on more than just bits 13 to 15 - if these
> bits are different, then the access goes into a different line even
> without colouring, so there won't be a collision.
Yep, damn true: using bits 13..15 I get a "vertical" shift that is already
implicit.
In that case we need a deeper knowledge of the cache architecture, like
size and associativity.
- Davide
Manfred Spraul writes:
> Shuji, I don't understand why you need both a shift and a modulo:
> address % odd_number should generate a random distribution (i.e. all
> bits affect the result), even without the shift.
The lame-duck reason was: previously the colouring was intended to be used
within the current macro, and the division had to be avoided in such a
frequently used operation.
Now the stack colouring operation is required only at process creation
time, so division by an odd number is sufficient and we'll experiment
with it.
--
Kouichi Kumon, Software Laboratory, Fujitsu Labs.
[email protected]
From: Manfred Spraul <[email protected]>
Subject: Re: [PATCH] task_struct + kernel stack colouring ...
Date: Wed, 05 Dec 2001 21:33:56 +0100
Message-ID: <[email protected]>
> Shuji, I don't understand why you need both a shift and a modulo:
> address % odd_number should generate a random distribution (i.e. all
> bits affect the result), even without the shift.
Yes, you are right. The shift operation is unnecessary.
Sorry for the confusion.
And the modulo method is probably better than copying bits, because
it does not depend on the cache configuration.
My posted patch didn't use modulo in order to avoid overhead in
get_current(), which is called frequently. But copy_thread() is
called only once per process creation, so such overhead should not be
a problem.
Additionally, we measured cache line conflict statistics of the stack tops
by snapshotting them while running apache on a 1MB L2-cache system.
The modulo colouring certainly reduces cache line collisions.
                                      # of tasks  # of cache lines  # of collisions
                                                  used              AVG (MAX)
 -----------------------------------------------------------------------------
 Davide's Patch (original)               246            32             7.68 (14)
 Davide's Patch (copying bits 16 to 18)  244           165             1.48 (6)
 Davide's Patch (modulo 9)               244           164             1.49 (4)
However, we couldn't see a meaningful performance difference among them,
at least in this experiment.
Thank you,
-----
Computer Systems Laboratories, Fujitsu Labs.
Shuji YAMAMURA ([email protected])