2003-05-20 04:20:48

by Rusty Russell

Subject: [PATCH 4/3] Replace dynamic percpu implementation

This requires the first three patches. IA64 untested. See comment.

Name: Dynamic per-cpu allocation using static per-cpu mechanism
Author: Rusty Russell
Status: Tested on 2.5.69-bk13
Depends: Misc/kmalloc_percpu-interface.patch.gz

D: This patch replaces the dynamic per-cpu allocator, alloc_percpu,
D: to make it use the same mechanism as the static per-cpu variables, ie.
D: ptr + __per_cpu_offset[smp_processor_id()] gives the variable address.
D: This allows it to be used in modules (following patch), and hopefully
D: increases space and time efficiency of reference at the same time.
D: It gets moved to its own (SMP-only) file: mm/percpu.c.
D:
D: The basic idea is that when we need more memory, we allocate
D: another NR_CPUS*sizeof(.data.percpu section) chunk, and hand
D: allocations out from that (minus the __per_cpu_offset, which is set
D: at boot from the difference between the .data.percpu section and
D: the initial NR_CPUS*sizeof(.data.percpu section) allocation).
D:
D: The situation is made trickier by archs which want to allocate
D: per-cpu memory near the CPUs which use them: hooks are provided for
D: the initial alloc (initial_percpumem(), which can also change the
D: size of the allocation, eg. to page-align), new_percpumem(), and
D: free_percpumem(). Defining __NEED_PERCPU_ALLOC gets the default
D: implementations (basically: alloc_bootmem, vmalloc and free
D: respectively).
D:
D: Finally, IA64 uses the trick of mapping the local per-cpu area to
D: the same location on each CPU, but it's a fixed mapping of 64k.
D: This trivially maps to an initial_percpumem which always allocates
D: 64k, and a new_percpumem which always returns NULL. This will prove
D: to be insufficient if per-cpu allocations become widespread (see
D: comment). IA64 code is untested.
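
For illustration, here is a minimal caller-side sketch of how the allocator
is meant to be used after this patch. The foo_* names and the error handling
are invented for the example; only __alloc_percpu, free_percpu, per_cpu_ptr,
get_cpu_ptr and put_cpu_ptr come from the interface patched below.

/* Illustrative only: a subsystem keeping one counter per CPU. */
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static unsigned long *foo_count;	/* handle returned by __alloc_percpu */

static int foo_init(void)
{
	/* Zeroed on every possible CPU by __alloc_percpu(). */
	foo_count = __alloc_percpu(sizeof(unsigned long),
				   __alignof__(unsigned long));
	return foo_count ? 0 : -ENOMEM;
}

static void foo_hit(void)
{
	/* Underneath this is just ptr + __per_cpu_offset[smp_processor_id()]. */
	(*get_cpu_ptr(foo_count))++;
	put_cpu_ptr(foo_count);
}

static unsigned long foo_total(void)
{
	unsigned long sum = 0;
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_possible(cpu))
			sum += *per_cpu_ptr(foo_count, cpu);
	return sum;
}

static void foo_exit(void)
{
	free_percpu(foo_count);
}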

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-alpha/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-alpha/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-alpha/percpu.h 2003-01-02 12:25:14.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-alpha/percpu.h 2003-05-20 12:59:17.000000000 +1000
@@ -2,5 +2,6 @@
#define __ALPHA_PERCPU_H

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ALPHA_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-generic/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-generic/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-generic/percpu.h 2003-05-20 12:57:17.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/include/asm-generic/percpu.h 2003-05-20 12:57:19.000000000 +1000
@@ -2,7 +2,6 @@
#define _ASM_GENERIC_PERCPU_H_
#include <linux/compiler.h>

-#define __GENERIC_PER_CPU
#ifdef CONFIG_SMP

/* Separate out the type, so (int[3], foo) works. */
@@ -12,7 +11,7 @@
#endif

#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-
+#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
#endif /* SMP */

#endif /* _ASM_GENERIC_PERCPU_H_ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-h8300/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-h8300/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-h8300/percpu.h 2003-04-20 18:05:12.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/include/asm-h8300/percpu.h 2003-05-20 12:59:37.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_H8300_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_H8300_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-i386/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-i386/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-i386/percpu.h 2003-01-02 12:00:21.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-i386/percpu.h 2003-05-20 12:57:19.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_I386_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_I386_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-m68k/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-m68k/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-m68k/percpu.h 2003-01-02 12:33:55.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-m68k/percpu.h 2003-05-20 13:00:01.000000000 +1000
@@ -2,5 +2,6 @@
#define __ASM_M68K_PERCPU_H

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ASM_M68K_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-parisc/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-parisc/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-parisc/percpu.h 2003-01-02 12:32:48.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-parisc/percpu.h 2003-05-20 13:01:07.000000000 +1000
@@ -2,6 +2,7 @@
#define _PARISC_PERCPU_H

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-ppc/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-ppc/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-ppc/percpu.h 2003-01-02 12:00:21.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-ppc/percpu.h 2003-05-20 13:01:19.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_PPC_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_PPC_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-s390/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-s390/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-s390/percpu.h 2003-01-02 12:06:24.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-s390/percpu.h 2003-05-20 13:02:50.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_S390_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_S390_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-sparc/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-sparc/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-sparc/percpu.h 2003-01-02 12:02:23.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-sparc/percpu.h 2003-05-20 13:03:12.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_SPARC_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_SPARC_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-sparc64/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-sparc64/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-sparc64/percpu.h 2003-01-02 12:02:23.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-sparc64/percpu.h 2003-05-20 13:03:20.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_SPARC64_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_SPARC64_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/asm-x86_64/percpu.h .31630-linux-2.5.69-bk13.updated/include/asm-x86_64/percpu.h
--- .31630-linux-2.5.69-bk13/include/asm-x86_64/percpu.h 2003-01-02 12:32:48.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/include/asm-x86_64/percpu.h 2003-05-20 13:03:29.000000000 +1000
@@ -2,5 +2,6 @@
#define __ARCH_I386_PERCPU__

#include <asm-generic/percpu.h>
+#define __NEED_PERCPU_ALLOC

#endif /* __ARCH_I386_PERCPU__ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/include/linux/percpu.h .31630-linux-2.5.69-bk13.updated/include/linux/percpu.h
--- .31630-linux-2.5.69-bk13/include/linux/percpu.h 2003-05-20 12:57:17.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/include/linux/percpu.h 2003-05-20 12:57:19.000000000 +1000
@@ -5,6 +5,9 @@
#include <linux/string.h> /* For memset() */
#include <asm/percpu.h>

+/* Maximum size allowed for alloc_percpu */
+#define PERCPU_MAX PAGE_SIZE
+
/* Must be an lvalue. */
#define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
#define put_cpu_var(var) preempt_enable()
@@ -15,29 +18,19 @@
#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
extern unsigned long __per_cpu_offset[NR_CPUS];

-struct percpu_data {
- void *ptrs[NR_CPUS];
- void *blkp;
-};
-
/*
* Use this to get to a cpu's version of the per-cpu object allocated using
* alloc_percpu. If you want to get "this cpu's version", maybe you want
* to use get_cpu_ptr...
*/
-#define per_cpu_ptr(ptr, cpu) \
-({ \
- struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
- (__typeof__(ptr))__p->ptrs[(cpu)]; \
-})
+#define per_cpu_ptr(ptr, cpu) (RELOC_HIDE(ptr, __per_cpu_offset[cpu]))

extern void *__alloc_percpu(size_t size, size_t align);
extern void free_percpu(const void *);
extern void kmalloc_percpu_init(void);

-#else /* CONFIG_SMP */
-
-#define per_cpu_ptr(ptr, cpu) (ptr)
+extern void setup_per_cpu_areas(void);
+#else /* ... !CONFIG_SMP */

static inline void *__alloc_percpu(size_t size, size_t align)
{
@@ -50,7 +43,6 @@ static inline void free_percpu(const voi
{
kfree(ptr);
}
-static inline void kmalloc_percpu_init(void) { }

/* Can't define per-cpu variables in modules. Sorry --RR */
#ifndef MODULE
@@ -60,7 +52,10 @@ static inline void kmalloc_percpu_init(v

#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu)
#define __get_cpu_var(var) var##__per_cpu
+#define per_cpu_ptr(ptr, cpu) ((void)cpu, (ptr))
+#define __get_cpu_ptr(ptr) (ptr)

+static inline void setup_per_cpu_areas(void) { }
#endif /* CONFIG_SMP */

/* Simple wrapper for the common case: zeros memory. */
@@ -86,8 +81,8 @@ static inline void kmalloc_percpu_init(v
* then considered to be safe. And ofcourse, "Thou shalt not sleep between
* get_cpu_ptr and put_cpu_ptr"
*/
-#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
-#define put_cpu_ptr(ptr) put_cpu()
+#define get_cpu_ptr(ptr) ({ preempt_disable(); __get_cpu_ptr(ptr); })
+#define put_cpu_ptr(ptr) preempt_enable()

#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/init/main.c .31630-linux-2.5.69-bk13.updated/init/main.c
--- .31630-linux-2.5.69-bk13/init/main.c 2003-05-19 10:53:51.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/init/main.c 2003-05-20 12:57:19.000000000 +1000
@@ -306,30 +306,6 @@ static inline void smp_prepare_cpus(unsi

#else

-#ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
-
-static void __init setup_per_cpu_areas(void)
-{
- unsigned long size, i;
- char *ptr;
- /* Created by linker magic */
- extern char __per_cpu_start[], __per_cpu_end[];
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
- if (!size)
- return;
-
- ptr = alloc_bootmem(size * NR_CPUS);
-
- for (i = 0; i < NR_CPUS; i++, ptr += size) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, size);
- }
-}
-#endif /* !__GENERIC_PER_CPU */
-
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/mm/Makefile .31630-linux-2.5.69-bk13.updated/mm/Makefile
--- .31630-linux-2.5.69-bk13/mm/Makefile 2003-02-11 14:26:20.000000000 +1100
+++ .31630-linux-2.5.69-bk13.updated/mm/Makefile 2003-05-20 12:57:20.000000000 +1000
@@ -12,3 +12,4 @@ obj-y := bootmem.o filemap.o mempool.o
slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)

obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SMP) += percpu.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/mm/percpu.c .31630-linux-2.5.69-bk13.updated/mm/percpu.c
--- .31630-linux-2.5.69-bk13/mm/percpu.c 1970-01-01 10:00:00.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/mm/percpu.c 2003-05-20 12:57:20.000000000 +1000
@@ -0,0 +1,286 @@
+/*
+ * Dynamic per-cpu allocation.
+ * Originally by Dipankar Sarma <[email protected]>
+ * This version (C) 2003 Rusty Russell, IBM Corporation.
+ */
+
+/* Simple linked list allocator: we don't stress it hard, but do want
+ it space-efficient. We keep the bookkeeping separately. */
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/bootmem.h>
+#include <asm/semaphore.h>
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+static DECLARE_MUTEX(pcpu_lock);
+static LIST_HEAD(pcpu_blocks);
+static void *percpu_base;
+unsigned long percpu_block_size;
+
+#ifdef __NEED_PERCPU_ALLOC
+static inline void *new_percpumem(size_t size)
+{
+ return vmalloc(size * NR_CPUS);
+}
+
+static inline void free_percpumem(void *ptr, size_t size)
+{
+ vfree(ptr);
+}
+
+static inline void *initial_percpumem(unsigned long *size)
+{
+ void *ptr;
+ unsigned int i;
+
+ ptr = alloc_bootmem(*size * NR_CPUS);
+ for (i = 0; i < NR_CPUS; i++) {
+ __per_cpu_offset[i] = ptr + *size*i - (void *)__per_cpu_start;
+ memcpy(ptr + *size*i, __per_cpu_start,
+ __per_cpu_end - __per_cpu_start);
+ }
+
+ return ptr;
+}
+#endif
+
+struct pcpu_block
+{
+ struct list_head list;
+
+ /* Pointer to actual allocated memory. */
+ void *base_ptr;
+
+ /* Number of blocks used and allocated. */
+ unsigned short num_used, num_allocated;
+
+ /* Size of each block. -ve means used. */
+ int size[0];
+};
+
+static struct pcpu_block *split_block(struct pcpu_block *b,
+ unsigned int i,
+ unsigned short size)
+{
+ /* Reallocation required? */
+ if (b->num_used + 1 > b->num_allocated) {
+ struct pcpu_block *new;
+
+ new = kmalloc(sizeof(*b)
+ + sizeof(b->size[0]) * b->num_allocated*2,
+ GFP_KERNEL);
+ if (!new)
+ return NULL;
+ new->base_ptr = b->base_ptr;
+ new->num_used = b->num_used;
+ new->num_allocated = b->num_allocated * 2;
+ memcpy(new->size, b->size, sizeof(b->size[0])*b->num_used);
+ list_del(&b->list);
+ list_add(&new->list, &pcpu_blocks);
+ kfree(b);
+ b = new;
+ }
+
+ /* Insert a new subblock */
+ memmove(&b->size[i+1], &b->size[i],
+ sizeof(b->size[0]) * (b->num_used - i));
+ b->num_used++;
+
+ b->size[i+1] -= size;
+ b->size[i] = size;
+ return b;
+}
+
+#define PERCPU_INIT_BLOCKS 4
+
+static int new_block(void)
+{
+ struct pcpu_block *b;
+
+ b = kmalloc(sizeof(*b) + PERCPU_INIT_BLOCKS*sizeof(int), GFP_KERNEL);
+ if (!b)
+ return 0;
+
+ b->base_ptr = new_percpumem(percpu_block_size);
+ if (!b->base_ptr) {
+ kfree(b);
+ return 0;
+ }
+
+ b->num_allocated = PERCPU_INIT_BLOCKS;
+ b->num_used = 1;
+ b->size[0] = percpu_block_size;
+
+ list_add(&b->list, &pcpu_blocks);
+ return 1;
+}
+
+static inline unsigned int abs(int val)
+{
+ if (val < 0)
+ return -val;
+ return val;
+}
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+ struct pcpu_block *b;
+ unsigned long extra;
+ void *ret;
+
+ BUG_ON(align > SMP_CACHE_BYTES);
+ BUG_ON(size > PERCPU_MAX);
+ BUG_ON(!percpu_block_size);
+
+ down(&pcpu_lock);
+ again:
+ list_for_each_entry(b, &pcpu_blocks, list) {
+ unsigned int i;
+ void *ptr = b->base_ptr;
+
+ for (i = 0; i < b->num_used; ptr += abs(b->size[i]), i++) {
+ /* Extra for alignment requirement. */
+ extra = ALIGN((unsigned long)ptr, align)
+ - (unsigned long)ptr;
+ BUG_ON(i == 0 && extra != 0);
+
+ if (b->size[i] < 0 || b->size[i] < extra + size)
+ continue;
+
+ /* Transfer extra to previous block. */
+ if (b->size[i-1] < 0)
+ b->size[i-1] -= extra;
+ else
+ b->size[i-1] += extra;
+ b->size[i] -= extra;
+ ptr += extra;
+
+ /* Split block if warranted */
+ if (b->size[i] - size > sizeof(unsigned long)) {
+ struct pcpu_block *realloc;
+ realloc = split_block(b, i, size);
+ if (!realloc)
+ continue;
+ b = realloc;
+ }
+
+ /* Mark allocated */
+ b->size[i] = -b->size[i];
+ /* Pointer will be offset by this: compensate. */
+ ret = RELOC_HIDE(ptr, -(percpu_base
+ - (void *)__per_cpu_start));
+ for (i = 0; i < NR_CPUS; i++)
+ if (cpu_possible(i))
+ memset(per_cpu_ptr(ret, i), 0, size);
+ goto done;
+ }
+ }
+
+ if (new_block())
+ goto again;
+
+ ret = NULL;
+ done:
+ up(&pcpu_lock);
+ return ret;
+}
+
+static void free_block(struct pcpu_block *b, unsigned int i)
+{
+ /* Block should be used. */
+ BUG_ON(b->size[i] >= 0);
+ b->size[i] = -b->size[i];
+
+ /* Merge with previous? */
+ if (i > 0 && b->size[i-1] >= 0) {
+ b->size[i-1] += b->size[i];
+ b->num_used--;
+ memmove(&b->size[i], &b->size[i+1],
+ (b->num_used - i) * sizeof(b->size[0]));
+ i--;
+ }
+ /* Merge with next? */
+ if (i+1 < b->num_used && b->size[i+1] >= 0) {
+ b->size[i] += b->size[i+1];
+ b->num_used--;
+ memmove(&b->size[i+1], &b->size[i+2],
+ (b->num_used - (i+1)) * sizeof(b->size[0]));
+ }
+ /* Empty? */
+ if (b->num_used == 1) {
+ list_del(&b->list);
+ free_percpumem(b->base_ptr, percpu_block_size);
+ kfree(b);
+ }
+}
+
+void free_percpu(const void *freeme)
+{
+ struct pcpu_block *b;
+
+ /* Pointer will be offset by this amount: compensate. */
+ freeme = RELOC_HIDE(freeme, percpu_base - (void *)__per_cpu_start);
+ down(&pcpu_lock);
+ list_for_each_entry(b, &pcpu_blocks, list) {
+ unsigned int i;
+ void *ptr = b->base_ptr;
+
+ for (i = 0; i < b->num_used; ptr += abs(b->size[i]), i++) {
+ if (ptr == freeme) {
+ free_block(b, i);
+ up(&pcpu_lock);
+ return;
+ }
+ }
+ }
+ up(&pcpu_lock);
+ BUG();
+}
+
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ /* Idempotent (some archs call this earlier). */
+ if (percpu_block_size)
+ return;
+
+ /* Copy section for each CPU (we discard the original) */
+ percpu_block_size = ALIGN(__per_cpu_end - __per_cpu_start,
+ SMP_CACHE_BYTES);
+ /* We guarantee at least this much. */
+ if (percpu_block_size < PERCPU_MAX)
+ percpu_block_size = PERCPU_MAX;
+
+ percpu_base = initial_percpumem(&percpu_block_size);
+}
+
+static int init_alloc_percpu(void)
+{
+ struct pcpu_block *pcpu;
+
+ printk("Per-cpu data: %Zu of %lu bytes at %p\n",
+ __per_cpu_end - __per_cpu_start, percpu_block_size,
+ percpu_base);
+
+ pcpu = kmalloc(sizeof(*pcpu)+sizeof(pcpu->size[0])*PERCPU_INIT_BLOCKS,
+ GFP_KERNEL);
+ pcpu->num_allocated = PERCPU_INIT_BLOCKS;
+ pcpu->num_used = 2; /* Static block, and free block */
+ pcpu->size[0] = -(__per_cpu_end - __per_cpu_start);
+ pcpu->size[1] = percpu_block_size-(__per_cpu_end - __per_cpu_start);
+ pcpu->base_ptr = percpu_base;
+ list_add(&pcpu->list, &pcpu_blocks);
+
+ return 0;
+}
+core_initcall(init_alloc_percpu);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .31630-linux-2.5.69-bk13/mm/slab.c .31630-linux-2.5.69-bk13.updated/mm/slab.c
--- .31630-linux-2.5.69-bk13/mm/slab.c 2003-05-20 12:57:17.000000000 +1000
+++ .31630-linux-2.5.69-bk13.updated/mm/slab.c 2003-05-20 12:57:20.000000000 +1000
@@ -1937,47 +1937,6 @@ void * kmalloc (size_t size, int flags)
return NULL;
}

-#ifdef CONFIG_SMP
-/**
- * __alloc_percpu - allocate one copy of the object for every present
- * cpu in the system, zeroing them.
- * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
- * macros only.
- *
- * @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
- */
-void *__alloc_percpu(size_t size, size_t align)
-{
- int i;
- struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
-
- if (!pdata)
- return NULL;
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
- if (!pdata->ptrs[i])
- goto unwind_oom;
- memset(pdata->ptrs[i], 0, size);
- }
-
- /* Catch derefs w/o wrappers */
- return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
- while (--i >= 0) {
- if (!cpu_possible(i))
- continue;
- kfree(pdata->ptrs[i]);
- }
- kfree(pdata);
- return NULL;
-}
-#endif
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
@@ -2016,28 +1975,6 @@ void kfree (const void *objp)
local_irq_restore(flags);
}

-#ifdef CONFIG_SMP
-/**
- * free_percpu - free previously allocated percpu memory
- * @objp: pointer returned by alloc_percpu.
- *
- * Don't free memory not originally allocated by alloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-free_percpu(const void *objp)
-{
- int i;
- struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- kfree(p->ptrs[i]);
- }
-}
-#endif
-
unsigned int kmem_cache_size(kmem_cache_t *cachep)
{
unsigned int objlen = cachep->objsize;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .28912-2.5.69-bk13-kmalloc_percpu-full.pre/include/asm-ia64/percpu.h .28912-2.5.69-bk13-kmalloc_percpu-full/include/asm-ia64/percpu.h
--- .28912-2.5.69-bk13-kmalloc_percpu-full.pre/include/asm-ia64/percpu.h 2003-05-20 12:49:53.000000000 +1000
+++ .28912-2.5.69-bk13-kmalloc_percpu-full/include/asm-ia64/percpu.h 2003-05-20 12:49:54.000000000 +1000
@@ -24,9 +24,12 @@
#endif

#define __get_cpu_var(var) (var##__per_cpu)
-#endif /* CONFIG_SMP */
+#define __get_cpu_ptr(ptr) (ptr)

-extern void setup_per_cpu_areas (void);
+void *initial_percpumem(unsigned long *size);
+void *new_percpumem(size_t size);
+void free_percpumem(void *ptr, size_t size);
+#endif /* CONFIG_SMP */

#endif /* !__ASSEMBLY__ */

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .28912-2.5.69-bk13-kmalloc_percpu-full.pre/arch/ia64/kernel/setup.c .28912-2.5.69-bk13-kmalloc_percpu-full/arch/ia64/kernel/setup.c
--- .28912-2.5.69-bk13-kmalloc_percpu-full.pre/arch/ia64/kernel/setup.c 2003-05-19 10:53:40.000000000 +1000
+++ .28912-2.5.69-bk13-kmalloc_percpu-full/arch/ia64/kernel/setup.c 2003-05-20 12:49:54.000000000 +1000
@@ -49,10 +49,6 @@

extern char _end;

-#ifdef CONFIG_SMP
-unsigned long __per_cpu_offset[NR_CPUS];
-#endif
-
DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
unsigned long ia64_cycles_per_usec;
@@ -627,12 +623,6 @@ identify_cpu (struct cpuinfo_ia64 *c)
c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
}

-void
-setup_per_cpu_areas (void)
-{
- /* start_kernel() requires this... */
-}
-
static void
get_max_cacheline_size (void)
{
@@ -667,6 +657,44 @@ get_max_cacheline_size (void)
ia64_max_cacheline_size = max;
}

+#ifdef CONFIG_SMP
+/* FIXME: Implement. This would need to allocate size * NR_CPUS, and
+ set up a different mapping on each CPU at that address minus
+ __per_cpu_offset[0]. Probably need to reserve address space to
+ ensure we can do that. --RR */
+static inline void *new_percpumem(size_t size)
+{
+ static int warned = 0;
+
+ if (!warned++)
+ printk(KERN_WARNING "Out of per-cpu address space\n");
+ return NULL;
+}
+static inline void free_percpumem(void *ptr, size_t size)
+{
+ BUG();
+}
+
+static inline void *initial_percpumem(unsigned long *size)
+{
+ void *ptr;
+ unsigned int i;
+
+ BUG_ON(*size > PERCPU_PAGE_SIZE);
+ *size = PERCPU_PAGE_SIZE;
+
+ ptr = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ for (i = 0; i < NR_CPUS; i++) {
+ __per_cpu_offset[i] = ptr + *size*i - (void *)__per_cpu_start;
+ memcpy(ptr + *size*i, __per_cpu_start,
+ __per_cpu_end - __per_cpu_start);
+ }
+
+ return ptr;
+}
+#endif /* CONFIG_SMP */
+
/*
* cpu_init() initializes state that is per-CPU. This function acts
* as a 'CPU state barrier', nothing should get across.
@@ -690,15 +718,8 @@ cpu_init (void)
* get_free_pages() cannot be used before cpu_init() done. BSP allocates
* "NR_CPUS" pages for all CPUs to avoid that AP calls get_zeroed_page().
*/
- if (smp_processor_id() == 0) {
- cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
- __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
- cpu_data += PERCPU_PAGE_SIZE;
- }
- }
+ if (smp_processor_id() == 0)
+ setup_per_cpu_areas();
cpu_data = __per_cpu_start + __per_cpu_offset[smp_processor_id()];
#else /* !CONFIG_SMP */
cpu_data = __phys_per_cpu_start;
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


2003-05-21 10:16:13

by Dipankar Sarma

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

On Tue, May 20, 2003 at 02:32:37PM +1000, Rusty Russell wrote:
> This requires the first three patches. IA64 untested. See comment.
>
> Name: Dynamic per-cpu allocation using static per-cpu mechanism
> Author: Rusty Russell
> Status: Tested on 2.5.69-bk13
> Depends: Misc/kmalloc_percpu-interface.patch.gz
>
> D: This patch replaces the dynamic per-cpu allocator, alloc_percpu,
> D: to make it use the same mechanism as the static per-cpu variables, ie.
> D: ptr + __per_cpu_offset[smp_processor_id()] gives the variable address.
> D: This allows it to be used in modules (following patch), and hopefully
> D: increases space and time efficiency of reference at the same time.
> D: It gets moved to its own (SMP-only) file: mm/percpu.c.

We will do some measurements with this but based on a large number
of measurements that Kiran had done earlier, we can see a couple of things -

1. Even though a percpu scheme using pointer arithmetic has one less memory
reference, the globally shared offset table is often in the cache,
and therefore pointer arithmetic offers no added advantage.

2. Increased sharing of cache lines helps by reducing associativity misses.
We see this by comparing an interlaced allocator where only same-sized
objects share blocks vs. the current static allocator. Sharing of
blocks by differently sized objects also allows cache lines to be
kept warm as more subsystems in the kernel access them.

Considering these results, this allocator seems to be a step in the right
direction.

Thanks
Dipankar

2003-05-22 00:34:28

by Rusty Russell (IBM)

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

In message <[email protected]> you write:
> Considering these results, this allocator seems to be a step in the right
> direction.

I had a great chat with David M-T: I'm backing off this until we have
a IA64 code which keeps its current advantages.

If you can find someone to work on the IA64 specific stuff, I think
everyone will be happy.

Cheers!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2003-05-22 07:51:48

by Ravikiran G Thirumalai

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

On Wed, May 21, 2003 at 04:01:56PM +0530, Dipankar Sarma wrote:
>...
> We will do some measurements with this but based on a large number
> of measurements that Kiran had done earlier, we can see a couple of things -
>
> 1. Even though a percpu scheme using pointer arithmetic has one less memory
> reference, the globally shared offset table is often in the cache,
> and therefore pointer arithmetic offers no added advantage.
>
> 2. Increased sharing of cache lines helps by reducing associativity misses.
> We see this by comparing an interlaced allocator where only same-sized
> objects share blocks vs. the current static allocator. Sharing of
> blocks by differently sized objects also allows cache lines to be
> kept warm as more subsystems in the kernel access them.
>

Here is a summary of my experiments with different per-cpu allocator methods.

The following methods were compared
1. Static per-cpu areas
2. kmalloc_percpu with NR_CPUS pointers and one extra dereference -- the
current implementation (no interlace) (kmalloc_percpu_current)
3. kmalloc_percpu with pointer arithmetic, but no interlace
(kmalloc_percpu_new)
4. alloc_percpu using Rusty's block allocator and the shared offset table
(alloc_percpu_block)

The application used was speeding up vm_enough_memory using per-cpu counters
and reducing atomic operations. The benchmark used was kernbench. Profile
ticks on vm_enough_memory were used to compare allocator methods
(vm_acct_memory was made inline). This was on a 4-processor PIII Xeon.

To summarise,
1. Static per-cpu areas were 6.5% better than kmalloc_percpu_current.
2. kmalloc_percpu_new and static per-cpu areas had similar results.
3. alloc_percpu results were similar to static per-cpu areas and
kmalloc_percpu_new.
4. Extra dereferences in alloc_percpu were not significant, but alloc_percpu
was interlaced and kmalloc_percpu_new wasn't. The insn profile seemed to
indicate that the extra cost of the memory dereference in alloc_percpu was
offset by the interlacing (objects sharing the same cache line),
but then insn profiles are only indicative, not accurate.

Todo:
I have to see how an interlaced kmalloc_percpu with pointer arithmetic
fares in these tests (once I have it working), and then the performance
picture for the percpu allocators will hopefully be clear.

Thanks,
Kiran

2003-05-22 08:27:22

by Rusty Russell (IBM)

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

In message <[email protected]> you write:
> 4. Extra dereferences in alloc_percpu were not significant, but alloc_percpu
> was interlaced and kmalloc_percpu_new wasn't. The insn profile seemed to
> indicate that the extra cost of the memory dereference in alloc_percpu was
> offset by the interlacing (objects sharing the same cache line),
> but then insn profiles are only indicative, not accurate.

Interesting: personally I consider the cacheline sharing a feature,
and unless you've done something special, the static declaration
should be interlaced too, no?

If you don't want interlacing, you should make your type
____cacheline_aligned for alloc_percpu, or use

__alloc_percpu(ALIGN(sizeof(x), SMP_CACHE_BYTES), SMP_CACHE_BYTES)
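
Spelled out, the two non-interlaced variants look roughly like the sketch
below; the type and function names are invented for the example:

#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/percpu.h>

/* Option 1: pad the type itself so each per-cpu copy owns whole
 * cache lines, then allocate it with its natural alignment. */
struct foo_stats {
	unsigned long packets;
	unsigned long bytes;
} ____cacheline_aligned;

/* Option 2: leave the type alone and round the allocation up to a
 * cache line, with cache-line alignment, at the call site. */
static unsigned long *foo_val;

static int foo_alloc(void)
{
	foo_val = __alloc_percpu(ALIGN(sizeof(unsigned long),
				       SMP_CACHE_BYTES),
				 SMP_CACHE_BYTES);
	return foo_val ? 0 : -ENOMEM;
}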

Aside: if kmalloc_percpu uses the per-cpu offset too, it probably
makes sense to make the per-cpu offset a first-class citizen, and
derive smp_processor_id from it, rather than the other way around as at
the moment. This would offer a further speedup by removing a level of
indirection.

If you're interested I can probably produce such a patch for x86...

Thanks for the results!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2003-05-22 10:27:10

by Ravikiran G Thirumalai

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

On Thu, May 22, 2003 at 06:36:31PM +1000, Rusty Russell wrote:
> In message <[email protected]> you write:
> > 4. Extra dereferences in alloc_percpu were not significant, but alloc_percpu
> > was interlaced and kmalloc_percpu_new wasn't. The insn profile seemed to
> > indicate that the extra cost of the memory dereference in alloc_percpu was
> > offset by the interlacing (objects sharing the same cache line),
> > but then insn profiles are only indicative, not accurate.
>
> Interesting: personally I consider the cacheline sharing a feature,
> and unless you've done something special, the static declaration
> should be interlaced too, no?

Yes, the static declaration was interlaced too. What I meant to say is that
the cacheline-sharing feature helped alloc_percpu/static percpu compensate
for the small extra memory-reference cost of getting __per_cpu_offset[]
when you compare with kmalloc_percpu_new.

>...
> Aside: if kmalloc_percpu uses the per-cpu offset too, it probably
> makes sense to make the per-cpu offset a first-class citizen, and
> derive smp_processor_id from it, rather than the other way around as at
> the moment. This would offer a further speedup by removing a level of
> indirection.
>
> If you're interested I can probably produce such a patch for x86...

Sure, it might help per-cpu data but will it cause performance
regression elsewhere? (other users of smp_processor_id). I can run it
through the same tests and find out. Maybe it'll make good paper material
for later? ;)


Thanks,
Kiran

2003-05-23 01:33:09

by Rusty Russell (IBM)

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

In message <[email protected]> you write:
> On Thu, May 22, 2003 at 06:36:31PM +1000, Rusty Russell wrote:
> > Interesting: personally I consider the cacheline sharing a feature,
> > and unless you've done something special, the static declaration
> > should be interlaced too, no?
>
> Yes, the static declaration was interlaced too. What I meant to say is that
> the cacheline-sharing feature helped alloc_percpu/static percpu compensate
> for the small extra memory-reference cost of getting __per_cpu_offset[]
> when you compare with kmalloc_percpu_new.

Ah, thanks, that clarifies. Sorry for my misread.

> > Aside: if kmalloc_percpu uses the per-cpu offset too, it probably
> > makes sense to make the per-cpu offset a first-class citizen, and
> > derive smp_processor_id from it, rather than the other way around as at
> > the moment. This would offer a further speedup by removing a level of
> > indirection.
> >
> > If you're interested I can probably produce such a patch for x86...
>
> Sure, it might help per-cpu data but will it cause performance
> regression elsewhere? (other users of smp_processor_id).

AFAICT, all the time-critical smp_processor_id() uses are basically
for indexing into a per-cpu data array. Even things like module.h and
percpu_counter.h would benefit from replacing those huge
inside-structure [NR_CPUS] arrays with a dynamic allocation.
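
The kind of conversion being suggested looks roughly like the sketch below;
the struct is invented for the example (percpu_counter and module.h are only
named above as candidates):

#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>

/* Before: every instance carries NR_CPUS slots, whether those CPUs
 * exist or not, bloating the enclosing structure. */
struct foo_counter_old {
	spinlock_t lock;
	long count;
	long cpu_count[NR_CPUS] ____cacheline_aligned;
};

/* After: a single pointer, with the per-cpu slots handed out by the
 * dynamic allocator and reached through the shared offset table. */
struct foo_counter_new {
	spinlock_t lock;
	long count;
	long *cpu_count;	/* from __alloc_percpu(sizeof(long), ...) */
};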

> I can run it through the same tests and find out. Maybe it'll make
> good paper material for later? ;)

I'll try to find time today or early next week.

Thanks!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2003-05-23 07:14:28

by Rusty Russell (IBM)

Subject: Re: [PATCH 4/3] Replace dynamic percpu implementation

In message <[email protected]> you write:
> On Thu, May 22, 2003 at 06:36:31PM +1000, Rusty Russell wrote:
> > If you're interested I can probably produce such a patch for x86...
>
> Sure, it might help per-cpu data but will it cause performance
> regression elsewhere? (other users of smp_processor_id). I can run it
> through the same tests and find out. Maybe it'll make good paper material
> for later? ;)

OK, here's an x86-specific patch. It boots for me. I'm mainly
interested in the question of whether it increases static percpu
speed.

Thanks!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Put __per_cpu_offset in the thread struct, remove cpu
Author: Rusty Russell
Status: Tested on 2.5.69-bk15, dual x86

D: If we had an efficient kmalloc_percpu equivalent, and moved more structures
D: across to it (or to DECLARE_PER_CPU), it would make more sense to derive
D: smp_processor_id() from the per-cpu offset, rather than the other way
D: around.
D:
D: This patch is an x86-only hack to do just that, for benchmarking.
D: It introduces a new header, asm/task_cpu.h, because I couldn't
D: resolve the horrible header tangle any other way.

diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/arch/i386/kernel/i386_ksyms.c working-2.5.69-bk15-offset-uber-alles/arch/i386/kernel/i386_ksyms.c
--- linux-2.5.69-bk15/arch/i386/kernel/i386_ksyms.c 2003-05-22 10:48:33.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/arch/i386/kernel/i386_ksyms.c 2003-05-23 16:13:53.000000000 +1000
@@ -152,6 +152,7 @@ EXPORT_SYMBOL(cpu_online_map);
EXPORT_SYMBOL(cpu_callout_map);
EXPORT_SYMBOL_NOVERS(__write_lock_failed);
EXPORT_SYMBOL_NOVERS(__read_lock_failed);
+EXPORT_PER_CPU_SYMBOL(__processor_id);

/* Global SMP stuff */
EXPORT_SYMBOL(synchronize_irq);
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/arch/i386/kernel/smpboot.c working-2.5.69-bk15-offset-uber-alles/arch/i386/kernel/smpboot.c
--- linux-2.5.69-bk15/arch/i386/kernel/smpboot.c 2003-05-22 10:48:33.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/arch/i386/kernel/smpboot.c 2003-05-23 16:12:27.000000000 +1000
@@ -49,6 +49,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
+#include <asm/task_cpu.h>

#include <mach_apic.h>
#include <mach_wakecpu.h>
@@ -947,7 +948,6 @@ static void __init smp_boot_cpus(unsigne

boot_cpu_logical_apicid = logical_smp_processor_id();

- current_thread_info()->cpu = 0;
smp_tune_scheduling();

/*
@@ -1129,6 +1130,38 @@ void __devinit smp_prepare_boot_cpu(void
set_bit(smp_processor_id(), &cpu_callout_map);
}

+DEFINE_PER_CPU(u32, __processor_id);
+unsigned long __per_cpu_offset[NR_CPUS];
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long size, i;
+ char *ptr;
+ /* Created by linker magic */
+ extern char __per_cpu_start[], __per_cpu_end[];
+
+ /* Copy section for each CPU (we discard the original) */
+ size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+ if (!size)
+ return;
+
+ ptr = alloc_bootmem(size * NR_CPUS);
+
+ for (i = 0; i < NR_CPUS; i++, ptr += size) {
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, size);
+ }
+
+ /* Now, setup per-cpu stuff so smp_processor_id() will work when
+ * we boot other CPUs */
+ for (i = 0; i < NR_CPUS; i++)
+ per_cpu(__processor_id, i) = i;
+
+ /* Our pcpuoff points into the original .data.percpu section:
+ that will vanish, so fixup now. */
+ set_task_cpu(current, smp_processor_id());
+}
+
int __devinit __cpu_up(unsigned int cpu)
{
/* This only works at boot for x86. See "rewrite" above. */
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/fs/proc/array.c working-2.5.69-bk15-offset-uber-alles/fs/proc/array.c
--- linux-2.5.69-bk15/fs/proc/array.c 2003-05-05 12:37:09.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/fs/proc/array.c 2003-05-23 16:00:42.000000000 +1000
@@ -78,6 +78,7 @@
#include <asm/pgtable.h>
#include <asm/io.h>
#include <asm/processor.h>
+#include <asm/task_cpu.h>

/* Gcc optimizes away "strlen(x)" for constant x */
#define ADDBUF(buffer, string) \
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/include/asm-i386/percpu.h working-2.5.69-bk15-offset-uber-alles/include/asm-i386/percpu.h
--- linux-2.5.69-bk15/include/asm-i386/percpu.h 2003-01-02 12:00:21.000000000 +1100
+++ working-2.5.69-bk15-offset-uber-alles/include/asm-i386/percpu.h 2003-05-23 16:05:22.000000000 +1000
@@ -1,6 +1,40 @@
#ifndef __ARCH_I386_PERCPU__
#define __ARCH_I386_PERCPU__

-#include <asm-generic/percpu.h>
+#include <linux/compiler.h>
+
+#ifdef CONFIG_SMP
+
+extern void setup_per_cpu_areas(void);
+extern unsigned long __per_cpu_offset[NR_CPUS];
+
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
+
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define __get_cpu_var(var) \
+ (*RELOC_HIDE(&var##__per_cpu, current_thread_info()->pcpuoff))
+
+#else /* ! SMP */
+
+/* Can't define per-cpu variables in modules. Sorry --RR */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __typeof__(type) name##__per_cpu
+#endif
+
+#define per_cpu(var, cpu) ((void)cpu, var##__per_cpu)
+#define __get_cpu_var(var) var##__per_cpu
+
+#endif /* SMP */
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)

#endif /* __ARCH_I386_PERCPU__ */
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/include/asm-i386/smp.h working-2.5.69-bk15-offset-uber-alles/include/asm-i386/smp.h
--- linux-2.5.69-bk15/include/asm-i386/smp.h 2003-05-22 10:49:07.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/include/asm-i386/smp.h 2003-05-23 16:02:53.000000000 +1000
@@ -8,6 +8,7 @@
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/threads.h>
+#include <asm/percpu.h>
#endif

#ifdef CONFIG_X86_LOCAL_APIC
@@ -53,7 +54,8 @@ extern void zap_low_mappings (void);
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define smp_processor_id() (current_thread_info()->cpu)
+DECLARE_PER_CPU(u32, __processor_id);
+#define smp_processor_id() __get_cpu_var(__processor_id)

extern volatile unsigned long cpu_callout_map;

diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/include/asm-i386/task_cpu.h working-2.5.69-bk15-offset-uber-alles/include/asm-i386/task_cpu.h
--- linux-2.5.69-bk15/include/asm-i386/task_cpu.h 1970-01-01 10:00:00.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/include/asm-i386/task_cpu.h 2003-05-23 16:02:41.000000000 +1000
@@ -0,0 +1,17 @@
+#ifndef _ASM_I386_TASK_CPU_H
+#define _ASM_I386_TASK_CPU_H
+#include <linux/percpu.h>
+#include <linux/sched.h>
+
+static inline unsigned int task_cpu(struct task_struct *p)
+{
+ return (*RELOC_HIDE(&__processor_id__per_cpu,
+ p->thread_info->pcpuoff));
+}
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ /* CPU is derived. We need to set the per-cpu offset. */
+ p->thread_info->pcpuoff = __per_cpu_offset[cpu];
+}
+#endif
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/include/asm-i386/thread_info.h working-2.5.69-bk15-offset-uber-alles/include/asm-i386/thread_info.h
--- linux-2.5.69-bk15/include/asm-i386/thread_info.h 2003-03-18 12:21:39.000000000 +1100
+++ working-2.5.69-bk15-offset-uber-alles/include/asm-i386/thread_info.h 2003-05-23 15:54:14.000000000 +1000
@@ -26,7 +26,7 @@ struct thread_info {
struct exec_domain *exec_domain; /* execution domain */
unsigned long flags; /* low level flags */
unsigned long status; /* thread-synchronous flags */
- __u32 cpu; /* current CPU */
+ unsigned long pcpuoff; /* per-cpu offset */
__s32 preempt_count; /* 0 => preemptable, <0 => BUG */

mm_segment_t addr_limit; /* thread address space:
@@ -45,7 +45,7 @@ struct thread_info {
#define TI_EXEC_DOMAIN 0x00000004
#define TI_FLAGS 0x00000008
#define TI_STATUS 0x0000000C
-#define TI_CPU 0x00000010
+#define TI_PCPUOFF 0x00000010
#define TI_PRE_COUNT 0x00000014
#define TI_ADDR_LIMIT 0x00000018
#define TI_RESTART_BLOCK 0x000001C
@@ -66,7 +66,7 @@ struct thread_info {
.task = &tsk, \
.exec_domain = &default_exec_domain, \
.flags = 0, \
- .cpu = 0, \
+ .pcpuoff = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
.restart_block = { \
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/include/linux/sched.h working-2.5.69-bk15-offset-uber-alles/include/linux/sched.h
--- linux-2.5.69-bk15/include/linux/sched.h 2003-05-22 10:49:18.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/include/linux/sched.h 2003-05-23 15:38:53.000000000 +1000
@@ -813,20 +813,7 @@ extern void signal_wake_up(struct task_s
/*
* Wrappers for p->thread_info->cpu access. No-op on UP.
*/
-#ifdef CONFIG_SMP
-
-static inline unsigned int task_cpu(struct task_struct *p)
-{
- return p->thread_info->cpu;
-}
-
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
- p->thread_info->cpu = cpu;
-}
-
-#else
-
+#ifndef CONFIG_SMP
static inline unsigned int task_cpu(struct task_struct *p)
{
return 0;
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.69-bk15/kernel/sched.c working-2.5.69-bk15-offset-uber-alles/kernel/sched.c
--- linux-2.5.69-bk15/kernel/sched.c 2003-05-22 10:49:22.000000000 +1000
+++ working-2.5.69-bk15-offset-uber-alles/kernel/sched.c 2003-05-23 16:03:55.000000000 +1000
@@ -32,6 +32,7 @@
#include <linux/delay.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
+#include <asm/task_cpu.h>

#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
@@ -1312,7 +1313,7 @@ pick_next_task:
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
- RCU_qsctr(prev->thread_info->cpu)++;
+ RCU_qsctr(task_cpu(prev))++;

if (likely(prev != next)) {
rq->nr_switches++;