2002-01-29 09:01:01

by Rusty Russell

Subject: [PATCH] per-cpu areas for 2.5.3-pre6

This patch introduces the __per_cpu_data tag for data, and the
per_cpu() & this_cpu() macros to go with it.

This allows us to get rid of all those special case structures
springing up all over the place for CPU-local data.
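
A minimal usage sketch of the interface this patch adds (the variable
name is invented for illustration; only __per_cpu_data, per_cpu() and
this_cpu() come from the patch itself):

	/* declare a CPU-local variable; the linker script below
	   collects all such variables into the .data.percpu section */
	static int hypothetical_counter __per_cpu_data;

	/* bump this CPU's copy: this_cpu(var) is just
	   per_cpu(var, smp_processor_id()) */
	this_cpu(hypothetical_counter)++;

	/* or read some other CPU's copy */
	int n = per_cpu(hypothetical_counter, cpu);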

Thanks!
Rusty.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.3-pre6/include/linux/smp.h working-2.5.3-pre6-percpu/include/linux/smp.h
--- linux-2.5.3-pre6/include/linux/smp.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu/include/linux/smp.h Wed Jan 30 04:17:59 2002
@@ -71,7 +71,17 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

-#else
+#define __per_cpu_data __attribute__((section(".data.percpu")))
+
+#ifndef __HAVE_ARCH_CPU_OFFSET
+#define per_cpu_offset(cpu) (__per_cpu_offset[(cpu)])
+#endif
+
+#define per_cpu(var, cpu) \
+(*((__typeof__(&var))((void *)&var + per_cpu_offset(cpu))))
+
+extern unsigned long __per_cpu_offset[NR_CPUS];
+#else /* !SMP */

/*
* These macros fold the SMP functionality into a single CPU system
@@ -88,6 +98,10 @@
#define cpu_online_map 1
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define __per_cpu_data
+#define per_cpu(var, cpu) var

#endif
+
+#define this_cpu(var) per_cpu(var,smp_processor_id())
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.3-pre6/arch/i386/vmlinux.lds working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds
--- linux-2.5.3-pre6/arch/i386/vmlinux.lds Tue Jan 29 09:16:52 2002
+++ working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds Wed Jan 30 04:00:34 2002
@@ -58,6 +58,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.3-pre6/arch/ppc/vmlinux.lds working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds
--- linux-2.5.3-pre6/arch/ppc/vmlinux.lds Tue Jan 29 09:16:53 2002
+++ working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds Wed Jan 30 04:00:34 2002
@@ -104,6 +104,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.3-pre6/init/main.c working-2.5.3-pre6-percpu/init/main.c
--- linux-2.5.3-pre6/init/main.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu/init/main.c Wed Jan 30 06:43:10 2002
@@ -274,16 +274,42 @@

#else

+unsigned long __per_cpu_offset[NR_CPUS];
+/* Created by linker magic */
+extern char __per_cpu_start, __per_cpu_end;
+
+static void setup_per_cpu_areas(void)
+{
+ unsigned int i;
+ size_t per_cpu_size;
+ char *region;
+
+ /* Set up per-CPU offset pointers. Page align to be safe. */
+ per_cpu_size = ((&__per_cpu_end - &__per_cpu_start) + PAGE_SIZE-1)
+ & ~(PAGE_SIZE-1);
+ region = kmalloc(per_cpu_size * smp_num_cpus, GFP_KERNEL);
+ if (!region)
+ panic("Could not allocate per-cpu regions: %u bytes\n",
+ per_cpu_size * smp_num_cpus);
+ for (i = 0; i < smp_num_cpus; i++) {
+ memcpy(region + per_cpu_size*i, &__per_cpu_start,
+ &__per_cpu_end - &__per_cpu_start);
+ __per_cpu_offset[i]
+ = (region + per_cpu_size*i) - &__per_cpu_start;
+ }
+ printk("Did %u cpu regions...\n", smp_num_cpus);
+}
+
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
/* Get other processors into their bootup holding patterns. */
smp_boot_cpus();

+ setup_per_cpu_areas();
smp_threads_ready=1;
smp_commence();
}
-
#endif

/*

--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


2002-01-29 09:29:51

by Andrew Morton

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

Rusty Russell wrote:
>
> This patch introduces the __per_cpu_data tag for data, and the
> per_cpu() & this_cpu() macros to go with it.
>
> This allows us to get rid of all those special case structures
> springing up all over the place for CPU-local data.

Am I missing something? smp_init() is called quite late in
the boot process, and if any code touches per-cpu data before
this, it'll get a null pointer deref, won't it?

You could possibly do:

unsigned long __per_cpu_offset[NR_CPUS] = { (unsigned long)&__per_cpu_start, };

which takes care of the boot processor. You lose the ability
to put statically initialised data into the per-cpu area, but
that's not too bad.

Also the boot processor won't be able to initialise stuff which
belongs to other CPUs.

Or the whole thing needs to be moved super-early into the boot
process.

Or I missed something :)

-

2002-01-29 22:09:13

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> Rusty Russell wrote:
> >
> > This patch introduces the __per_cpu_data tag for data, and the
> > per_cpu() & this_cpu() macros to go with it.
> >
> > This allows us to get rid of all those special case structures
> > springing up all over the place for CPU-local data.
>
> Am I missing something? smp_init() is called quite late in
> the boot process, and if any code touches per-cpu data before
> this, it'll get a null pointer deref, won't it?

Yes. But for a large amount of code it doesn't matter, and most
architectures don't know how many CPUs there are before smp_init().
Of course, we could make it NR_CPUS...

Do you have an example where you want to use this before
smp_boot_cpus()? If so, we can bite the bullet. Otherwise I'd prefer
not to waste memory.

BTW, my master plan (well, stolen from Anton and Paulus here) was to
have one register to point to the per-cpu area, and all you need is a
shift down 12 bits, and with 31 to get cpu id. Bwahahahah! (non-x86
of course).
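
A sketch of that arithmetic, under assumed parameters (contiguous 4KB
per-cpu areas, at most 32 CPUs, a base aligned to 32*4KB, and a
hypothetical dedicated register __per_cpu_reg -- none of these names
appear in the patch):

	/* the low bits of the per-cpu pointer then encode the cpu:
	   bits 0-11 are the offset within the 4KB area, and bits
	   12-16 are the cpu number, since the aligned base
	   contributes nothing to those five bits */
	#define smp_processor_id() \
		(((unsigned long)__per_cpu_reg >> 12) & 31)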

Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-29 22:30:35

by Andrew Morton

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

Rusty Russell wrote:
>
> In message <[email protected]> you write:
> > Rusty Russell wrote:
> > >
> > > This patch introduces the __per_cpu_data tag for data, and the
> > > per_cpu() & this_cpu() macros to go with it.
> > >
> > > This allows us to get rid of all those special case structures
> > > springing up all over the place for CPU-local data.
> >
> > Am I missing something? smp_init() is called quite late in
> > the boot process, and if any code touches per-cpu data before
> > this, it'll get a null pointer deref, won't it?
>
> Yes. But for a large amount of code it doesn't matter, and most
> architectures don't know how many CPUs there are before smp_init().
> Of course, we could make it NR_CPUS...

I don't think there's a need. You can use .data.percpu directly
for the boot CPU, dynamically allocate storage for the others.
This actually saves one CPU's worth of memory :)

> Do you have an example where you want to use this before
> smp_boot_cpus()? If so, we can bite the bullet. Otherwise I'd prefer
> not to waste memory.

Well, the slab allocator uses per-CPU data, so with the current patch,
slab.c wouldn't be able to use per_cpu().

But if you use:

unsigned long __per_cpu_offset[NR_CPUS] = { (unsigned long)&__per_cpu_start, };

Then each CPU has, at all times, a valid personal __per_cpu_offset[]
entry. The only restriction is that the boot CPU cannot
touch other CPU's per-cpu data before those CPUs are brought
up.

-

2002-01-29 23:20:03

by Oliver Xymoron

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Tue, 29 Jan 2002, Rusty Russell wrote:

> This patch introduces the __per_cpu_data tag for data, and the
> per_cpu() & this_cpu() macros to go with it.
>
> This allows us to get rid of all those special case structures
> springing up all over the place for CPU-local data.

Seems like we could do slightly better to have these local areas mapped to
the same virtual address on each processor, which does away with the need
for an entire level of indirection.

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2002-01-29 23:40:24

by Linus Torvalds

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6


On Tue, 29 Jan 2002, Oliver Xymoron wrote:
>
> Seems like we could do slightly better to have these local areas mapped to
> the same virtual address on each processor, which does away with the need
> for an entire level of indirection.

No no no.

That is a really stupid idea, even though every single OS developer has at
some time thought it was a great idea (and it shows up in a lot of
OS's).

The reason it is a stupid idea is that if you do it, you can no longer
share page tables between CPU's (unless all CPU's you support have TLB
fill in software).

Linus

2002-01-30 00:21:43

by Oliver Xymoron

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Tue, 29 Jan 2002, Linus Torvalds wrote:

> On Tue, 29 Jan 2002, Oliver Xymoron wrote:
> >
> > Seems like we could do slightly better to have these local areas mapped to
> > the same virtual address on each processor, which does away with the need
> > for an entire level of indirection.
>
> No no no.
>
> The reason it is a stupid idea is that if you do it, you can no longer
> share page tables between CPU's (unless all CPU's you support have TLB
> fill in software).

Yes, obviously.

Nearly as good would be replacing the current logic for figuring out the
current processor id through current with logic to access the per-cpu
data. The primary use of that id is indexing that data anyway.

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2002-01-30 00:43:53

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> Yes, obviously.

Um... if it was so "obvious" why did you suggest it in the first
place? 8)

> Nearly as good would be replacing the current logic for figuring out the
> current processor id through current with logic to access the per-cpu
> data. The primary use of that id is indexing that data anyway.

And if you'd been reading this thread, you would have already seen
this idea, and if you'd read the x86 code, you'd have realised that
the tradeoff is arch-specific.

But thanks anyway: what this list really needs is more armchair kernel
hackers discussing code they haven't really thought about.

Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-30 00:48:03

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> Rusty Russell wrote:
> > Yes. But for a large amount of code it doesn't matter, and most
> > architectures don't know how many CPUs there are before smp_init().
> > Of course, we could make it NR_CPUS...
>
> I don't think there's a need. You can use .data.percpu directly
> for the boot CPU, dynamically allocate storage for the others.
> This actually saves one CPU's worth of memory :)

No it doesn't: we discard the original anyway (it's within the init
section). You also destroy the "derive smp_processor_id from per-cpu
pointer register" idea by making them non-contiguous.

> unsigned long __per_cpu_offset[NR_CPUS] = { (unsigned long)&__per_cpu_start, };
>
> Then each CPU has, at all times, a valid personal __per_cpu_offset[]
> entry. The only restriction is that the boot CPU cannot
> touch other CPU's per-cpu data before those CPUs are brought
> up.

You still can't use anything like the slab allocator in
eg. cpu_init(), which is inviting trouble. I think we really are best
off allocating up front and avoiding this whole mess. Patch below
(no, it won't boot at the moment, since the per_cpu area will be size
0, and that hits a BUG() in alloc_bootmem, but if you put something in
it it works).

Other patches to follow...
Rusty.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/i386/vmlinux.lds working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds
--- linux-2.5.3-pre6/arch/i386/vmlinux.lds Tue Jan 29 09:16:52 2002
+++ working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds Wed Jan 30 10:17:38 2002
@@ -58,6 +58,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/ppc/vmlinux.lds working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds
--- linux-2.5.3-pre6/arch/ppc/vmlinux.lds Tue Jan 29 09:16:53 2002
+++ working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds Wed Jan 30 10:17:38 2002
@@ -104,6 +104,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/include/linux/smp.h working-2.5.3-pre6-percpu/include/linux/smp.h
--- linux-2.5.3-pre6/include/linux/smp.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu/include/linux/smp.h Wed Jan 30 10:17:38 2002
@@ -71,7 +71,17 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

-#else
+#define __per_cpu_data __attribute__((section(".data.percpu")))
+
+#ifndef __HAVE_ARCH_CPU_OFFSET
+#define per_cpu_offset(cpu) (__per_cpu_offset[(cpu)])
+#endif
+
+#define per_cpu(var, cpu) \
+(*((__typeof__(&var))((void *)&var + per_cpu_offset(cpu))))
+
+extern unsigned long __per_cpu_offset[NR_CPUS];
+#else /* !SMP */

/*
* These macros fold the SMP functionality into a single CPU system
@@ -88,6 +98,10 @@
#define cpu_online_map 1
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define __per_cpu_data
+#define per_cpu(var, cpu) var

#endif
+
+#define this_cpu(var) per_cpu(var,smp_processor_id())
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/init/main.c working-2.5.3-pre6-percpu/init/main.c
--- linux-2.5.3-pre6/init/main.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu/init/main.c Wed Jan 30 10:43:33 2002
@@ -272,8 +272,33 @@
#define smp_init() do { } while (0)
#endif

+static inline void setup_per_cpu_areas(void)
+{
+}
#else

+unsigned long __per_cpu_offset[NR_CPUS];
+/* Created by linker magic */
+extern char __per_cpu_start, __per_cpu_end;
+
+static void setup_per_cpu_areas(void)
+{
+ unsigned int i;
+ size_t per_cpu_size;
+ char *region;
+
+ /* Set up per-CPU offset pointers. Page align to be safe. */
+ per_cpu_size = ((&__per_cpu_end - &__per_cpu_start) + PAGE_SIZE-1)
+ & ~(PAGE_SIZE-1);
+ region = alloc_bootmem_pages(per_cpu_size*NR_CPUS);
+ for (i = 0; i < NR_CPUS; i++) {
+ memcpy(region + per_cpu_size*i, &__per_cpu_start,
+ &__per_cpu_end - &__per_cpu_start);
+ __per_cpu_offset[i]
+ = (region + per_cpu_size*i) - &__per_cpu_start;
+ }
+}
+
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
@@ -316,6 +341,7 @@
lock_kernel();
printk(linux_banner);
setup_arch(&command_line);
+ setup_per_cpu_areas();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();

--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-30 00:56:43

by Andrew Morton

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

Rusty Russell wrote:
>
> @@ -316,6 +341,7 @@
> lock_kernel();
> printk(linux_banner);
> setup_arch(&command_line);
> + setup_per_cpu_areas();
> printk("Kernel command line: %s\n", saved_command_line);
> parse_options(command_line);
> trap_init();

Looks good. I'll shut up now.

-

2002-01-30 02:00:14

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Tue, 29 Jan 2002 16:49:15 -0800
Andrew Morton <[email protected]> wrote:

> Looks good. I'll shut up now.

8) Linus, this patch contains the per-cpu patch, and uses it in softirq.c.
It moves the tasklet_vec/tasklet_hi_vec inside softirq.c too (they're not used
anywhere else).

Boots and runs: more cleanups will follow once this is in...

Thanks!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/i386/vmlinux.lds working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds
--- linux-2.5.3-pre6/arch/i386/vmlinux.lds Tue Jan 29 09:16:52 2002
+++ working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds Wed Jan 30 10:17:38 2002
@@ -58,6 +58,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/ppc/vmlinux.lds working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds
--- linux-2.5.3-pre6/arch/ppc/vmlinux.lds Tue Jan 29 09:16:53 2002
+++ working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds Wed Jan 30 10:17:38 2002
@@ -104,6 +104,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/include/linux/smp.h working-2.5.3-pre6-percpu/include/linux/smp.h
--- linux-2.5.3-pre6/include/linux/smp.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu/include/linux/smp.h Wed Jan 30 10:17:38 2002
@@ -71,7 +71,17 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

-#else
+#define __per_cpu_data __attribute__((section(".data.percpu")))
+
+#ifndef __HAVE_ARCH_CPU_OFFSET
+#define per_cpu_offset(cpu) (__per_cpu_offset[(cpu)])
+#endif
+
+#define per_cpu(var, cpu) \
+(*((__typeof__(&var))((void *)&var + per_cpu_offset(cpu))))
+
+extern unsigned long __per_cpu_offset[NR_CPUS];
+#else /* !SMP */

/*
* These macros fold the SMP functionality into a single CPU system
@@ -88,6 +98,10 @@
#define cpu_online_map 1
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define __per_cpu_data
+#define per_cpu(var, cpu) var

#endif
+
+#define this_cpu(var) per_cpu(var,smp_processor_id())
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/init/main.c working-2.5.3-pre6-percpu/init/main.c
--- linux-2.5.3-pre6/init/main.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu/init/main.c Wed Jan 30 10:43:33 2002
@@ -272,8 +272,33 @@
#define smp_init() do { } while (0)
#endif

+static inline void setup_per_cpu_areas(void)
+{
+}
#else

+unsigned long __per_cpu_offset[NR_CPUS];
+/* Created by linker magic */
+extern char __per_cpu_start, __per_cpu_end;
+
+static void setup_per_cpu_areas(void)
+{
+ unsigned int i;
+ size_t per_cpu_size;
+ char *region;
+
+ /* Set up per-CPU offset pointers. Page align to be safe. */
+ per_cpu_size = ((&__per_cpu_end - &__per_cpu_start) + PAGE_SIZE-1)
+ & ~(PAGE_SIZE-1);
+ region = alloc_bootmem_pages(per_cpu_size*NR_CPUS);
+ for (i = 0; i < NR_CPUS; i++) {
+ memcpy(region + per_cpu_size*i, &__per_cpu_start,
+ &__per_cpu_end - &__per_cpu_start);
+ __per_cpu_offset[i]
+ = (region + per_cpu_size*i) - &__per_cpu_start;
+ }
+}
+
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
@@ -316,6 +341,7 @@
lock_kernel();
printk(linux_banner);
setup_arch(&command_line);
+ setup_per_cpu_areas();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/include/linux/interrupt.h working-2.5.3-pre6-percpu-tasklet/include/linux/interrupt.h
--- working-2.5.3-pre6-percpu/include/linux/interrupt.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu-tasklet/include/linux/interrupt.h Wed Jan 30 12:00:08 2002
@@ -124,14 +124,6 @@
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};

-struct tasklet_head
-{
- struct tasklet_struct *list;
-} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
-
-extern struct tasklet_head tasklet_vec[NR_CPUS];
-extern struct tasklet_head tasklet_hi_vec[NR_CPUS];
-
#ifdef CONFIG_SMP
static inline int tasklet_trylock(struct tasklet_struct *t)
{
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/kernel/ksyms.c working-2.5.3-pre6-percpu-tasklet/kernel/ksyms.c
--- working-2.5.3-pre6-percpu/kernel/ksyms.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu-tasklet/kernel/ksyms.c Wed Jan 30 12:00:16 2002
@@ -542,8 +542,6 @@
EXPORT_SYMBOL(strsep);

/* software interrupts */
-EXPORT_SYMBOL(tasklet_hi_vec);
-EXPORT_SYMBOL(tasklet_vec);
EXPORT_SYMBOL(bh_task_vec);
EXPORT_SYMBOL(init_bh);
EXPORT_SYMBOL(remove_bh);
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/kernel/softirq.c working-2.5.3-pre6-percpu-tasklet/kernel/softirq.c
--- working-2.5.3-pre6-percpu/kernel/softirq.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu-tasklet/kernel/softirq.c Wed Jan 30 12:34:12 2002
@@ -145,9 +145,13 @@


/* Tasklets */
+struct tasklet_head
+{
+ struct tasklet_struct *list;
+};

-struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned_in_smp;
-struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned_in_smp;
+static struct tasklet_head tasklet_vec __per_cpu_data;
+static struct tasklet_head tasklet_hi_vec __per_cpu_data;

void __tasklet_schedule(struct tasklet_struct *t)
{
@@ -155,8 +159,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
@@ -167,8 +171,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_restore(flags);
}
@@ -179,8 +183,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -200,8 +204,8 @@
}

local_irq_disable();
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_enable();
}
@@ -213,8 +217,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -234,8 +238,8 @@
}

local_irq_disable();
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
__cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_enable();
}

2002-01-30 02:14:14

by Linus Torvalds

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6


On Wed, 30 Jan 2002, Rusty Russell wrote:
>
> Boots and runs: more cleanups will follow once this is in...

May I suggest a few more cleanups before I apply it, namely try to
abstract out that "&__per_cpu_end-&__per_cpu_start" thing and make it more
readable what is going on in setup_per_cpu_areas() which (quite frankly)
is otherwise a prime candidate for the obfuscated C contest.

Also, wouldn't it be much nicer to just leave CPU#0 at the start of
.data.percpu, and not have to have an offset for CPU0 at all? That would
sure make the UP case cleaner.

Linus

2002-01-30 04:37:53

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> Also, wouldn't it be much nicer to just leave CPU#0 at the start of
> .data.percpu, and not have to have an offset for CPU0 at all? That would
> sure make the UP case cleaner.

Tried that, but it's uglier 8( UP never hits this code, and we
discard the original section anyway (ie. it's in the init section).

Main neatening comes from introducing an ALIGN macro in linux/cache.h.
I also extended __HAVE_ARCH_PER_CPU so the whole mechanism can be
replaced by arches without touching the core (thanks Anton). This means
that they can allocate a fixed amount per CPU and keep the per-cpu
pointer in a register, from which smp_processor_id() can be simply
derived.
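
As a quick illustration of the round-up arithmetic behind ALIGN()
(the values are invented for the example):

	#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
	/* for power-of-two a: adding a-1 overshoots by at most a-1,
	   and masking with ~(a-1) truncates back to a multiple of a,
	   so ALIGN(1, 32) == 32, ALIGN(32, 32) == 32,
	   ALIGN(33, 32) == 64 */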

Hope this is better,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/include/linux/cache.h working-2.5.3-pre6-percpu/include/linux/cache.h
--- linux-2.5.3-pre6/include/linux/cache.h Thu Jan 17 16:34:59 2002
+++ working-2.5.3-pre6-percpu/include/linux/cache.h Wed Jan 30 14:59:20 2002
@@ -4,8 +4,10 @@
#include <linux/config.h>
#include <asm/cache.h>

+#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+
#ifndef L1_CACHE_ALIGN
-#define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1))
+#define L1_CACHE_ALIGN(x) ALIGN(x, L1_CACHE_BYTES)
#endif

#ifndef SMP_CACHE_BYTES
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/include/linux/smp.h working-2.5.3-pre6-percpu/include/linux/smp.h
--- linux-2.5.3-pre6/include/linux/smp.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu/include/linux/smp.h Wed Jan 30 14:23:58 2002
@@ -71,7 +71,18 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

-#else
+#define __per_cpu_data __attribute__((section(".data.percpu")))
+
+#ifndef __HAVE_ARCH_PER_CPU
+extern unsigned long __per_cpu_offset[NR_CPUS];
+
+#define per_cpu(var, cpu) \
+ (*((__typeof__(&var))((void *)&var + __per_cpu_offset[(cpu)])))
+
+#define this_cpu(var) per_cpu(var,smp_processor_id())
+
+#endif /* !__HAVE_ARCH_PER_CPU */
+#else /* !SMP */

/*
* These macros fold the SMP functionality into a single CPU system
@@ -88,6 +99,9 @@
#define cpu_online_map 1
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define __per_cpu_data
+#define per_cpu(var, cpu) var
+#define this_cpu(var) var

#endif
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/init/main.c working-2.5.3-pre6-percpu/init/main.c
--- linux-2.5.3-pre6/init/main.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu/init/main.c Wed Jan 30 14:44:33 2002
@@ -272,8 +272,32 @@
#define smp_init() do { } while (0)
#endif

+static inline void setup_per_cpu_areas(void)
+{
+}
#else

+#ifndef __HAVE_ARCH_PER_CPU
+unsigned long __per_cpu_offset[NR_CPUS];
+
+static void __init setup_per_cpu_areas(void)
+{
+ unsigned long size, i;
+ char *ptr;
+ /* Created by linker magic */
+ extern char __per_cpu_start, __per_cpu_end;
+
+ /* Copy section for each CPU (we discard the original) */
+ size = ALIGN(&__per_cpu_end - &__per_cpu_start, SMP_CACHE_BYTES);
+ ptr = alloc_bootmem(size * NR_CPUS);
+
+ for (i = 0; i < NR_CPUS; i++, ptr += size) {
+ __per_cpu_offset[i] = ptr - &__per_cpu_start;
+ memcpy(ptr, &__per_cpu_start, size);
+ }
+}
+#endif /* !__HAVE_ARCH_PER_CPU */
+
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
@@ -316,6 +340,7 @@
lock_kernel();
printk(linux_banner);
setup_arch(&command_line);
+ setup_per_cpu_areas();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/i386/vmlinux.lds working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds
--- linux-2.5.3-pre6/arch/i386/vmlinux.lds Tue Jan 29 09:16:52 2002
+++ working-2.5.3-pre6-percpu/arch/i386/vmlinux.lds Wed Jan 30 12:02:35 2002
@@ -58,6 +58,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3-pre6/arch/ppc/vmlinux.lds working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds
--- linux-2.5.3-pre6/arch/ppc/vmlinux.lds Tue Jan 29 09:16:53 2002
+++ working-2.5.3-pre6-percpu/arch/ppc/vmlinux.lds Wed Jan 30 12:02:35 2002
@@ -104,6 +104,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/include/linux/interrupt.h working-2.5.3-pre6-percpu-tasklet/include/linux/interrupt.h
--- working-2.5.3-pre6-percpu/include/linux/interrupt.h Thu Jan 17 16:35:24 2002
+++ working-2.5.3-pre6-percpu-tasklet/include/linux/interrupt.h Wed Jan 30 12:00:08 2002
@@ -124,14 +124,6 @@
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};

-struct tasklet_head
-{
- struct tasklet_struct *list;
-} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
-
-extern struct tasklet_head tasklet_vec[NR_CPUS];
-extern struct tasklet_head tasklet_hi_vec[NR_CPUS];
-
#ifdef CONFIG_SMP
static inline int tasklet_trylock(struct tasklet_struct *t)
{
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/kernel/ksyms.c working-2.5.3-pre6-percpu-tasklet/kernel/ksyms.c
--- working-2.5.3-pre6-percpu/kernel/ksyms.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu-tasklet/kernel/ksyms.c Wed Jan 30 12:00:16 2002
@@ -542,8 +542,6 @@
EXPORT_SYMBOL(strsep);

/* software interrupts */
-EXPORT_SYMBOL(tasklet_hi_vec);
-EXPORT_SYMBOL(tasklet_vec);
EXPORT_SYMBOL(bh_task_vec);
EXPORT_SYMBOL(init_bh);
EXPORT_SYMBOL(remove_bh);
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.3-pre6-percpu/kernel/softirq.c working-2.5.3-pre6-percpu-tasklet/kernel/softirq.c
--- working-2.5.3-pre6-percpu/kernel/softirq.c Tue Jan 29 09:17:09 2002
+++ working-2.5.3-pre6-percpu-tasklet/kernel/softirq.c Wed Jan 30 12:34:12 2002
@@ -145,9 +145,13 @@


/* Tasklets */
+struct tasklet_head
+{
+ struct tasklet_struct *list;
+};

-struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned_in_smp;
-struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned_in_smp;
+static struct tasklet_head tasklet_vec __per_cpu_data;
+static struct tasklet_head tasklet_hi_vec __per_cpu_data;

void __tasklet_schedule(struct tasklet_struct *t)
{
@@ -155,8 +159,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
@@ -167,8 +171,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_restore(flags);
}
@@ -179,8 +183,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -200,8 +204,8 @@
}

local_irq_disable();
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_enable();
}
@@ -213,8 +217,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -234,8 +238,8 @@
}

local_irq_disable();
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
__cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_enable();
}

2002-01-30 07:00:36

by Oliver Xymoron

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Wed, 30 Jan 2002, Rusty Russell wrote:

> In message <[email protected]> you write:
>
> > Nearly as good would be replacing the current logic for figuring out the
> > current processor id through current with logic to access the per-cpu
> > data. The primary use of that id is indexing that data anyway.
>
> And if you'd been reading this thread, you would have already seen
> this idea, and if you'd read the x86 code, you'd have realised that
> the tradeoff is arch-specific.

This 'thread' as you call it began with the message I replied to. Nor can
I find any other reference to the idea on recent threads relating to
per_cpu on this list.

Looking closer, I've found an issue with your patch related to the above,
so I think it bears closer examination. The common case is of course to
access data for the current CPU, either by dedicated register on non-x86
as you've mentioned, or a scheme like I proposed, which works out
something like this:

#define per_cpu_offset(cpu) (current->per_cpu_offset)
#define smp_processor_id() (per_cpu(current_processor_id,0))

The problem is that neither of these schemes lets you get at the data
on anything but the current processor, because per_cpu would perforce
be ignoring its CPU argument to take advantage of the convenient
pointer/register. What is needed is an additional this_cpu interface
that looks generically something like this:

#define this_cpu(var) \
(*((__typeof__(&var))((void *)&var + per_cpu_offset(smp_processor_id()))))

and on x86 something like this:

#define this_cpu(var) \
(*((__typeof__(&var))((void *)&var + current->per_cpu_offset)))
#define smp_processor_id() (this_cpu(current_processor_id))

I suspect in practice the lack of a this_cpu style interface would be a
great nuisance, given that it's _the common case by definition_. Reviewing
the other arches in the main tree, going through current seems to _always_
be better than indexing __per_cpu_offset for the common case because all
arches except SPARC look in current for the processor id and SPARC already
has a register for current. And there have been various suggestions for
fitting current into special registers on x86 as well.

(While it's obviously possible to extend the task struct to hold both the
offset and the processor id, it seems that the main use of the
smp_processor_id is looking up per-cpu data and that it should largely
disappear once a per_cpu/this_cpu framework is in use. And updating both
on moving between processors would then seem pointless.)

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2002-01-30 08:23:13

by Richard Henderson

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Wed, Jan 30, 2002 at 01:00:26PM +1100, Rusty Russell wrote:
> +#define per_cpu(var, cpu) \
> +(*((__typeof__(&var))((void *)&var + per_cpu_offset(cpu))))

Have we already forgotten the ppc reloc flamefest? Better
written as

#define per_cpu(var, cpu) \
({ __typeof__(&(var)) __ptr; \
__asm__ ("" : "=g"(__ptr) \
: "0"((void *)&(var) + per_cpu_offset(cpu))); \
*__ptr; })

> +/* Created by linker magic */
> +extern char __per_cpu_start, __per_cpu_end;
[...]
> + per_cpu_size = ((&__per_cpu_end - &__per_cpu_start) + PAGE_SIZE-1)

Will fail on targets (e.g. alpha and mips) that have a notion of a
"small data area" that can be addressed with special relocs.

Better written as

extern char __per_cpu_start[], __per_cpu_end[];
per_cpu_size = (__per_cpu_end - __per_cpu_start) ...


r~

2002-01-30 19:43:26

by Oliver Xymoron

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Wed, 30 Jan 2002, Oliver Xymoron wrote:

> On Wed, 30 Jan 2002, Rusty Russell wrote:
>
> > In message <[email protected]> you write:
> >
> > > Nearly as good would be replacing the current logic for figuring out the
> > > current processor id through current with logic to access the per-cpu
> > > data. The primary use of that id is indexing that data anyway.
> >
> > And if you'd been reading this thread, you would have already seen
> > this idea, and if you'd read the x86 code, you'd have realised that
> > the tradeoff is arch-specific.
>
> Looking closer, I've found an issue with your patch related to the above,
> so I think it bears closer examination.

Looking closer again (and not at 1am), I see I missed this line in reading
your patch, sorry:

+#define this_cpu(var) per_cpu(var,smp_processor_id())

I still think that tracking per_cpu_offset in the task struct to
eventually replace current->processor is a win. Basically everyone
except Sparc goes through current anyway for smp_processor_id, and
Sparc caches current in a register. Please elucidate your reference to
"arch-specific tradeoffs".

Also, it'd be nice to unmap the original copy of the area or at least
poison it to catch silent references to var without going through
this_cpu, which will probably prove very hard to find. If there were a way
to do this at the C source level and catch such things at compile time,
that'd be even better, but I can't see a way to do it without grotty
macros.

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2002-01-30 22:45:30

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> Have we already forgotten the ppc reloc flamefest? Better
> written as
>
> #define per_cpu(var, cpu) \
> ({ __typeof__(&(var)) __ptr; \
> __asm__ ("" : "=g"(__ptr) \
> : "0"((void *)&(var) + per_cpu_offset(cpu))); \
> *__ptr; })

"better". Believe me, I was fully aware, but I refuse to write such
crap unless *proven* to be required.

> > +/* Created by linker magic */
> > +extern char __per_cpu_start, __per_cpu_end;
> [...]
> > + per_cpu_size = ((&__per_cpu_end - &__per_cpu_start) + PAGE_SIZE-1)
>
> Will fail on targets (e.g. alpha and mips) that have a notion of a
> "small data area" that can be addressed with special relocs.
>
> Better written as
>
> extern char __per_cpu_start[], __per_cpu_end[];
> per_cpu_size = (__per_cpu_end - __per_cpu_start) ...

I agree that this is much better. But I do not understand what small
relocs have to do with simple address arithmetic? You've always been
right before: what am I missing?

Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-30 23:01:10

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> I still think that tracking per_cpu_offset in task struct to eventually
> replace current->processor is a win. Basically everyone except Sparc goes
> through current anyway for smp_processor_id and Sparc caches current in a
> register. Please elucidate your reference to "arch-specific tradeoffs".

Placing useful information in the task struct is a hack. A useful
hack on the register-starved x86, of course. PPC64 will probably use
a register, too.

BTW, apologies for my previous accusations of not reading the
thread. Your reply predated mine by 12 hours:
http://www.ozlabs.org/~rusty/Stupidity.html#9

> Also, it'd be nice to unmap the original copy of the area or at least
> poison it to catch silent references to var without going through
> this_cpu, which will probably prove very hard to find. If there were a way
> to do this at the C source level and catch such things at compile time,
> that'd be even better, but I can't see a way to do it without grotty
> macros.

My first cut did this, with a macro:

DECLARE_PER_CPU(int x);

This allows you to munge x into x__per_cpu, to catch "bare"
references. But I decided against it for two reasons: firstly
__per_cpu_data is nicer, and secondly my proc/sys rewrite can handle
per-cpu data easily if the name is valid.
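
A rough reconstruction of what such a first cut could have looked like
(the two-argument form and the exact suffix handling are guesses, not
Rusty's actual code):

	/* only name##__per_cpu exists as a symbol, so any "bare"
	   reference to name fails at compile time */
	#define DECLARE_PER_CPU(type, name) \
		type name##__per_cpu __per_cpu_data
	#define per_cpu(name, cpu) \
		(*((__typeof__(&name##__per_cpu)) \
		   ((void *)&name##__per_cpu + per_cpu_offset(cpu))))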

In practice, grep should be sufficient.

Cheers!
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-01-31 05:49:51

by Richard Henderson

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Thu, Jan 31, 2002 at 09:45:45AM +1100, Rusty Russell wrote:
> "better". Believe me, I was fully aware, but I refuse to write such
> crap unless *proven* to be required.

You're going to wait until the compiler generates incorrect
code for you after knowing that it *probably* will? Nice.

> I agree that this is much better. But do not understand what small
> relocs have to do with simple address arithmetic? You've always been
> right before: what am I missing?

"Small" variables may be positioned by the compiler such that
they are addressable via a 16-bit relocation from some GP register.
If that variable isn't actually located in the small data area,
then the 16-bit relocation may overflow, resulting in link errors.

So it isn't a matter of the arithmetic itself, but of loading the
addresses with which to do the arithmetic.

By declaring the variable to be an array of unspecified size,
you're giving the compiler no information as to the size of the
variable, and so it cannot assume the variable is located in the
small data area.
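
Side by side, the two declaration styles being discussed:

	extern char __per_cpu_start;	/* known 1-byte object: the
					   compiler may assume it sits
					   in the small data area and
					   emit a 16-bit GP-relative
					   reloc for its address */
	extern char __per_cpu_start[];	/* unknown size: no small-data
					   assumption, so a full-width
					   address is always generated */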


r~

2002-02-01 08:56:13

by Rusty Russell

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

In message <[email protected]> you write:
> On Thu, Jan 31, 2002 at 09:45:45AM +1100, Rusty Russell wrote:
> > "better". Believe me, I was fully aware, but I refuse to write such
> > crap unless *proven* to be required.
>
> You're going to wait until the compiler generates incorrect
> code for you after knowing that it *probably* will? Nice.

It's a judgement call: the obvious code "might" fail, the obfuscated
code "might" fail. However, your judgement >> mine.

> "Small" variables may be positioned by the compiler such that
> they are addressable via a 16-bit relocation from some GP register.
> If that variable isn't actually located in the small data area,
> then the 16-bit relocation may overflow, resulting in link errors.

Ahhh... Thanks for the clue injection.

This better?
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/arch/i386/vmlinux.lds working-2.5.3-percpu/arch/i386/vmlinux.lds
--- linux-2.5.3/arch/i386/vmlinux.lds Thu Jan 31 08:58:52 2002
+++ working-2.5.3-percpu/arch/i386/vmlinux.lds Fri Feb 1 09:11:18 2002
@@ -58,6 +58,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/arch/ppc/vmlinux.lds working-2.5.3-percpu/arch/ppc/vmlinux.lds
--- linux-2.5.3/arch/ppc/vmlinux.lds Thu Jan 31 08:58:55 2002
+++ working-2.5.3-percpu/arch/ppc/vmlinux.lds Fri Feb 1 09:11:18 2002
@@ -104,6 +104,10 @@
*(.initcall7.init)
}
__initcall_end = .;
+ . = ALIGN(32);
+ __per_cpu_start = .;
+ .data.percpu : { *(.data.percpu) }
+ __per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/include/linux/cache.h working-2.5.3-percpu/include/linux/cache.h
--- linux-2.5.3/include/linux/cache.h Fri Feb 1 19:21:37 2002
+++ working-2.5.3-percpu/include/linux/cache.h Fri Feb 1 19:17:09 2002
@@ -4,8 +4,10 @@
#include <linux/config.h>
#include <asm/cache.h>

+#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+
#ifndef L1_CACHE_ALIGN
-#define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1))
+#define L1_CACHE_ALIGN(x) ALIGN(x, L1_CACHE_BYTES)
#endif

#ifndef SMP_CACHE_BYTES
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/include/linux/compiler.h working-2.5.3-percpu/include/linux/compiler.h
--- linux-2.5.3/include/linux/compiler.h Wed Sep 19 07:12:45 2001
+++ working-2.5.3-percpu/include/linux/compiler.h Fri Feb 1 09:46:02 2002
@@ -13,4 +13,12 @@
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)

+/* This macro obfuscates arithmetic on a variable address so that gcc
+ shouldn't recognize it. This prevents optimizations such as the
+ famous:
+ strcpy(s, "xxx"+X) => memcpy(s, "xxx"+X, 4-X) */
+#define rth(var, off) \
+ ({ __typeof__(&(var)) __ptr; \
+ __asm__ ("" : "=g"(__ptr) : "0"((void *)&(var) + (off))); \
+ *__ptr; })
#endif /* __LINUX_COMPILER_H */
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/include/linux/interrupt.h working-2.5.3-percpu/include/linux/interrupt.h
--- linux-2.5.3/include/linux/interrupt.h Fri Feb 1 19:21:37 2002
+++ working-2.5.3-percpu/include/linux/interrupt.h Fri Feb 1 19:27:57 2002
@@ -124,14 +124,6 @@
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};

-struct tasklet_head
-{
- struct tasklet_struct *list;
-} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
-
-extern struct tasklet_head tasklet_vec[NR_CPUS];
-extern struct tasklet_head tasklet_hi_vec[NR_CPUS];
-
#ifdef CONFIG_SMP
static inline int tasklet_trylock(struct tasklet_struct *t)
{
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/include/linux/smp.h working-2.5.3-percpu/include/linux/smp.h
--- linux-2.5.3/include/linux/smp.h Fri Feb 1 19:21:37 2002
+++ working-2.5.3-percpu/include/linux/smp.h Fri Feb 1 19:17:09 2002
@@ -11,6 +11,7 @@
#ifdef CONFIG_SMP

#include <linux/kernel.h>
+#include <linux/compiler.h>
#include <asm/smp.h>

/*
@@ -71,7 +72,17 @@
#define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
#define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */

-#else
+#define __per_cpu_data __attribute__((section(".data.percpu")))
+
+#ifndef __HAVE_ARCH_PER_CPU
+extern unsigned long __per_cpu_offset[NR_CPUS];
+
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) rth(var, per_cpu_offset(cpu))
+
+#define this_cpu(var) per_cpu(var, smp_processor_id())
+#endif /* !__HAVE_ARCH_PER_CPU */
+#else /* !SMP */

/*
* These macros fold the SMP functionality into a single CPU system
@@ -88,6 +99,9 @@
#define cpu_online_map 1
static inline void smp_send_reschedule(int cpu) { }
static inline void smp_send_reschedule_all(void) { }
+#define __per_cpu_data
+#define per_cpu(var, cpu) var
+#define this_cpu(var) var

#endif
#endif
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/init/main.c working-2.5.3-percpu/init/main.c
--- linux-2.5.3/init/main.c Thu Jan 31 08:59:17 2002
+++ working-2.5.3-percpu/init/main.c Fri Feb 1 19:13:52 2002
@@ -272,8 +272,32 @@
#define smp_init() do { } while (0)
#endif

+static inline void setup_per_cpu_areas(void)
+{
+}
#else

+#ifndef __HAVE_ARCH_PER_CPU
+unsigned long __per_cpu_offset[NR_CPUS];
+
+static void __init setup_per_cpu_areas(void)
+{
+ unsigned long size, i;
+ char *ptr;
+ /* Created by linker magic */
+ extern char __per_cpu_start[], __per_cpu_end[];
+
+ /* Copy section for each CPU (we discard the original) */
+ size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+ ptr = alloc_bootmem(size * NR_CPUS);
+
+ for (i = 0; i < NR_CPUS; i++, ptr += size) {
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, size);
+ }
+}
+#endif /* !__HAVE_ARCH_PER_CPU */
+
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
@@ -316,6 +340,7 @@
lock_kernel();
printk(linux_banner);
setup_arch(&command_line);
+ setup_per_cpu_areas();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/kernel/ksyms.c working-2.5.3-percpu/kernel/ksyms.c
--- linux-2.5.3/kernel/ksyms.c Thu Jan 31 08:59:17 2002
+++ working-2.5.3-percpu/kernel/ksyms.c Fri Feb 1 09:11:18 2002
@@ -543,8 +543,6 @@
EXPORT_SYMBOL(strsep);

/* software interrupts */
-EXPORT_SYMBOL(tasklet_hi_vec);
-EXPORT_SYMBOL(tasklet_vec);
EXPORT_SYMBOL(bh_task_vec);
EXPORT_SYMBOL(init_bh);
EXPORT_SYMBOL(remove_bh);
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.3/kernel/softirq.c working-2.5.3-percpu/kernel/softirq.c
--- linux-2.5.3/kernel/softirq.c Thu Jan 31 08:59:17 2002
+++ working-2.5.3-percpu/kernel/softirq.c Fri Feb 1 09:11:18 2002
@@ -145,9 +145,13 @@


/* Tasklets */
+struct tasklet_head
+{
+ struct tasklet_struct *list;
+};

-struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned_in_smp;
-struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned_in_smp;
+static struct tasklet_head tasklet_vec __per_cpu_data;
+static struct tasklet_head tasklet_hi_vec __per_cpu_data;

void __tasklet_schedule(struct tasklet_struct *t)
{
@@ -155,8 +159,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
@@ -167,8 +171,8 @@
unsigned long flags;

local_irq_save(flags);
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_restore(flags);
}
@@ -179,8 +183,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -200,8 +204,8 @@
}

local_irq_disable();
- t->next = tasklet_vec[cpu].list;
- tasklet_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_vec, cpu).list;
+ per_cpu(tasklet_vec, cpu).list = t;
__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
local_irq_enable();
}
@@ -213,8 +217,8 @@
struct tasklet_struct *list;

local_irq_disable();
- list = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = NULL;
+ list = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = NULL;
local_irq_enable();

while (list) {
@@ -234,8 +238,8 @@
}

local_irq_disable();
- t->next = tasklet_hi_vec[cpu].list;
- tasklet_hi_vec[cpu].list = t;
+ t->next = per_cpu(tasklet_hi_vec, cpu).list;
+ per_cpu(tasklet_hi_vec, cpu).list = t;
__cpu_raise_softirq(cpu, HI_SOFTIRQ);
local_irq_enable();
}

2002-02-01 16:52:45

by Richard Henderson

Subject: Re: [PATCH] per-cpu areas for 2.5.3-pre6

On Fri, Feb 01, 2002 at 07:56:24PM +1100, Rusty Russell wrote:
> This better?

Yep. Thanks.


r~