2008-12-23 17:15:20

by Brian Gerst

Subject: [PATCH 1/3] x86-64: Convert the PDA to percpu.

This patch makes the PDA a normal per-cpu variable, allowing the
removal of the special allocator code. %gs still points to the
base of the PDA.

Tested on a dual-core AMD64 system.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 3 --
arch/x86/include/asm/percpu.h | 3 --
arch/x86/include/asm/setup.h | 1 -
arch/x86/kernel/cpu/common.c | 6 ++--
arch/x86/kernel/dumpstack_64.c | 8 ++--
arch/x86/kernel/head64.c | 23 +------------
arch/x86/kernel/irq.c | 2 +-
arch/x86/kernel/nmi.c | 2 +-
arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
arch/x86/kernel/smpboot.c | 58 +--------------------------------
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/smp.c | 12 +------
12 files changed, 27 insertions(+), 163 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..b008127 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -39,11 +39,8 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda **_cpu_pda;
extern void pda_init(int);

-#define cpu_pda(i) (_cpu_pda[i])
-
/*
* There is no fast way to get the base address of the PDA, all the accesses
* have to mention %fs/%gs. So it needs to be done this Torvaldian way.
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece7205..6f866fd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -12,11 +12,8 @@
#ifdef CONFIG_SMP
#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
#define __my_cpu_offset read_pda(data_offset)

-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
#endif
#include <asm-generic/percpu.h>

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d372..b751aaf 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,7 +93,6 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;

#else
-void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0..7d741ef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -857,8 +857,8 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_PAGE_ALIGNED(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);

struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

@@ -866,7 +866,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;

void __cpuinit pda_init(int cpu)
{
- struct x8664_pda *pda = cpu_pda(cpu);
+ struct x8664_pda *pda = &per_cpu(pda, cpu);

/* Setup up data that may be needed in __get_free_pages early */
loadsegment(fs, 0);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7..1098b21 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -163,7 +163,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)per_cpu(pda, cpu).irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;

@@ -306,9 +306,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
int i;
const int cpu = smp_processor_id();
unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr);
unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ (unsigned long *) (per_cpu(pda, cpu).irqstackptr - IRQSTACKSIZE);

/*
* debugging aid: "show_stack(NULL, NULL);" prints the
@@ -374,7 +374,7 @@ void show_registers(struct pt_regs *regs)
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = per_cpu(pda, cpu).pcurrent;

sp = regs->sp;
printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f..274e2a9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,27 +25,6 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -111,7 +90,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");

- x86_64_init_pda();
+ pda_init(0);

x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc5..066e680 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -38,7 +38,7 @@ void ack_bad_irq(unsigned int irq)
#ifdef CONFIG_X86_32
# define irq_stats(x) (&per_cpu(irq_stat, x))
#else
-# define irq_stats(x) cpu_pda(x)
+# define irq_stats(x) (&per_cpu(pda, x))
#endif
/*
* /proc/interrupts printing:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07..4a5bb40 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -64,7 +64,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
+ return per_cpu(pda, cpu).__nmi_count;
#else
return nmi_count(cpu);
#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3..fb0ccdc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,58 +80,8 @@ static void __init setup_per_cpu_maps(void)
#endif
}

-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
-#endif

/*
* Great future plan:
@@ -145,9 +95,6 @@ void __init setup_per_cpu_areas(void)
int cpu;
unsigned long align = 1;

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
old_size = PERCPU_ENOUGH_ROOM;
align = max_t(unsigned long, PAGE_SIZE, align);
@@ -179,10 +126,21 @@ void __init setup_per_cpu_areas(void)
cpu, node, __pa(ptr));
}
#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ __per_cpu_offset[cpu] = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+ if (cpu)
+ memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
+ per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
+#endif
}

+#ifdef CONFIG_X86_64
+ mb();
+ wrmsrl(MSR_GS_BASE, &per_cpu(pda, 0));
+ mb();
+#endif
+
printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
NR_CPUS, nr_cpu_ids, nr_node_ids);

@@ -229,8 +187,8 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ if (node != NUMA_NO_NODE)
+ per_cpu(pda, cpu).nodenumber = node;

if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f71f96f..d25b989 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -753,52 +753,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -816,16 +770,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
};
INIT_WORK(&c_idle.work, do_fork_idle);

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);
@@ -861,7 +805,7 @@ do_rest:
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
+ per_cpu(pda, cpu).pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d..0160bb6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1649,7 +1649,7 @@ asmlinkage void __init xen_start_kernel(void)
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
- x86_64_init_pda();
+ pda_init(0);
#endif

xen_smp_init();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b67..17823cb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -280,22 +280,12 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
+ per_cpu(pda, cpu).pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
--
1.6.1.rc1
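
For reference, the generic per-cpu machinery that the PDA now rides on
works roughly as follows. This is a simplified sketch modeled on the
2.6.28-era asm-generic/percpu.h and compiler-gcc.h definitions, not the
verbatim kernel code:

/* Each possible cpu gets its own copy of the .data.percpu section;
 * __per_cpu_offset[cpu] records how far that copy sits from the
 * original section. setup_per_cpu_areas() fills it in at boot. */
extern unsigned long __per_cpu_offset[];

/* RELOC_HIDE() adds an offset to a pointer while hiding the
 * arithmetic from the optimizer (from linux/compiler-gcc.h). */
#define RELOC_HIDE(ptr, off)                                    \
({                                                              \
        unsigned long __ptr;                                    \
        __asm__("" : "=r" (__ptr) : "0" (ptr));                 \
        (typeof(ptr)) (__ptr + (off));                          \
})

/* per_cpu(var, cpu): shift the link-time address of per_cpu__var
 * into cpu's copy of the per-cpu area. */
#define per_cpu(var, cpu)                                       \
        (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[(cpu)]))

With the PDA defined via DEFINE_PER_CPU_PAGE_ALIGNED(), per_cpu(pda, cpu)
needs nothing beyond this copy-and-offset scheme, which is why the
hand-rolled allocator code (setup_cpu_pda_map(), get_local_pda()) can be
deleted.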


2008-12-23 17:15:36

by Brian Gerst

Subject: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.

Merge the 32-bit and 64-bit versions of these functions. Unlike 32-bit,
the segment base is the current cpu's PDA instead of the offset from the
original per-cpu area. This is because GCC hardcodes the stackprotector
canary at %gs:40. Since the assembler is incapable of relocating against
multiple symbols, the code ends up looking like:

movq $per_cpu__var, reg
subq $per_cpu__pda, reg
movq %gs:(reg), reg

This is still atomic since the offset is a constant (just calculated at
runtime) and not dependent on the cpu number.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/percpu.h | 92 +++++++++++++++++-----------------------
1 files changed, 39 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 6f866fd..f704243 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,54 +1,9 @@
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H

-#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __my_cpu_offset read_pda(data_offset)
-
-#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment. x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime. The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect. However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var) \
- ({ \
- typeof(per_cpu_var(var)) __tmp; \
- preempt_disable(); \
- __tmp = __get_cpu_var(var); \
- preempt_enable(); \
- __tmp; \
- })
-
-#define x86_write_percpu(var, val) \
- do { \
- preempt_disable(); \
- __get_cpu_var(var) = (val); \
- preempt_enable(); \
- } while(0)
-
-#else /* CONFIG_X86_64 */
-
#ifdef __ASSEMBLY__

+#ifdef CONFIG_X86_32
/*
* PER_CPU finds an address of a per-cpu variable.
*
@@ -72,6 +27,8 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
#define PER_CPU_VAR(var) per_cpu__##var
#endif /* SMP */

+#endif /* X86_32 */
+
#else /* ...!ASSEMBLY */

/*
@@ -88,19 +45,37 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
*/
#ifdef CONFIG_SMP

+#ifdef CONFIG_X86_32
+
#define __my_cpu_offset x86_read_percpu(this_cpu_off)

/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
+#define __percpu_seg_off(x) (x)
+
+#else
+
+#define __my_cpu_offset read_pda(data_offset)
+
+#define __percpu_seg "%%gs:"
+#define __percpu_seg_off(x) RELOC_HIDE((x), -(unsigned long)&per_cpu__pda)
+
+#endif

#else /* !SMP */

#define __percpu_seg ""
+#define __percpu_seg_off(x) (x)

#endif /* SMP */

#include <asm-generic/percpu.h>

+#ifdef CONFIG_X86_64
+#include <asm/pda.h>
+DECLARE_PER_CPU(struct x8664_pda, pda);
+#endif
+
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

@@ -111,6 +86,7 @@ extern void __bad_percpu_size(void);
#define percpu_to_op(op, var, val) \
do { \
typedef typeof(var) T__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
if (0) { \
T__ tmp__; \
tmp__ = (val); \
@@ -118,17 +94,22 @@ do { \
switch (sizeof(var)) { \
case 1: \
asm(op "b %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 2: \
asm(op "w %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
case 4: \
asm(op "l %1,"__percpu_seg"%0" \
- : "+m" (var) \
+ : "+m" (*var__) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 8: \
+ asm(op "q %1,"__percpu_seg"%0" \
+ : "+m" (*var__) \
: "ri" ((T__)val)); \
break; \
default: __bad_percpu_size(); \
@@ -138,21 +119,27 @@ do { \
#define percpu_from_op(op, var) \
({ \
typeof(var) ret__; \
+ typeof(var) *var__ = __percpu_seg_off(&var); \
switch (sizeof(var)) { \
case 1: \
asm(op "b "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 2: \
asm(op "w "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
break; \
case 4: \
asm(op "l "__percpu_seg"%1,%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : "m" (*var__)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (*var__)); \
break; \
default: __bad_percpu_size(); \
} \
@@ -165,7 +152,6 @@ do { \
#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP

--
1.6.1.rc1
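
To make the addressing scheme concrete: hand-expanding x86_read_percpu()
for a 4-byte variable on x86-64, through the percpu_from_op() and
__percpu_seg_off() definitions in the patch, gives roughly this sketch
(the macro name here is only for illustration):

#define x86_read_percpu_expanded(var)                           \
({                                                              \
        typeof(per_cpu__##var) ret__;                           \
        typeof(per_cpu__##var) *var__ =                         \
                RELOC_HIDE(&per_cpu__##var,                     \
                           -(unsigned long)&per_cpu__pda);      \
        asm("movl %%gs:%1,%0" : "=r" (ret__) : "m" (*var__));   \
        ret__;                                                  \
})

The "m" operand's address is per_cpu__var - per_cpu__pda. The assembler
cannot fold that difference into a single relocation, so gcc emits the
movq/subq pair shown in the changelog; the offset it computes is the
same on every cpu, and only the final %gs-relative load selects the
current cpu's copy.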

2008-12-23 17:15:48

by Brian Gerst

Subject: [PATCH 3/3] x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit.

A simple example of what can now be done to consolidate with the 32-bit
code.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/include/asm/smp.h | 4 +---
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 1 -
arch/x86/kernel/process_32.c | 3 ---
arch/x86/kernel/setup_percpu.c | 4 ++++
arch/x86/kernel/smpcommon.c | 1 -
7 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index b008127..3ace2b9 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -15,7 +15,7 @@ struct x8664_pda {
unsigned long kernelstack; /* 16 top of kernel stack for current */
unsigned long oldrsp; /* 24 user rsp for system call */
int irqcount; /* 32 Irq nesting counter. Starts -1 */
- unsigned int cpunumber; /* 36 Logical CPU number */
+ unsigned int unused1; /* 36 was cpunumber */
#ifdef CONFIG_CC_STACKPROTECTOR
unsigned long stack_canary; /* 40 stack canary value */
/* gcc-ABI: this canary MUST be at
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811c..f2e4b9c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -34,9 +34,7 @@ extern cpumask_t cpu_initialized;
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(int, cpu_number);
-#endif

DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
@@ -169,7 +167,7 @@ extern unsigned disabled_cpus __cpuinitdata;
extern int safe_smp_processor_id(void);

#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))

#define stack_smp_processor_id() \
({ \
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d..b14bb4e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -51,7 +51,6 @@ int main(void)
ENTRY(oldrsp);
ENTRY(pcurrent);
ENTRY(irqcount);
- ENTRY(cpunumber);
ENTRY(irqstackptr);
ENTRY(data_offset);
BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d741ef..82ad755 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -876,7 +876,6 @@ void __cpuinit pda_init(int cpu)
wrmsrl(MSR_GS_BASE, pda);
mb();

- pda->cpunumber = cpu;
pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302f..4bb0a7d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -65,9 +65,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fb0ccdc..f013781 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -15,6 +15,9 @@
#include <asm/apicdef.h>
#include <asm/highmem.h>

+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int num_processors;
unsigned disabled_cpus __cpuinitdata;
@@ -133,6 +136,7 @@ void __init setup_per_cpu_areas(void)
memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
#endif
+ per_cpu(cpu_number, cpu) = cpu;
}

#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 397e309..db7c5a6 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -25,6 +25,5 @@ __cpuinit void init_gdt(int cpu)
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);

per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
- per_cpu(cpu_number, cpu) = cpu;
}
#endif
--
1.6.1.rc1

2008-12-27 10:41:52

by Ingo Molnar

Subject: Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.


(Cc:-ed a few more people who might be interested in this)

* Brian Gerst <[email protected]> wrote:

> This patch makes the PDA a normal per-cpu variable, allowing the
> removal of the special allocator code. %gs still points to the
> base of the PDA.
>
> Tested on a dual-core AMD64 system.
>
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> arch/x86/include/asm/pda.h | 3 --
> arch/x86/include/asm/percpu.h | 3 --
> arch/x86/include/asm/setup.h | 1 -
> arch/x86/kernel/cpu/common.c | 6 ++--
> arch/x86/kernel/dumpstack_64.c | 8 ++--
> arch/x86/kernel/head64.c | 23 +------------
> arch/x86/kernel/irq.c | 2 +-
> arch/x86/kernel/nmi.c | 2 +-
> arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
> arch/x86/kernel/smpboot.c | 58 +--------------------------------
> arch/x86/xen/enlighten.c | 2 +-
> arch/x86/xen/smp.c | 12 +------
> 12 files changed, 27 insertions(+), 163 deletions(-)

the simplification factor is significant. I'm wondering, have you measured
the code size impact of this on, say, the defconfig x86 kernel? That will
generally tell us how much the compiler's optimizations suffer under
this scheme.

Ingo

2008-12-27 11:03:58

by Jeremy Fitzhardinge

Subject: Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.

Brian Gerst wrote:
> Merge the 32-bit and 64-bit versions of these functions. Unlike 32-bit,
> the segment base is the current cpu's PDA instead of the offset from the
> original per-cpu area. This is because GCC hardcodes the stackprotector
> canary at %gs:40. Since the assembler is incapable of relocating against
> multiple symbols, the code ends up looking like:
>
> movq $per_cpu__var, reg
> subq $per_cpu__pda, reg
> movq %gs:(reg), reg
>
> This is still atomic since the offset is a constant (just calculated at
> runtime) and not dependent on the cpu number.
>

Yeah, it's a real pity we can't convince the linker to do this simple
computation as a single %gs:ADDR addressing mode. On the other hand, if
the compiler can reuse the computation of %reg 2-3 times, then the
generated code could well end up being denser.

J

2008-12-27 11:09:38

by Ingo Molnar

Subject: Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.


* Jeremy Fitzhardinge <[email protected]> wrote:

> Brian Gerst wrote:
>> Merge the 32-bit and 64-bit versions of these functions. Unlike 32-bit,
>> the segment base is the current cpu's PDA instead of the offset from the
>> original per-cpu area. This is because GCC hardcodes the stackprotector
>> canary at %gs:40. Since the assembler is incapable of relocating against
>> multiple symbols, the code ends up looking like:
>>
>> movq $per_cpu__var, reg
>> subq $per_cpu__pda, reg
>> movq %gs:(reg), reg
>>
>> This is still atomic since the offset is a constant (just calculated at
>> runtime) and not dependent on the cpu number.
>>
>
> Yeah, it's a real pity we can't convince the linker to do this simple
> computation as a single %gs:ADDR addressing mode. On the other hand, if
> the compiler can reuse the computation of %reg 2-3 times, then the
> generated code could well end up being denser.

There's a nice project for linker hackers?

I'd like to see some kernel image size measurements done on x86 defconfig
to see how much real impact this has on code density. Unless the impact is
horribly unacceptable, removing ~200 lines of weird x86-specific APIs is a
definitive plus.

Ingo

2008-12-27 11:21:05

by Jeremy Fitzhardinge

Subject: Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.

Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>> Brian Gerst wrote:
>>
>>> Merge the 32-bit and 64-bit versions of these functions. Unlike 32-bit,
>>> the segment base is the current cpu's PDA instead of the offset from the
>>> original per-cpu area. This is because GCC hardcodes the stackprotector
>>> canary at %gs:40. Since the assembler is incapable of relocating against
>>> multiple symbols, the code ends up looking like:
>>>
>>> movq $per_cpu__var, reg
>>> subq $per_cpu__pda, reg
>>> movq %gs:(reg), reg
>>>
>>> This is still atomic since the offset is a constant (just calculated at
>>> runtime) and not dependent on the cpu number.
>>>
>>>
>> Yeah, it's a real pity we can't convince the linker to do this simple
>> computation as a single %gs:ADDR addressing mode. On the other hand, if
>> the compiler can reuse the computation of %reg 2-3 times, then the
>> generated code could well end up being denser.
>>
>
> There's a nice project for linker hackers?
>
> I'd like to see some kernel image size measurements done on x86 defconfig
> to see how much real impact this has on code density. Unless the impact is
> horribly unacceptable, removing ~200 lines of weird x86-specific APIs is a
> definitive plus.

Yep, I'm all for it. I don't think there'll be much of a size impact at
all.

J

2008-12-27 15:30:51

by Brian Gerst

Subject: Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.

On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <[email protected]> wrote:
>
> (Cc:-ed a few more people who might be interested in this)
>
> * Brian Gerst <[email protected]> wrote:
>
>> This patch makes the PDA a normal per-cpu variable, allowing the
>> removal of the special allocator code. %gs still points to the
>> base of the PDA.
>>
>> Tested on a dual-core AMD64 system.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>> ---
>> arch/x86/include/asm/pda.h | 3 --
>> arch/x86/include/asm/percpu.h | 3 --
>> arch/x86/include/asm/setup.h | 1 -
>> arch/x86/kernel/cpu/common.c | 6 ++--
>> arch/x86/kernel/dumpstack_64.c | 8 ++--
>> arch/x86/kernel/head64.c | 23 +------------
>> arch/x86/kernel/irq.c | 2 +-
>> arch/x86/kernel/nmi.c | 2 +-
>> arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
>> arch/x86/kernel/smpboot.c | 58 +--------------------------------
>> arch/x86/xen/enlighten.c | 2 +-
>> arch/x86/xen/smp.c | 12 +------
>> 12 files changed, 27 insertions(+), 163 deletions(-)
>
> the simplification factor is significant. I'm wondering, have you measured
> the code size impact of this on, say, the defconfig x86 kernel? That will
> generally tell us how much the compiler's optimizations suffer under
> this scheme.
>
> Ingo
>

Patch #1 by itself doesn't change how the PDA is accessed, only how it
is allocated. The text size goes down significantly with patch #1,
but data goes up. Changing the PDA to cacheline-aligned (1a) brings
it back in line.

text data bss dec hex filename
7033648 1754476 758508 9546632 91ab88 vmlinux.0 (vanilla 2.6.28)
7029563 1758428 758508 9546499 91ab03 vmlinux.1 (with patch #1)
7029563 1754460 758508 9542531 919b83 vmlinux.1a (with patch #1 cache align)
7036694 1758428 758508 9553630 91c6de vmlinux.3 (with all three patches)

I think the first patch (with the alignment fix) is a clear win. As
for the other patches, they add about 8 bytes per use of a PDA
variable. cpu_number is used 903 times in this compile, so this is
likely the most extreme example. I have an idea to optimize this
particular case further that I'd like to look at, which would lessen
the impact.
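
(To see where the ~8 bytes come from, compare the code for reading the
cpu number; byte counts are approximate and encoding-dependent:

movl %gs:36, %eax                # old: read_pda(cpunumber), ~8 bytes

movq $per_cpu__cpu_number, %rax  # new: ~7 bytes
subq $per_cpu__pda, %rax         # ~6 bytes
movl %gs:(%rax), %eax            # ~3 bytes

roughly 16 bytes instead of 8 per reference.)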

--
Brian Gerst

2008-12-27 15:54:19

by Ingo Molnar

Subject: Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.


* Brian Gerst <[email protected]> wrote:

> On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <[email protected]> wrote:
> >
> > (Cc:-ed a few more people who might be interested in this)
> >
> > * Brian Gerst <[email protected]> wrote:
> >
> >> This patch makes the PDA a normal per-cpu variable, allowing the
> >> removal of the special allocator code. %gs still points to the
> >> base of the PDA.
> >>
> >> Tested on a dual-core AMD64 system.
> >>
> >> Signed-off-by: Brian Gerst <[email protected]>
> >> ---
> >> arch/x86/include/asm/pda.h | 3 --
> >> arch/x86/include/asm/percpu.h | 3 --
> >> arch/x86/include/asm/setup.h | 1 -
> >> arch/x86/kernel/cpu/common.c | 6 ++--
> >> arch/x86/kernel/dumpstack_64.c | 8 ++--
> >> arch/x86/kernel/head64.c | 23 +------------
> >> arch/x86/kernel/irq.c | 2 +-
> >> arch/x86/kernel/nmi.c | 2 +-
> >> arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
> >> arch/x86/kernel/smpboot.c | 58 +--------------------------------
> >> arch/x86/xen/enlighten.c | 2 +-
> >> arch/x86/xen/smp.c | 12 +------
> >> 12 files changed, 27 insertions(+), 163 deletions(-)
> >
> > the simplification factor is significant. I'm wondering, have you measured
> > the code size impact of this on say the defconfig x86 kernel? That will
> > generally tell us how much worse optimizations the compiler does under
> > this scheme.
> >
> > Ingo
> >
>
> Patch #1 by itself doesn't change how the PDA is accessed, only how it
> is allocated. The text size goes down significantly with patch #1,
> but data goes up. Changing the PDA to cacheline-aligned (1a) brings
> it back in line.
>
> text data bss dec hex filename
> 7033648 1754476 758508 9546632 91ab88 vmlinux.0 (vanilla 2.6.28)
> 7029563 1758428 758508 9546499 91ab03 vmlinux.1 (with patch #1)
> 7029563 1754460 758508 9542531 919b83 vmlinux.1a (with patch #1 cache align)
> 7036694 1758428 758508 9553630 91c6de vmlinux.3 (with all three patches)
>
> I think the first patch (with the alignment fix) is a clear win. As for
> the other patches, they add about 8 bytes per use of a PDA variable.
> cpu_number is used 903 times in this compile, so this is likely the most
> extreme example. I have an idea to optimize this particular case
> further that I'd like to look at, which would lessen the impact.

curious, what idea is that?

Ingo

2008-12-27 17:16:40

by Brian Gerst

Subject: Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.

On Sat, Dec 27, 2008 at 10:53 AM, Ingo Molnar <[email protected]> wrote:
>
> * Brian Gerst <[email protected]> wrote:
>
>> On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <[email protected]> wrote:
>> >
>> > (Cc:-ed a few more people who might be interested in this)
>> >
>> > * Brian Gerst <[email protected]> wrote:
>> >
>> >> This patch makes the PDA a normal per-cpu variable, allowing the
>> >> removal of the special allocator code. %gs still points to the
>> >> base of the PDA.
>> >>
>> >> Tested on a dual-core AMD64 system.
>> >>
>> >> Signed-off-by: Brian Gerst <[email protected]>
>> >> ---
>> >> arch/x86/include/asm/pda.h | 3 --
>> >> arch/x86/include/asm/percpu.h | 3 --
>> >> arch/x86/include/asm/setup.h | 1 -
>> >> arch/x86/kernel/cpu/common.c | 6 ++--
>> >> arch/x86/kernel/dumpstack_64.c | 8 ++--
>> >> arch/x86/kernel/head64.c | 23 +------------
>> >> arch/x86/kernel/irq.c | 2 +-
>> >> arch/x86/kernel/nmi.c | 2 +-
>> >> arch/x86/kernel/setup_percpu.c | 70 ++++++++--------------------------------
>> >> arch/x86/kernel/smpboot.c | 58 +--------------------------------
>> >> arch/x86/xen/enlighten.c | 2 +-
>> >> arch/x86/xen/smp.c | 12 +------
>> >> 12 files changed, 27 insertions(+), 163 deletions(-)
>> >
>> > the simplification factor is significant. I'm wondering, have you measured
>> > the code size impact of this on, say, the defconfig x86 kernel? That will
>> > generally tell us how much the compiler's optimizations suffer under
>> > this scheme.
>> >
>> > Ingo
>> >
>>
>> Patch #1 by itself doesn't change how the PDA is accessed, only how it
>> is allocated. The text size goes down significantly with patch #1,
>> but data goes up. Changing the PDA to cacheline-aligned (1a) brings
>> it back in line.
>>
>> text data bss dec hex filename
>> 7033648 1754476 758508 9546632 91ab88 vmlinux.0 (vanilla 2.6.28)
>> 7029563 1758428 758508 9546499 91ab03 vmlinux.1 (with patch #1)
>> 7029563 1754460 758508 9542531 919b83 vmlinux.1a (with patch #1 cache align)
>> 7036694 1758428 758508 9553630 91c6de vmlinux.3 (with all three patches)
>>
>> I think the first patch (with the alignment fix) is a clear win. As for
>> the other patches, they add about 8 bytes per use of a PDA variable.
>> cpu_number is used 903 times in this compile, so this is likely the most
>> extreme example. I have an idea to optimize this particular case
>> further that I'd like to look at, which would lessen the impact.
>
> curious, what idea is that?
>
> Ingo
>

Something like this:
+#define raw_smp_processor_id() \
+({ \
+ extern int gsoff__cpu_number; \
+ int cpu; \
+ __asm__("movl %%gs:%1, %0" : "=r" (cpu) \
+ : "m" (gsoff__cpu_number); \
+ cpu; \
+})

And add this to vmlinux_64.lds.S:
+#define GSOFF(x) gsoff__##x = per_cpu__##x - per_cpu__pda
+ GSOFF(cpu_number);

The trick is that the linker can calculate against multiple symbols,
but it must be done in the final link. The problem with this approach
is that only a limited set of symbols can be used. There isn't a
simple solution for all per-cpu variables. Some post-processing would
have to be done, similar to kallsyms.
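
Concretely, the linker script assignment makes gsoff__cpu_number an
absolute symbol in the final link, so each use collapses from the
three-instruction sequence to a single load:

movl %gs:gsoff__cpu_number, %eax

where gsoff__cpu_number resolves to a small constant (the distance of
cpu_number from the PDA base), assuming gcc emits an absolute rather
than %rip-relative address for the extern.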

Looking some more at the usage statistics of the PDA members, there
are four heavy hitters:
pda->pcurrent (2719)
pda->kernelstack (1055)
pda->cpunumber (933)
pda->data_offset (327)

The rest of the PDA members have an insignificant number of accesses.
I think for now I'll avoid converting the above four fields until an
optimal solution can be agreed on, but the others (primarily the TLB
and irqstat fields) can be converted without bloating the kernel code
a lot.

--
Brian Gerst