Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/hardirq_64.h | 24 +++++++++++++++++++-----
arch/x86/include/asm/pda.h | 10 ----------
arch/x86/kernel/irq.c | 6 +-----
arch/x86/kernel/irq_64.c | 3 +++
arch/x86/kernel/nmi.c | 10 +---------
arch/x86/xen/smp.c | 18 +++---------------
6 files changed, 27 insertions(+), 44 deletions(-)
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index b5a6b5d..a65bab2 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -3,22 +3,36 @@
#include <linux/threads.h>
#include <linux/irq.h>
-#include <asm/pda.h>
#include <asm/apic.h>
+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq0_irqs;
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+ unsigned int irq_thermal_count;
+ unsigned int irq_spurious_count;
+ unsigned int irq_threshold_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
#define MAX_HARDIRQS_PER_CPU NR_VECTORS
#define __ARCH_IRQ_STAT 1
-#define inc_irq_stat(member) add_pda(member, 1)
+#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
-#define local_softirq_pending() read_pda(__softirq_pending)
+#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
#define __ARCH_SET_SOFTIRQ_PENDING 1
-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
extern void ack_bad_irq(unsigned int irq);
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 47f274f..69a4075 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -25,19 +25,9 @@ struct x8664_pda {
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
short in_bootmem; /* pda lives in bootmem */
- unsigned int __softirq_pending;
- unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
short isidle;
struct mm_struct *active_mm;
- unsigned apic_timer_irqs;
- unsigned irq0_irqs;
- unsigned irq_resched_count;
- unsigned irq_call_count;
- unsigned irq_tlb_count;
- unsigned irq_thermal_count;
- unsigned irq_threshold_count;
- unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;
DECLARE_PER_CPU(struct x8664_pda, __pda);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2d..8b30d0c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) cpu_pda(x)
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 0b21cb1..1db0524 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -19,6 +19,9 @@
#include <asm/io_apic.h>
#include <asm/idle.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
/*
* Probabilistic stack overflow check:
*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 7228979..23b6d9e 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -61,11 +61,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
-#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
-#else
- return nmi_count(cpu);
-#endif
+ return per_cpu(irq_stat, cpu).__nmi_count;
}
static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
*/
static inline unsigned int get_timer_irqs(int cpu)
{
-#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
}
#ifdef CONFIG_SMP
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3bfd6dd..9ff3b09 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
return IRQ_HANDLED;
}
@@ -435,11 +431,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
@@ -449,11 +441,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
--
1.6.1.rc1
Move the irqstackptr variable from the PDA to per-cpu.
Make the stacks themselves per-cpu, removing some specific allocation code.
Add a separate flag (is_boot_cpu) to simplify the per-cpu boot adjustments.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 1 -
arch/x86/include/asm/processor.h | 3 +++
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 14 ++------------
arch/x86/kernel/dumpstack_64.c | 6 +++---
arch/x86/kernel/entry_64.S | 6 +++---
arch/x86/kernel/head_64.S | 16 +++++-----------
arch/x86/kernel/setup_percpu.c | 3 ++-
arch/x86/kernel/smpboot.c | 3 +++
9 files changed, 21 insertions(+), 32 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 8ee835e..09965f7 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -22,7 +22,6 @@ struct x8664_pda {
/* gcc-ABI: this canary MUST be at
offset 40!!! */
#endif
- char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
short in_bootmem; /* pda lives in bootmem */
short isidle;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 091cd88..e32ee80 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -378,6 +378,9 @@ union thread_xstate {
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(struct orig_ist, orig_ist);
+
+DECLARE_PER_CPU(char, irqstack[IRQSTACKSIZE]);
+DECLARE_PER_CPU(char *, irqstackptr);
#endif
extern void print_cpu_info(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f4cc81b..5b821fb 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -54,7 +54,6 @@ int main(void)
ENTRY(pcurrent);
ENTRY(irqcount);
ENTRY(cpunumber);
- ENTRY(irqstackptr);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
#undef ENTRY
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3d0cc6f..5ddbca0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -881,7 +881,8 @@ __setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_PAGE_ALIGNED(char, irqstack[IRQSTACKSIZE]) __aligned(PAGE_SIZE);
+DEFINE_PER_CPU(char *, irqstackptr) = per_cpu_var(irqstack) + IRQSTACKSIZE - 64;
void __cpuinit pda_init(int cpu)
{
@@ -901,18 +902,7 @@ void __cpuinit pda_init(int cpu)
if (cpu == 0) {
/* others are initialized in smpboot.c */
pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
} else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
-
if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
pda->nodenumber = cpu_to_node(cpu);
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d07..46c6bf0 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)per_cpu(irqstackptr, cpu);
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
@@ -200,9 +200,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
int i;
const int cpu = smp_processor_id();
unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ (unsigned long *) (per_cpu(irqstackptr, cpu));
unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ (unsigned long *) (per_cpu(irqstackptr, cpu) - IRQSTACKSIZE);
/*
* debugging aid: "show_stack(NULL, NULL);" prints the
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4833f3a..5cd892f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -345,7 +345,7 @@ ENTRY(save_args)
1: incl %gs:pda_irqcount
jne 2f
popq_cfi %rax /* move return address... */
- mov %gs:pda_irqstackptr,%rsp
+ mov PER_CPU_VAR(irqstackptr),%rsp
EMPTY_FRAME 0
pushq_cfi %rax /* ... to the new stack */
/*
@@ -1261,7 +1261,7 @@ ENTRY(call_softirq)
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ cmove PER_CPU_VAR(irqstackptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
@@ -1300,7 +1300,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ cmovzq PER_CPU_VAR(irqstackptr),%rsp
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c8ace88..b565719 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -211,15 +211,13 @@ ENTRY(secondary_startup_64)
* data area. Computing this requires two symbols - __per_cpu_load
* and per_cpu__gdt_page. As linker can't do no such relocation, do
* it by hand. As early_gdt_descr is manipulated by C code for
- * secondary CPUs, this should be done only once for the boot CPU
- * when early_gdt_descr_base contains zero.
+ * secondary CPUs, this should be done only once for the boot CPU.
*/
- movq early_gdt_descr_base(%rip), %rax
- testq %rax, %rax
- jnz 1f
+ cmpb $0, is_boot_cpu(%rip)
+ je 1f
movq $__per_cpu_load, %rax
- addq $per_cpu__gdt_page, %rax
- movq %rax, early_gdt_descr_base(%rip)
+ addq %rax, early_gdt_descr_base(%rip)
+ addq %rax, per_cpu__irqstackptr(%rax)
1:
#endif
/*
@@ -431,12 +429,8 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
-#ifdef CONFIG_SMP
early_gdt_descr_base:
- .quad 0x0000000000000000
-#else
.quad per_cpu__gdt_page
-#endif
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index b5c35af..e3d399f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -192,7 +192,9 @@ void __init setup_per_cpu_areas(void)
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
#ifdef CONFIG_X86_64
+ per_cpu(irqstackptr, cpu) = per_cpu(irqstack, cpu) + IRQSTACKSIZE - 64;
/*
* CPU0 modified pda in the init data area, reload pda
* offset for CPU0 and clear the area for others.
@@ -202,7 +204,6 @@ void __init setup_per_cpu_areas(void)
else
memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
#endif
- per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2f0e0f1..a33da98 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,6 +73,8 @@ u8 apicid_2_node[MAX_APICID];
static int low_mappings;
#endif
+unsigned char is_boot_cpu = 1;
+
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
@@ -800,6 +802,7 @@ do_rest:
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
+ is_boot_cpu = 0;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void *) c_idle.idle->thread.sp;
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/include/asm/smp.h | 4 +---
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 1 -
arch/x86/kernel/process_32.c | 3 ---
arch/x86/kernel/setup_percpu.c | 4 ++++
arch/x86/kernel/smpcommon.c | 2 --
7 files changed, 6 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 09965f7..668d5a5 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -16,7 +16,7 @@ struct x8664_pda {
unsigned long kernelstack; /* 16 top of kernel stack for current */
unsigned long oldrsp; /* 24 user rsp for system call */
int irqcount; /* 32 Irq nesting counter. Starts -1 */
- unsigned int cpunumber; /* 36 Logical CPU number */
+ unsigned int unused6; /* 36 was cpunumber */
#ifdef CONFIG_CC_STACKPROTECTOR
unsigned long stack_canary; /* 40 stack canary value */
/* gcc-ABI: this canary MUST be at
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index c7bbbbe..68636e7 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -25,9 +25,7 @@ extern unsigned int num_processors;
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(int, cpu_number);
-#endif
static inline struct cpumask *cpu_sibling_mask(int cpu)
{
@@ -164,7 +162,7 @@ extern unsigned disabled_cpus __cpuinitdata;
extern int safe_smp_processor_id(void);
#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define stack_smp_processor_id() \
({ \
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 5b821fb..cae6697 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -53,7 +53,6 @@ int main(void)
ENTRY(oldrsp);
ENTRY(pcurrent);
ENTRY(irqcount);
- ENTRY(cpunumber);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
#undef ENTRY
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 70ce998..1cf466f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __cpuinit pda_init(int cpu)
load_pda_offset(cpu);
- pda->cpunumber = cpu;
pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 77d5468..2c00a57 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e3d399f..bd38127 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
#endif
EXPORT_SYMBOL(__per_cpu_offset);
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
/*
* Great future plan:
* Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -193,6 +196,7 @@ void __init setup_per_cpu_areas(void)
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+ per_cpu(cpu_number, cpu) = cpu;
#ifdef CONFIG_X86_64
per_cpu(irqstackptr, cpu) = per_cpu(irqstack, cpu) + IRQSTACKSIZE - 64;
/*
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 7e15781..add36b4 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -28,7 +28,5 @@ __cpuinit void init_gdt(int cpu)
write_gdt_entry(get_cpu_gdt_table(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
-
- per_cpu(cpu_number, cpu) = cpu;
}
#endif
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/current.h | 24 +++---------------------
arch/x86/include/asm/pda.h | 4 ++--
arch/x86/include/asm/system.h | 4 ++--
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 5 +----
arch/x86/kernel/dumpstack_64.c | 2 +-
arch/x86/kernel/process_64.c | 5 ++++-
arch/x86/kernel/smpboot.c | 3 +--
arch/x86/xen/smp.c | 3 +--
9 files changed, 15 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0728480..c68c361 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,39 +1,21 @@
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
-#ifdef CONFIG_X86_32
#include <linux/compiler.h>
#include <asm/percpu.h>
+#ifndef __ASSEMBLY__
struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
-static __always_inline struct task_struct *get_current(void)
-{
- return percpu_read(current_task);
-}
-
-#else /* X86_32 */
-
-#ifndef __ASSEMBLY__
-#include <asm/pda.h>
-
-struct task_struct;
static __always_inline struct task_struct *get_current(void)
{
- return read_pda(pcurrent);
+ return percpu_read(current_task);
}
-#else /* __ASSEMBLY__ */
-
-#include <asm/asm-offsets.h>
-#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
+#define current get_current()
#endif /* __ASSEMBLY__ */
-#endif /* X86_32 */
-
-#define current get_current()
-
#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 668d5a5..7209302 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -11,8 +11,8 @@
/* Per processor datastructure. %gs points to it while the kernel runs */
struct x8664_pda {
- struct task_struct *pcurrent; /* 0 Current process */
- unsigned long dummy;
+ unsigned long unused1;
+ unsigned long unused2;
unsigned long kernelstack; /* 16 top of kernel stack for current */
unsigned long oldrsp; /* 24 user rsp for system call */
int irqcount; /* 32 Irq nesting counter. Starts -1 */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 8e626ea..4399aac 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -94,7 +94,7 @@ do { \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq "__percpu_seg_str"%P[current_task],%%rsi\n\t" \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
@@ -106,7 +106,7 @@ do { \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
[tif_fork] "i" (TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ [current_task] "m" (per_cpu_var(current_task)) \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index cae6697..4f7a210 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -51,7 +51,6 @@ int main(void)
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
ENTRY(kernelstack);
ENTRY(oldrsp);
- ENTRY(pcurrent);
ENTRY(irqcount);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1cf466f..fbc8468 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -898,10 +898,7 @@ void __cpuinit pda_init(int cpu)
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- } else {
+ if (cpu != 0) {
if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
pda->nodenumber = cpu_to_node(cpu);
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 46c6bf0..0c01ad2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -241,7 +241,7 @@ void show_registers(struct pt_regs *regs)
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = current;
sp = regs->sp;
printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 416fb92..e00c31a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -57,6 +57,9 @@
asmlinkage extern void ret_from_fork(void);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -615,7 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ percpu_write(current_task, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a33da98..ecc2fc5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -792,13 +792,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
-#ifdef CONFIG_X86_32
per_cpu(current_task, cpu) = c_idle.idle;
+#ifdef CONFIG_X86_32
init_gdt(cpu);
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 9ff3b09..72c2eb9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -279,12 +279,11 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;
+ per_cpu(current_task, cpu) = idle;
#ifdef CONFIG_X86_32
init_gdt(cpu);
- per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
--
1.6.1.rc1
Also clean up PER_CPU_VAR usage in xen-asm_64.S
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/ia32/ia32entry.S | 8 ++++----
arch/x86/include/asm/pda.h | 4 +---
arch/x86/include/asm/thread_info.h | 11 +++++++----
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 6 ++++--
arch/x86/kernel/entry_64.S | 4 ++--
arch/x86/kernel/process_64.c | 4 ++--
arch/x86/kernel/smpboot.c | 2 ++
arch/x86/xen/xen-asm_64.S | 18 +++++++++---------
9 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b..16e1524 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
- movq %gs:pda_kernelstack, %rsp
- addq $(PDA_STACKOFFSET),%rsp
+ movq PER_CPU_VAR(kernelstack), %rsp
+ addq $(KERNELSTACK_OFFSET),%rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,PDA_STACKOFFSET
+ CFI_DEF_CFA rsp,KERNELSTACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
movl %esp,%r8d
CFI_REGISTER rsp,r8
- movq %gs:pda_kernelstack,%rsp
+ movq PER_CPU_VAR(kernelstack),%rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 7209302..4d28ffb 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -13,7 +13,7 @@
struct x8664_pda {
unsigned long unused1;
unsigned long unused2;
- unsigned long kernelstack; /* 16 top of kernel stack for current */
+ unsigned long unused3;
unsigned long oldrsp; /* 24 user rsp for system call */
int irqcount; /* 32 Irq nesting counter. Starts -1 */
unsigned int unused6; /* 36 was cpunumber */
@@ -44,6 +44,4 @@ extern void pda_init(int);
#endif
-#define PDA_STACKOFFSET (5*8)
-
#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 9878964..8f33a3d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -194,17 +194,20 @@ static inline struct thread_info *current_thread_info(void)
#else /* X86_32 */
-#include <asm/pda.h>
+#include <asm/percpu.h>
+#define KERNELSTACK_OFFSET (5*8)
/*
* macros/functions for gaining access to the thread information structure
* preempt_count needs to be 1 initially, until the scheduler is functional.
*/
#ifndef __ASSEMBLY__
+DECLARE_PER_CPU(unsigned long, kernelstack);
+
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
- ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
+ ti = (void *)(percpu_read(kernelstack) + KERNELSTACK_OFFSET - THREAD_SIZE);
return ti;
}
@@ -220,8 +223,8 @@ static inline struct thread_info *stack_thread_info(void)
/* how to get the thread information struct from ASM */
#define GET_THREAD_INFO(reg) \
- movq %gs:pda_kernelstack,reg ; \
- subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+ movq PER_CPU_VAR(kernelstack),reg ; \
+ subq $(THREAD_SIZE-KERNELSTACK_OFFSET),reg
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4f7a210..cafff5f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -49,7 +49,6 @@ int main(void)
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- ENTRY(kernelstack);
ENTRY(oldrsp);
ENTRY(irqcount);
DEFINE(pda_size, sizeof(struct x8664_pda));
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fbc8468..f653860 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -884,6 +884,10 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
DEFINE_PER_CPU_PAGE_ALIGNED(char, irqstack[IRQSTACKSIZE]) __aligned(PAGE_SIZE);
DEFINE_PER_CPU(char *, irqstackptr) = per_cpu_var(irqstack) + IRQSTACKSIZE - 64;
+DEFINE_PER_CPU(unsigned long, kernelstack) = (unsigned long)&init_thread_union -
+ KERNELSTACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernelstack);
+
void __cpuinit pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);
@@ -895,8 +899,6 @@ void __cpuinit pda_init(int cpu)
load_pda_offset(cpu);
pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
if (cpu != 0) {
if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5cd892f..8f35796 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -468,7 +468,7 @@ END(ret_from_fork)
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,PDA_STACKOFFSET
+ CFI_DEF_CFA rsp,KERNELSTACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
@@ -480,7 +480,7 @@ ENTRY(system_call)
ENTRY(system_call_after_swapgs)
movq %rsp,%gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ movq PER_CPU_VAR(kernelstack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e00c31a..2f5bb4d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -620,9 +620,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
write_pda(oldrsp, next->usersp);
percpu_write(current_task, next_p);
- write_pda(kernelstack,
+ percpu_write(kernelstack,
(unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - PDA_STACKOFFSET);
+ THREAD_SIZE - KERNELSTACK_OFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ecc2fc5..8bf1a43 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -800,6 +800,8 @@ do_rest:
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
+ per_cpu(kernelstack, cpu) = (unsigned long)task_stack_page(c_idle.idle) -
+ KERNELSTACK_OFFSET + THREAD_SIZE;
#endif
is_boot_cpu = 0;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c5..eac2794 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -17,6 +17,7 @@
#include <asm/processor-flags.h>
#include <asm/errno.h>
#include <asm/segment.h>
+#include <asm/percpu.h>
#include <xen/interface/xen.h>
@@ -33,7 +34,6 @@
never gets used
*/
#define BUG ud2a
-#define PER_CPU_VAR(var, off) 0xdeadbeef
#endif
/*
@@ -45,14 +45,14 @@ ENTRY(xen_irq_enable_direct)
BUG
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
@@ -69,7 +69,7 @@ ENDPATCH(xen_irq_enable_direct)
ENTRY(xen_irq_disable_direct)
BUG
- movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
@@ -87,7 +87,7 @@ ENDPATCH(xen_irq_disable_direct)
ENTRY(xen_save_fl_direct)
BUG
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
@@ -107,13 +107,13 @@ ENTRY(xen_restore_fl_direct)
BUG
testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
@@ -196,7 +196,7 @@ ENTRY(xen_sysret64)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ movq PER_CPU_VAR(kernelstack),%rsp
pushq $__USER_DS
pushq %gs:pda_oldrsp
@@ -213,7 +213,7 @@ ENTRY(xen_sysret32)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack, %rsp
+ movq PER_CPU_VAR(kernelstack), %rsp
pushq $__USER32_DS
pushq %gs:pda_oldrsp
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 1 -
arch/x86/include/asm/topology.h | 3 ++-
arch/x86/kernel/cpu/common.c | 11 ++++++-----
arch/x86/kernel/setup_percpu.c | 4 +++-
4 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 4527d70..b30ef6b 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -22,7 +22,6 @@ struct x8664_pda {
/* gcc-ABI: this canary MUST be at
offset 40!!! */
#endif
- short nodenumber; /* number of current node (32k max) */
short in_bootmem; /* pda lives in bootmem */
short isidle;
} ____cacheline_aligned_in_smp;
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 87ca3fd..90a8a15 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -83,7 +83,8 @@ extern cpumask_t *node_to_cpumask_map;
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
/* Returns the number of the current Node. */
-#define numa_node_id() read_pda(nodenumber)
+DECLARE_PER_CPU(int, nodenumber);
+#define numa_node_id() percpu_read(nodenumber)
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern int cpu_to_node(int cpu);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 62f9665..ccc6f8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -899,11 +899,6 @@ void __cpuinit pda_init(int cpu)
loadsegment(gs, 0);
load_pda_offset(cpu);
-
- if (cpu != 0) {
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
}
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
@@ -973,6 +968,12 @@ void __cpuinit cpu_init(void)
if (cpu != 0)
pda_init(cpu);
+#ifdef CONFIG_NUMA
+ if (cpu != 0 && percpu_read(nodenumber) == 0 &&
+ cpu_to_node(cpu) != NUMA_NO_NODE)
+ percpu_write(nodenumber, cpu_to_node(cpu));
+#endif
+
me = current;
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index bd38127..0ddb184 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -44,6 +44,8 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
#define X86_64_NUMA 1 /* (used later) */
+DEFINE_PER_CPU(int, nodenumber) = 0;
+EXPORT_PER_CPU_SYMBOL(nodenumber);
/*
* Map cpu index to node index
@@ -276,7 +278,7 @@ void __cpuinit numa_set_node(int cpu, int node)
per_cpu(x86_cpu_to_node_map, cpu) = node;
if (node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ per_cpu(nodenumber, cpu) = node;
}
void __cpuinit numa_clear_node(int cpu)
--
1.6.1.rc1
Accessing memory through %gs should not use rip-relative addressing.
Adding a P prefix for the argument tells gcc to not add (%rip) to
the memory references.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/percpu.h | 26 +++++++++++++-------------
arch/x86/include/asm/system.h | 2 +-
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 03aa4b0..165d527 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -39,10 +39,10 @@
#include <linux/stringify.h>
#ifdef CONFIG_SMP
-#define __percpu_seg_str "%%"__stringify(__percpu_seg)":"
+#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
#define __my_cpu_offset percpu_read(this_cpu_off)
#else
-#define __percpu_seg_str
+#define __percpu_arg(x) "%" #x
#endif
/* For arch-specific code, we can use direct single-insn ops (they
@@ -58,22 +58,22 @@ do { \
} \
switch (sizeof(var)) { \
case 1: \
- asm(op "b %1,"__percpu_seg_str"%0" \
+ asm(op "b %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 2: \
- asm(op "w %1,"__percpu_seg_str"%0" \
+ asm(op "w %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 4: \
- asm(op "l %1,"__percpu_seg_str"%0" \
+ asm(op "l %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((T__)val)); \
break; \
case 8: \
- asm(op "q %1,"__percpu_seg_str"%0" \
+ asm(op "q %1,"__percpu_arg(0) \
: "+m" (var) \
: "r" ((T__)val)); \
break; \
@@ -86,22 +86,22 @@ do { \
typeof(var) ret__; \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_seg_str"%1,%0" \
+ asm(op "b "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_seg_str"%1,%0" \
+ asm(op "w "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_seg_str"%1,%0" \
+ asm(op "l "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
case 8: \
- asm(op "q "__percpu_seg_str"%1,%0" \
+ asm(op "q "__percpu_arg(1)",%0" \
: "=r" (ret__) \
: "m" (var)); \
break; \
@@ -122,9 +122,9 @@ do { \
#define x86_test_and_clear_bit_percpu(bit, var) \
({ \
int old__; \
- asm volatile("btr %1,"__percpu_seg_str"%c2\n\tsbbl %0,%0" \
- : "=r" (old__) \
- : "dIr" (bit), "i" (&per_cpu__##var) : "memory"); \
+ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
+ : "=r" (old__), "+m" (per_cpu__##var) \
+ : "dIr" (bit)); \
old__; \
})
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 4399aac..d1dc27d 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -94,7 +94,7 @@ do { \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
- "movq "__percpu_seg_str"%P[current_task],%%rsi\n\t" \
+ "movq "__percpu_arg([current_task])",%%rsi\n\t" \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
--
1.6.1.rc1
There is only one place now where the %gs base is changed after boot.
Move the code inline to setup_per_cpu_areas().
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/percpu.h | 6 ------
arch/x86/kernel/setup_percpu.c | 20 ++++++--------------
2 files changed, 6 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 165d527..ce980db 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -133,12 +133,6 @@ do { \
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
-#ifdef CONFIG_X86_64
-extern void load_pda_offset(int cpu);
-#else
-static inline void load_pda_offset(int cpu) { }
-#endif
-
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0ddb184..916e2cf 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -69,23 +69,12 @@ static inline void setup_node_to_cpumask_map(void) { }
#endif
/*
- * Define load_pda_offset() and per-cpu __pda for x86_64.
- * load_pda_offset() is responsible for loading the offset of pda into
- * %gs.
- *
* On SMP, pda offset also duals as percpu base address and thus it
* should be at the start of per-cpu area. To achieve this, it's
* preallocated in vmlinux_64.lds.S directly instead of using
* DEFINE_PER_CPU().
*/
#ifdef CONFIG_X86_64
-void __cpuinit load_pda_offset(int cpu)
-{
- /* Memory clobbers used to order pda/percpu accesses */
- mb();
- wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
- mb();
-}
#ifndef CONFIG_SMP
DEFINE_PER_CPU(struct x8664_pda, __pda);
#endif
@@ -205,9 +194,12 @@ void __init setup_per_cpu_areas(void)
* CPU0 modified pda in the init data area, reload pda
* offset for CPU0 and clear the area for others.
*/
- if (cpu == 0)
- load_pda_offset(0);
- else
+ if (cpu == 0) {
+ /* Memory clobbers used to order pda/percpu accesses */
+ mb();
+ wrmsrl(MSR_GS_BASE, cpu_pda(0));
+ mb();
+ } else
memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
#endif
--
1.6.1.rc1
Refactor the DEFINE_PER_CPU_* macros to make adding arch-specific
per-cpu sections simpler.
Signed-off-by: Brian Gerst <[email protected]>
---
include/linux/percpu.h | 36 ++++++++++++++++++------------------
1 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f2a375..73ef5d8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -9,34 +9,34 @@
#include <asm/percpu.h>
#ifdef CONFIG_SMP
-#define DEFINE_PER_CPU(type, name) \
- __attribute__((__section__(".data.percpu"))) \
- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+#define PER_CPU_BASE_SECTION ".data.percpu"
#ifdef MODULE
-#define SHARED_ALIGNED_SECTION ".data.percpu"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
#else
-#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned"
+#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
#endif
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
- __attribute__((__section__(SHARED_ALIGNED_SECTION))) \
- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \
- ____cacheline_aligned_in_smp
+#else
+
+#define PER_CPU_BASE_SECTION ".data"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
+
+#endif
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
- __attribute__((__section__(".data.percpu.page_aligned"))) \
+#define DEFINE_PER_CPU_SECTION(type, name, section) \
+ __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
-#else
+
#define DEFINE_PER_CPU(type, name) \
- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+ DEFINE_PER_CPU_SECTION(type, name, "")
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
- DEFINE_PER_CPU(type, name)
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
- DEFINE_PER_CPU(type, name)
-#endif
+#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
--
1.6.1.rc1
Now that the PDA is empty except for the stack canary, it can be removed.
The irqstack is moved to the start of the per-cpu section. If the stack
protector is enabled, the canary overlaps the bottom 48 bytes of the irqstack
on SMP. On UP it is a seperate variable, since it is the only thing referenced
via %gs.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 5 -----
arch/x86/include/asm/processor.h | 4 ++++
arch/x86/kernel/asm-offsets_64.c | 4 ----
arch/x86/kernel/cpu/common.c | 10 +++++++++-
arch/x86/kernel/head_64.S | 6 +++---
arch/x86/kernel/process_64.c | 7 +------
arch/x86/kernel/setup_percpu.c | 22 ++++------------------
arch/x86/kernel/vmlinux_64.lds.S | 8 ++++++--
include/asm-generic/vmlinux.lds.h | 35 +----------------------------------
9 files changed, 28 insertions(+), 73 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 6ca7bc0..ba46416 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -17,11 +17,6 @@ struct x8664_pda {
unsigned long unused4;
int unused5;
unsigned int unused6; /* 36 was cpunumber */
-#ifdef CONFIG_CC_STACKPROTECTOR
- unsigned long stack_canary; /* 40 stack canary value */
- /* gcc-ABI: this canary MUST be at
- offset 40!!! */
-#endif
short in_bootmem; /* pda lives in bootmem */
} ____cacheline_aligned_in_smp;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e32ee80..a20e5f5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -381,6 +381,10 @@ DECLARE_PER_CPU(struct orig_ist, orig_ist);
DECLARE_PER_CPU(char, irqstack[IRQSTACKSIZE]);
DECLARE_PER_CPU(char *, irqstackptr);
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+DECLARE_PER_CPU(unsigned long, stack_canary);
+#endif
#endif
extern void print_cpu_info(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fbc6045..8793ab3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -47,10 +47,6 @@ int main(void)
#endif
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- DEFINE(pda_size, sizeof(struct x8664_pda));
- BLANK();
-#undef ENTRY
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 716c1e8..cc4e398 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -881,7 +881,15 @@ __setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-DEFINE_PER_CPU_PAGE_ALIGNED(char, irqstack[IRQSTACKSIZE]) __aligned(PAGE_SIZE);
+#ifdef CONFIG_SMP
+/* On SMP, the canary overlaps the bottom of the irqstack */
+#define IRQSTACK_SECTION ".first"
+#else
+#define IRQSTACK_SECTION ".page_aligned"
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
+
+DEFINE_PER_CPU_SECTION(char, irqstack[IRQSTACKSIZE], IRQSTACK_SECTION) __aligned(PAGE_SIZE);
DEFINE_PER_CPU(char *, irqstackptr) = per_cpu_var(irqstack) + IRQSTACKSIZE - 64;
DEFINE_PER_CPU(unsigned long, kernelstack) = (unsigned long)&init_thread_union -
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b565719..d7ad8bc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -249,8 +249,8 @@ ENTRY(secondary_startup_64)
* secondary CPU,initial_gs should be set to its pda address
* before the CPU runs this code.
*
- * On UP, initial_gs points to PER_CPU_VAR(__pda) and doesn't
- * change.
+ * On UP, initial_gs points to the stack canary (offset by -40)
+ * and doesn't change.
*/
movl $MSR_GS_BASE,%ecx
movq initial_gs(%rip),%rax
@@ -283,7 +283,7 @@ ENTRY(secondary_startup_64)
#ifdef CONFIG_SMP
.quad __per_cpu_load
#else
- .quad PER_CPU_VAR(__pda)
+ .quad PER_CPU_VAR(stack_canary)-40
#endif
__FINITDATA
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3cf12f4..458e1de 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -627,12 +627,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNELSTACK_OFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
- write_pda(stack_canary, next_p->stack_canary);
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+ x86_write_percpu(stack_canary, next_p->stack_canary);
#endif
/*
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 916e2cf..5e832b9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -68,19 +68,6 @@ static void __init setup_node_to_cpumask_map(void);
static inline void setup_node_to_cpumask_map(void) { }
#endif
-/*
- * On SMP, pda offset also duals as percpu base address and thus it
- * should be at the start of per-cpu area. To achieve this, it's
- * preallocated in vmlinux_64.lds.S directly instead of using
- * DEFINE_PER_CPU().
- */
-#ifdef CONFIG_X86_64
-#ifndef CONFIG_SMP
-DEFINE_PER_CPU(struct x8664_pda, __pda);
-#endif
-EXPORT_PER_CPU_SYMBOL(__pda);
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
#ifdef CONFIG_X86_64
/* correctly size the local cpu masks */
@@ -191,16 +178,15 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_64
per_cpu(irqstackptr, cpu) = per_cpu(irqstack, cpu) + IRQSTACKSIZE - 64;
/*
- * CPU0 modified pda in the init data area, reload pda
- * offset for CPU0 and clear the area for others.
+ * CPU0 modified data in the init per-cpu area, reload %gs
+ * offset for CPU0.
*/
if (cpu == 0) {
/* Memory clobbers used to order pda/percpu accesses */
mb();
- wrmsrl(MSR_GS_BASE, cpu_pda(0));
+ wrmsrl(MSR_GS_BASE, per_cpu_offset(0));
mb();
- } else
- memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
+ }
#endif
DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index a09abb8..c52af06 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -4,6 +4,10 @@
#define LOAD_OFFSET __START_KERNEL_map
+#define PER_CPU_SECTIONS \
+ *(.data.percpu.irqstack) \
+ DEFAULT_PER_CPU_SECTIONS
+
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
@@ -220,8 +224,8 @@ SECTIONS
* so that it can be accessed as a percpu variable.
*/
. = ALIGN(PAGE_SIZE);
- PERCPU_VADDR_PREALLOC(0, :percpu, pda_size)
- per_cpu____pda = __per_cpu_start;
+ PERCPU_VADDR(0, :percpu)
+ per_cpu__stack_canary = __per_cpu_start + 40;
#else
PERCPU(PAGE_SIZE)
#endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e53319c..4e92e0d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -441,40 +441,6 @@
. = __per_cpu_load + SIZEOF(.data.percpu);
/**
- * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
- * @vaddr: explicit base address (optional)
- * @phdr: destination PHDR (optional)
- * @prealloc: the size of prealloc area
- *
- * Macro which expands to output section for percpu area. If @vaddr
- * is not blank, it specifies explicit base address and all percpu
- * symbols will be offset from the given address. If blank, @vaddr
- * always equals @laddr + LOAD_OFFSET.
- *
- * @phdr defines the output PHDR to use if not blank. Be warned that
- * output PHDR is sticky. If @phdr is specified, the next output
- * section in the linker script will go there too. @phdr should have
- * a leading colon.
- *
- * If @prealloc is non-zero, the specified number of bytes will be
- * reserved at the start of percpu area. As the prealloc area is
- * likely to break alignment, this macro puts areas in increasing
- * alignment order.
- *
- * This macro defines three symbols, __per_cpu_load, __per_cpu_start
- * and __per_cpu_end. The first one is the vaddr of loaded percpu
- * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the
- * end offset.
- */
-#define PERCPU_VADDR_PREALLOC(vaddr, segment, prealloc) \
- PERCPU_PROLOG(vaddr) \
- . += prealloc; \
- *(.data.percpu) \
- *(.data.percpu.shared_aligned) \
- *(.data.percpu.page_aligned) \
- PERCPU_EPILOG(segment)
-
-/**
* PERCPU_VADDR - define output section for percpu area
* @vaddr: explicit base address (optional)
* @phdr: destination PHDR (optional)
@@ -485,6 +451,7 @@
*/
#define PERCPU_VADDR(vaddr, phdr) \
PERCPU_PROLOG(vaddr) \
+ *(.data.percpu.first) \
*(.data.percpu.page_aligned) \
*(.data.percpu) \
*(.data.percpu.shared_aligned) \
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 39 -------------------------------------
arch/x86/include/asm/pgtable_64.h | 1 -
arch/x86/include/asm/smp.h | 1 -
arch/x86/kernel/cpu/common.c | 1 -
arch/x86/kernel/process_64.c | 1 -
arch/x86/kernel/traps.c | 1 -
6 files changed, 0 insertions(+), 44 deletions(-)
delete mode 100644 arch/x86/include/asm/pda.h
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644
index ba46416..0000000
--- a/arch/x86/include/asm/pda.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ASM_X86_PDA_H
-#define _ASM_X86_PDA_H
-
-#ifndef __ASSEMBLY__
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/threads.h>
-#include <asm/page.h>
-#include <asm/percpu.h>
-
-/* Per processor datastructure. %gs points to it while the kernel runs */
-struct x8664_pda {
- unsigned long unused1;
- unsigned long unused2;
- unsigned long unused3;
- unsigned long unused4;
- int unused5;
- unsigned int unused6; /* 36 was cpunumber */
- short in_bootmem; /* pda lives in bootmem */
-} ____cacheline_aligned_in_smp;
-
-DECLARE_PER_CPU(struct x8664_pda, __pda);
-
-#define cpu_pda(cpu) (&per_cpu(__pda, cpu))
-
-#define read_pda(field) percpu_read(__pda.field)
-#define write_pda(field, val) percpu_write(__pda.field, val)
-#define add_pda(field, val) percpu_add(__pda.field, val)
-#define sub_pda(field, val) percpu_sub(__pda.field, val)
-#define or_pda(field, val) percpu_or(__pda.field, val)
-
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define test_and_clear_bit_pda(bit, field) \
- x86_test_and_clear_bit_percpu(bit, __pda.field)
-
-#endif
-
-#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ba09289..1df9637 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -11,7 +11,6 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
-#include <asm/pda.h>
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 68636e7..45ef8a1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -15,7 +15,6 @@
# include <asm/io_apic.h>
# endif
#endif
-#include <asm/pda.h>
#include <asm/thread_info.h>
#include <asm/cpumask.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc4e398..6b11925 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -30,7 +30,6 @@
#include <asm/genapic.h>
#endif
-#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 458e1de..79499ea 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -46,7 +46,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
-#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 98c2d05..ed5aee5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,7 +59,6 @@
#ifdef CONFIG_X86_64
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pda.h>
#else
#include <asm/processor-flags.h>
#include <asm/arch_hooks.h>
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/entry_64.S | 12 ++++++------
4 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index ae23deb..4527d70 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -15,7 +15,7 @@ struct x8664_pda {
unsigned long unused2;
unsigned long unused3;
unsigned long unused4;
- int irqcount; /* 32 Irq nesting counter. Starts -1 */
+ int unused5;
unsigned int unused6; /* 36 was cpunumber */
#ifdef CONFIG_CC_STACKPROTECTOR
unsigned long stack_canary; /* 40 stack canary value */
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 6a13983..fbc6045 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -48,7 +48,6 @@ int main(void)
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- ENTRY(irqcount);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
#undef ENTRY
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f653860..62f9665 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -888,6 +888,8 @@ DEFINE_PER_CPU(unsigned long, kernelstack) = (unsigned long)&init_thread_union -
KERNELSTACK_OFFSET + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernelstack);
+DEFINE_PER_CPU(unsigned int, irqcount) = -1;
+
void __cpuinit pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);
@@ -898,8 +900,6 @@ void __cpuinit pda_init(int cpu)
load_pda_offset(cpu);
- pda->irqcount = -1;
-
if (cpu != 0) {
if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
pda->nodenumber = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1331fd2..c97a1b5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -342,7 +342,7 @@ ENTRY(save_args)
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl %gs:pda_irqcount
+1: incl PER_CPU_VAR(irqcount)
jne 2f
popq_cfi %rax /* move return address... */
mov PER_CPU_VAR(irqstackptr),%rsp
@@ -837,7 +837,7 @@ common_interrupt:
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irqcount)
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
@@ -1260,14 +1260,14 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
+ incl PER_CPU_VAR(irqcount)
cmove PER_CPU_VAR(irqstackptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irqcount)
ret
CFI_ENDPROC
END(call_softirq)
@@ -1297,7 +1297,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
DEFAULT_FRAME
-11: incl %gs:pda_irqcount
+11: incl PER_CPU_VAR(irqcount)
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
cmovzq PER_CPU_VAR(irqstackptr),%rsp
@@ -1305,7 +1305,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irqcount)
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
--
1.6.1.rc1
Remove pda_init(), since it is now redundant with code in head_64.S.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 1 -
arch/x86/kernel/cpu/common.c | 15 ---------------
arch/x86/kernel/head64.c | 2 --
arch/x86/xen/enlighten.c | 1 -
4 files changed, 0 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index c31ca04..6ca7bc0 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -26,7 +26,6 @@ struct x8664_pda {
} ____cacheline_aligned_in_smp;
DECLARE_PER_CPU(struct x8664_pda, __pda);
-extern void pda_init(int);
#define cpu_pda(cpu) (&per_cpu(__pda, cpu))
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ccc6f8a..716c1e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -890,17 +890,6 @@ EXPORT_PER_CPU_SYMBOL(kernelstack);
DEFINE_PER_CPU(unsigned int, irqcount) = -1;
-void __cpuinit pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
-
- /* Setup up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
-
- load_pda_offset(cpu);
-}
-
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
__aligned(PAGE_SIZE);
@@ -964,10 +953,6 @@ void __cpuinit cpu_init(void)
struct task_struct *me;
int i;
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0)
- pda_init(cpu);
-
#ifdef CONFIG_NUMA
if (cpu != 0 && percpu_read(nodenumber) == 0 &&
cpu_to_node(cpu) != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index af67d32..f5b2722 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- pda_init(0);
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75b9413..bef941f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1645,7 +1645,6 @@ asmlinkage void __init xen_start_kernel(void)
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
- pda_init(0);
#endif
xen_smp_init();
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/mmu_context_64.h | 16 +++++++---------
arch/x86/include/asm/pda.h | 2 --
arch/x86/include/asm/tlbflush.h | 7 ++-----
arch/x86/kernel/cpu/common.c | 2 --
arch/x86/kernel/tlb_32.c | 12 ++----------
arch/x86/kernel/tlb_64.c | 13 ++++++++-----
arch/x86/xen/mmu.c | 6 +-----
7 files changed, 20 insertions(+), 38 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
index 677d36e..c457250 100644
--- a/arch/x86/include/asm/mmu_context_64.h
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -1,13 +1,11 @@
#ifndef _ASM_X86_MMU_CONTEXT_64_H
#define _ASM_X86_MMU_CONTEXT_64_H
-#include <asm/pda.h>
-
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
@@ -19,8 +17,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);
load_cr3(next->pgd);
@@ -30,9 +28,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
}
#ifdef CONFIG_SMP
else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 69a4075..8ee835e 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -25,9 +25,7 @@ struct x8664_pda {
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
short in_bootmem; /* pda lives in bootmem */
- short mmu_state;
short isidle;
- struct mm_struct *active_mm;
} ____cacheline_aligned_in_smp;
DECLARE_PER_CPU(struct x8664_pda, __pda);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 17feaa9..d3539f9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,20 +148,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ percpu_write(cpu_tlbstate.state, 0);
+ percpu_write(cpu_tlbstate.active_mm, &init_mm);
}
-#endif
#endif /* SMP */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c49498d..3d0cc6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -897,8 +897,6 @@ void __cpuinit pda_init(int cpu)
pda->irqcount = -1;
pda->kernelstack = (unsigned long)stack_thread_info() -
PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
if (cpu == 0) {
/* others are initialized in smpboot.c */
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index e65449d..abf0808 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -4,8 +4,8 @@
#include <asm/tlbflush.h>
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
- ____cacheline_aligned = { &init_mm, 0, };
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
/* must come after the send_IPI functions above for inlining */
#include <mach_ipi.h>
@@ -231,14 +231,6 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
-void reset_lazy_tlbstate(void)
-{
- int cpu = raw_smp_processor_id();
-
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
static int init_flush_cpumask(void)
{
alloc_cpumask_var(&flush_cpumask, GFP_KERNEL);
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 7f4141d..e64a32c 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -18,6 +18,9 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+ = { &init_mm, 0, };
+
#include <mach_ipi.h>
/*
* Smarter SMP flushing macros.
@@ -62,9 +65,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
*/
void leave_mm(int cpu)
{
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -142,8 +145,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/
- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
+ if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -281,7 +284,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 7bc7852..98cb986 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1063,11 +1063,7 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
-#ifdef CONFIG_X86_64
- active_mm = read_pda(active_mm);
-#else
- active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+ active_mm = percpu_read(cpu_tlbstate.active_mm);
if (active_mm == mm)
leave_mm(smp_processor_id());
--
1.6.1.rc1
Move the exception stacks to per-cpu, removing specific allocation code.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/kernel/cpu/common.c | 23 ++++++++---------------
1 files changed, 8 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ddbca0..70ce998 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -908,8 +908,9 @@ void __cpuinit pda_init(int cpu)
}
}
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+ __aligned(PAGE_SIZE);
extern asmlinkage void ignore_sysret(void);
@@ -967,15 +968,12 @@ void __cpuinit cpu_init(void)
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
unsigned long v;
- char *estacks = NULL;
struct task_struct *me;
int i;
/* CPU 0 is initialised in head64.c */
if (cpu != 0)
pda_init(cpu);
- else
- estacks = boot_exception_stacks;
me = current;
@@ -1009,18 +1007,13 @@ void __cpuinit cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
};
+ char *estacks = per_cpu(exception_stacks, cpu);
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
+ estacks += sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 1 -
arch/x86/kernel/process_64.c | 5 +++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index b30ef6b..c31ca04 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -23,7 +23,6 @@ struct x8664_pda {
offset 40!!! */
#endif
short in_bootmem; /* pda lives in bootmem */
- short isidle;
} ____cacheline_aligned_in_smp;
DECLARE_PER_CPU(struct x8664_pda, __pda);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d4a7391..3cf12f4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -61,6 +61,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(unsigned long, oldrsp);
+static DEFINE_PER_CPU(unsigned char, isidle);
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
@@ -80,13 +81,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void enter_idle(void)
{
- write_pda(isidle, 1);
+ percpu_write(isidle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- if (test_and_clear_bit_pda(0, isidle) == 0)
+ if (x86_test_and_clear_bit_percpu(0, isidle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
--
1.6.1.rc1
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/pda.h | 2 +-
arch/x86/kernel/asm-offsets_64.c | 2 --
arch/x86/kernel/entry_64.S | 8 ++++----
arch/x86/kernel/process_64.c | 8 +++++---
arch/x86/xen/xen-asm_64.S | 8 ++++----
5 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 4d28ffb..ae23deb 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -14,7 +14,7 @@ struct x8664_pda {
unsigned long unused1;
unsigned long unused2;
unsigned long unused3;
- unsigned long oldrsp; /* 24 user rsp for system call */
+ unsigned long unused4;
int irqcount; /* 32 Irq nesting counter. Starts -1 */
unsigned int unused6; /* 36 was cpunumber */
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index cafff5f..6a13983 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
#include <linux/hardirq.h>
#include <linux/suspend.h>
#include <linux/kbuild.h>
-#include <asm/pda.h>
#include <asm/processor.h>
#include <asm/segment.h>
#include <asm/thread_info.h>
@@ -49,7 +48,6 @@ int main(void)
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- ENTRY(oldrsp);
ENTRY(irqcount);
DEFINE(pda_size, sizeof(struct x8664_pda));
BLANK();
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8f35796..1331fd2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -210,7 +210,7 @@ ENTRY(native_usergs_sysret64)
/* %rsp:at FRAMEEND */
.macro FIXUP_TOP_OF_STACK tmp offset=0
- movq %gs:pda_oldrsp,\tmp
+ movq PER_CPU_VAR(oldrsp),\tmp
movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp)
@@ -221,7 +221,7 @@ ENTRY(native_usergs_sysret64)
.macro RESTORE_TOP_OF_STACK tmp offset=0
movq RSP+\offset(%rsp),\tmp
- movq \tmp,%gs:pda_oldrsp
+ movq \tmp,PER_CPU_VAR(oldrsp)
movq EFLAGS+\offset(%rsp),\tmp
movq \tmp,R11+\offset(%rsp)
.endm
@@ -479,7 +479,7 @@ ENTRY(system_call)
*/
ENTRY(system_call_after_swapgs)
- movq %rsp,%gs:pda_oldrsp
+ movq %rsp,PER_CPU_VAR(oldrsp)
movq PER_CPU_VAR(kernelstack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
@@ -523,7 +523,7 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/
- movq %gs:pda_oldrsp, %rsp
+ movq PER_CPU_VAR(oldrsp), %rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2f5bb4d..d4a7391 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -60,6 +60,8 @@ asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(unsigned long, oldrsp);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -395,7 +397,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
- write_pda(oldrsp, new_sp);
+ percpu_write(oldrsp, new_sp);
regs->cs = __USER_CS;
regs->ss = __USER_DS;
regs->flags = 0x200;
@@ -616,8 +618,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = read_pda(oldrsp);
- write_pda(oldrsp, next->usersp);
+ prev->usersp = percpu_read(oldrsp);
+ percpu_write(oldrsp, next->usersp);
percpu_write(current_task, next_p);
percpu_write(kernelstack,
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index eac2794..16d3bfb 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -195,11 +195,11 @@ RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
+ movq %rsp, PER_CPU_VAR(oldrsp)
movq PER_CPU_VAR(kernelstack),%rsp
pushq $__USER_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(oldrsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -212,11 +212,11 @@ RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
+ movq %rsp, PER_CPU_VAR(oldrsp)
movq PER_CPU_VAR(kernelstack), %rsp
pushq $__USER32_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(oldrsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
--
1.6.1.rc1
Hello, Brian.
Brian Gerst wrote:
> Accessing memory through %gs should not use rip-relative addressing.
> Adding a P prefix for the argument tells gcc to not add (%rip) to
> the memory references.
Nice catch. I didn't know about the P prefix thing. It also is used
in other places too. Hmmm... I can't find anything about the P
argument prefix in the gcc info page (4.3). Any ideas where I can
find some information about it? It's a bit weird that it's not a
constraint prefix but an argument one.
Thanks.
--
tejun
Hello,
Brian Gerst wrote:
> -void __cpuinit pda_init(int cpu)
> -{
> - struct x8664_pda *pda = cpu_pda(cpu);
> -
> - /* Setup up data that may be needed in __get_free_pages early */
> - loadsegment(fs, 0);
> - loadsegment(gs, 0);
> -
> - load_pda_offset(cpu);
> -}
Simply removing these should work fine as they're also initialized in
other places earlier but I think they still need to be in cpu_init()
because...
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
^^^^^^^^^^^^^^^^^^^^^^^^^^^
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init for 64 bit
*/
Thanks.
--
tejun
Hello,
Brian Gerst wrote:
> Now that the PDA is empty except for the stack canary, it can be removed.
> The irqstack is moved to the start of the per-cpu section. If the stack
> protector is enabled, the canary overlaps the bottom 48 bytes of the irqstack
> on SMP. On UP it is a seperate variable, since it is the only thing referenced
> via %gs.
Eh... I don't know. Locating stack canary at hard 40byte offset is a
dirty thing to do one way or another. I kind of like doing it
directly in the linker script as it makes the dirty nature more
obvious and doesn't require hunting down the definition in the first
section.
How about something like the following?
#define CANARY_OFFSET 40
#define CANARY_SIZE 8
DECLARE_PER_CPU(unsigned long, stack_canary);
and in linker script,
PERCPU_VADDR_PREALLOC(0, :percpu, CANARY_OFFSET + CANARY_SIZE)
per_cpu__stack_canary = __per_cpu_start + CANARY_OFFSET;
> diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
> index a09abb8..c52af06 100644
> --- a/arch/x86/kernel/vmlinux_64.lds.S
> +++ b/arch/x86/kernel/vmlinux_64.lds.S
> @@ -4,6 +4,10 @@
>
> #define LOAD_OFFSET __START_KERNEL_map
>
> +#define PER_CPU_SECTIONS \
> + *(.data.percpu.irqstack) \
> + DEFAULT_PER_CPU_SECTIONS
> +
Where is this used?
> #define PERCPU_VADDR(vaddr, phdr) \
> PERCPU_PROLOG(vaddr) \
> + *(.data.percpu.first) \
> *(.data.percpu.page_aligned) \
> *(.data.percpu) \
> *(.data.percpu.shared_aligned) \
If this is gonna go in the generic PERCPU script directly, why just
not add DEFINE_PER_CPU_FIRST() too?
Thanks.
--
tejun
Brian Gerst wrote:
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> arch/x86/include/asm/pda.h | 39 -------------------------------------
> arch/x86/include/asm/pgtable_64.h | 1 -
> arch/x86/include/asm/smp.h | 1 -
> arch/x86/kernel/cpu/common.c | 1 -
> arch/x86/kernel/process_64.c | 1 -
> arch/x86/kernel/traps.c | 1 -
> 6 files changed, 0 insertions(+), 44 deletions(-)
> delete mode 100644 arch/x86/include/asm/pda.h
Ah.. the sweet smell of death. Nice. :-)
--
tejun
Hello,
Brian Gerst wrote:
> @@ -211,15 +211,13 @@ ENTRY(secondary_startup_64)
> * data area. Computing this requires two symbols - __per_cpu_load
> * and per_cpu__gdt_page. As linker can't do no such relocation, do
> * it by hand. As early_gdt_descr is manipulated by C code for
> - * secondary CPUs, this should be done only once for the boot CPU
> - * when early_gdt_descr_base contains zero.
> + * secondary CPUs, this should be done only once for the boot CPU.
> */
It would probably be nice to mention that it's calculating per cpu
load addresses for early access. The comment only refers to gdt_page
which is a bit misleading as it now also adjusts irqstackptr.
Also, as pda field name, irqstackptr might be fine but wouldn't it be
better to rename it to at least irq_stack_ptr as this change requires
touching every users anyway?
Thanks.
--
tejun
Oops, one more thing.
Brian Gerst wrote:
> +unsigned char is_boot_cpu = 1;
> +
Comment saying that it's reference in head_64.S would be nice.
--
tejun
On Sat, Jan 17, 2009 at 11:58 PM, Tejun Heo <[email protected]> wrote:
> Hello,
>
> Brian Gerst wrote:
>> @@ -211,15 +211,13 @@ ENTRY(secondary_startup_64)
>> * data area. Computing this requires two symbols - __per_cpu_load
>> * and per_cpu__gdt_page. As linker can't do no such relocation, do
>> * it by hand. As early_gdt_descr is manipulated by C code for
>> - * secondary CPUs, this should be done only once for the boot CPU
>> - * when early_gdt_descr_base contains zero.
>> + * secondary CPUs, this should be done only once for the boot CPU.
>> */
>
> It would probably be nice to mention that it's calculating per cpu
> load addresses for early access. The comment only refers to gdt_page
> which is a bit misleading as it now also adjusts irqstackptr.
>
> Also, as pda field name, irqstackptr might be fine but wouldn't it be
> better to rename it to at least irq_stack_ptr as this change requires
> touching every users anyway?
I kept the same name unless it would conflict with something. If you
want to rename it, it should be a separate patch.
--
Brian Gerst
Hello,
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> #endif
> EXPORT_SYMBOL(__per_cpu_offset);
>
> +DEFINE_PER_CPU(int, cpu_number);
> +EXPORT_PER_CPU_SYMBOL(cpu_number);
This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think Voyager would
be unhappy with this change.
Thanks.
--
tejun
Hello,
Brian Gerst wrote:
>> Also, as pda field name, irqstackptr might be fine but wouldn't it be
>> better to rename it to at least irq_stack_ptr as this change requires
>> touching every users anyway?
>
> I kept the same name unless it would conflict with something. If you
> want to rename it, it should be a separate patch.
Sure, that's an option too but it might as well be done when making
those percpu variables. I don't really see why those two changes
should be separate. There isn't any reason why they shouldn't be
separate either but if you're already mucking around every user...
Thanks.
--
tejun
On Sat, Jan 17, 2009 at 11:32 PM, Tejun Heo <[email protected]> wrote:
> Hello,
>
> Brian Gerst wrote:
>> -void __cpuinit pda_init(int cpu)
>> -{
>> - struct x8664_pda *pda = cpu_pda(cpu);
>> -
>> - /* Setup up data that may be needed in __get_free_pages early */
>> - loadsegment(fs, 0);
>> - loadsegment(gs, 0);
>> -
>> - load_pda_offset(cpu);
>> -}
>
> Simply removing these should work fine as they're also initialized in
> other places earlier but I think they still need to be in cpu_init()
> because...
>
> /*
> * cpu_init() initializes state that is per-CPU. Some data is already
> * initialized (naturally) in the bootstrap process, such as the GDT
> * and IDT. We reload them nevertheless, this function acts as a
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^
> * 'CPU state barrier', nothing should get across.
> * A lot of state is already set up in PDA init for 64 bit
> */
That comment seems like overkill. It may have been relevant in an
earlier time, but I don't think we need to be so strict now,
especially for things we know are set properly in head_xx.S.
--
Brian Gerst
Brian Gerst wrote:
> On Sat, Jan 17, 2009 at 11:32 PM, Tejun Heo <[email protected]> wrote:
>> Hello,
>>
>> Brian Gerst wrote:
>>> -void __cpuinit pda_init(int cpu)
>>> -{
>>> - struct x8664_pda *pda = cpu_pda(cpu);
>>> -
>>> - /* Setup up data that may be needed in __get_free_pages early */
>>> - loadsegment(fs, 0);
>>> - loadsegment(gs, 0);
>>> -
>>> - load_pda_offset(cpu);
>>> -}
>> Simply removing these should work fine as they're also initialized in
>> other places earlier but I think they still need to be in cpu_init()
>> because...
>>
>> /*
>> * cpu_init() initializes state that is per-CPU. Some data is already
>> * initialized (naturally) in the bootstrap process, such as the GDT
>> * and IDT. We reload them nevertheless, this function acts as a
>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> * 'CPU state barrier', nothing should get across.
>> * A lot of state is already set up in PDA init for 64 bit
>> */
>
> That comment seems like overkill. It may have been relevant in an
> earlier time, but I don't think we need to be so strict now,
> especially for things we know are set properly in head_xx.S.
That may be so, but wouldn't such change deserve a separate patch with
accompanying update in the comment? It's not simple move from A to B
change.
Thanks.
--
tejun
On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
> Hello,
>
>> --- a/arch/x86/kernel/setup_percpu.c
>> +++ b/arch/x86/kernel/setup_percpu.c
>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>> #endif
>> EXPORT_SYMBOL(__per_cpu_offset);
>>
>> +DEFINE_PER_CPU(int, cpu_number);
>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
>
> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
> be unhappy with this change.
Is there any specific reason Voyager doesn't use the x86
setup_per_cpu_areas() function? I don't see anything on a quick
glance that would not work. The x86 code is pretty much a superset of
the default code in init/main.c.
--
Brian Gerst
Brian Gerst wrote:
> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
>> Hello,
>>
>>> --- a/arch/x86/kernel/setup_percpu.c
>>> +++ b/arch/x86/kernel/setup_percpu.c
>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>>> #endif
>>> EXPORT_SYMBOL(__per_cpu_offset);
>>>
>>> +DEFINE_PER_CPU(int, cpu_number);
>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
>> be unhappy with this change.
>
> Is there any specific reason Voyager doesn't use the x86
> setup_per_cpu_areas() function? I don't see anything on a quick
> glance that would not work. The x86 code is pretty much a superset of
> the default code in init/main.c.
I have no idea at all. Given that not many people can test it, I
figured just leaving it alone would be the best course but if it can
be merged, all the better.
Thanks.
--
tejun
On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
> Brian Gerst wrote:
>> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
>>> Hello,
>>>
>>>> --- a/arch/x86/kernel/setup_percpu.c
>>>> +++ b/arch/x86/kernel/setup_percpu.c
>>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>>>> #endif
>>>> EXPORT_SYMBOL(__per_cpu_offset);
>>>>
>>>> +DEFINE_PER_CPU(int, cpu_number);
>>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
>>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
>>> be unhappy with this change.
>>
>> Is there any specific reason Voyager doesn't use the x86
>> setup_per_cpu_areas() function? I don't see anything on a quick
>> glance that would not work. The x86 code is pretty much a superset of
>> the default code in init/main.c.
>
> I have no idea at all. Given that not many people can test it, I
> figured just leaving it alone would be the best course but if it can
> be merged, all the better.
Unfortunately Voyager doesn't compile currently for unrelated reasons.
I'll take a look at incorporating it into these patches, but I can't
even do a compile test right now.
--
Brian Gerst
* Brian Gerst <[email protected]> wrote:
> On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
> > Brian Gerst wrote:
> >> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
> >>> Hello,
> >>>
> >>>> --- a/arch/x86/kernel/setup_percpu.c
> >>>> +++ b/arch/x86/kernel/setup_percpu.c
> >>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> >>>> #endif
> >>>> EXPORT_SYMBOL(__per_cpu_offset);
> >>>>
> >>>> +DEFINE_PER_CPU(int, cpu_number);
> >>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
> >>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
> >>> be unhappy with this change.
> >>
> >> Is there any specific reason Voyager doesn't use the x86
> >> setup_per_cpu_areas() function? I don't see anything on a quick
> >> glance that would not work. The x86 code is pretty much a superset of
> >> the default code in init/main.c.
> >
> > I have no idea at all. Given that not many people can test it, I
> > figured just leaving it alone would be the best course but if it can
> > be merged, all the better.
>
> Unfortunately Voyager doesn't compile currently for unrelated reasons.
> I'll take a look at incorporating it into these patches, but I can't
> even do a compile test right now.
Peter/James, what's the current status of x86/Voyager cleanups?
A couple of months ago i made a few suggestions about how to convert
Voyager to the cleaner x86_quirks 'quirks HAL' (from the current fragile
and hard and expensive to maintain 'compile time HAL'), but it didnt seem
to go anywhere. See the discussion of this timeframe:
http://lkml.org/lkml/2008/11/3/53
The VisWS subarch (which was a similarly eccentric design that was only a
PC in terms of having Intel CPUs) has been converted to CONFIG_X86_VISWS
already, with arch/x86/kernel/visws_quirks.c holding the optional quirk
handlers.
The desired end result would be to have a CONFIG_X86_VOYAGER=y build mode
that adds the quirk handlers to an otherwise generic kernel, with most of
the quirks concentrated into a single arch/x86/kernel/voyager_quirks.c
file - instead of having a full subarch for x86/Voyager. Both
arch/x86/mach-voyager/ and arch/x86/include/asm/mach-voyager/ would go
away in the end - because all functionality is merged into the generic
code and the quirks would be in voyager_quirks.c.
I'd be glad to lend a helping hand both with the patches and with testing
on non-Voyager - especially the SMP bits probably need extensions on the
x86_quirks side. (And i'm sure the other x86 maintainers would be glad to
help out with this process too.)
x86/Voyager is the last holdout in this area, and with an active kernel
developer like James behind it it ought to be fixable - should James have
the time/interest.
If there's no time/interest in that then we can temporarily mark Voyager
CONFIG_BROKEN until cleanup/fix patches arrive.
Ingo
On Sat, Jan 17, 2009 at 11:52 PM, Tejun Heo <[email protected]> wrote:
> Hello,
>
> Brian Gerst wrote:
>> Now that the PDA is empty except for the stack canary, it can be removed.
>> The irqstack is moved to the start of the per-cpu section. If the stack
>> protector is enabled, the canary overlaps the bottom 48 bytes of the irqstack
>> on SMP. On UP it is a seperate variable, since it is the only thing referenced
>> via %gs.
>
> Eh... I don't know. Locating stack canary at hard 40byte offset is a
> dirty thing to do one way or another. I kind of like doing it
> directly in the linker script as it makes the dirty nature more
> obvious and doesn't require hunting down the definition in the first
> section.
>
> How about something like the following?
>
> #define CANARY_OFFSET 40
> #define CANARY_SIZE 8
>
> DECLARE_PER_CPU(unsigned long, stack_canary);
>
> and in linker script,
>
> PERCPU_VADDR_PREALLOC(0, :percpu, CANARY_OFFSET + CANARY_SIZE)
> per_cpu__stack_canary = __per_cpu_start + CANARY_OFFSET;
>
The thing I don't like about the prealloc method is that it puts the
page-aligned variables at the end. This leaves a gap which is
unavailable for dynamic allocations. Stealing 48 bytes from the
bottom of the irqstack (which is 16k) keeps the page-aligned section
at the start. It's really no different than how the thread_info
structure sits at the bottom of the process stack.
How about something like:
union irq_stack_union {
char irq_stack[IRQSTACKSIZE];
struct {
char pad[40];
unsigned long stack_canary;
}
};
That documents the overlay better, and avoids having to touch the linker script.
--
Brian Gerst
Hello, Brian.
Brian Gerst wrote:
>> How about something like the following?
>>
>> #define CANARY_OFFSET 40
>> #define CANARY_SIZE 8
>>
>> DECLARE_PER_CPU(unsigned long, stack_canary);
>>
>> and in linker script,
>>
>> PERCPU_VADDR_PREALLOC(0, :percpu, CANARY_OFFSET + CANARY_SIZE)
>> per_cpu__stack_canary = __per_cpu_start + CANARY_OFFSET;
>>
>
> The thing I don't like about the prealloc method is that it puts the
> page-aligned variables at the end. This leaves a gap which is
> unavailable for dynamic allocations. Stealing 48 bytes from the
> bottom of the irqstack (which is 16k) keeps the page-aligned section
> at the start. It's really no different than how the thread_info
> structure sits at the bottom of the process stack.
>
> How about something like:
> union irq_stack_union {
> char irq_stack[IRQSTACKSIZE];
> struct {
> char pad[40];
> unsigned long stack_canary;
> }
> };
>
> That documents the overlay better, and avoids having to touch the
> linker script.
I have no objection as long as it's sufficiently documented.
Thanks.
--
tejun
* Tejun Heo <[email protected]> wrote:
> Hello,
>
> Brian Gerst wrote:
> >> Also, as pda field name, irqstackptr might be fine but wouldn't it be
> >> better to rename it to at least irq_stack_ptr as this change requires
> >> touching every users anyway?
> >
> > I kept the same name unless it would conflict with something. If you
> > want to rename it, it should be a separate patch.
>
> Sure, that's an option too but it might as well be done when making
> those percpu variables. I don't really see why those two changes should
> be separate. There isn't any reason why they shouldn't be separate
> either but if you're already mucking around every user...
yes, we should do the rename in this same patch as both patches have no
impact on the actual kernel image. (sans source code line and string
related deltas in the image.) Feel free to do the rename in his patch -
that's the fastest way.
I'd suggest to rename from:
DECLARE_PER_CPU(char, irqstack[IRQSTACKSIZE]);
DECLARE_PER_CPU(char *, irqstackptr);
to:
DECLARE_PER_CPU(char, irq_stack[IRQSTACKSIZE]);
DECLARE_PER_CPU(char *, irq_stack_ptr);
In other cases, lets get Brian's patches in without further latencies so
that we drain his pending-patches pipeline and get a consolidated base
everyone can work from. Brian already had an unfortunate (and
time-consuming) rebase/conflict-resolution pass due to us moving the
percpu code from under him. We can do non-critical followups in separate
delta patches.
Ingo
* Tejun Heo <[email protected]> wrote:
> Hello, Brian.
>
> Brian Gerst wrote:
> >> How about something like the following?
> >>
> >> #define CANARY_OFFSET 40
> >> #define CANARY_SIZE 8
> >>
> >> DECLARE_PER_CPU(unsigned long, stack_canary);
> >>
> >> and in linker script,
> >>
> >> PERCPU_VADDR_PREALLOC(0, :percpu, CANARY_OFFSET + CANARY_SIZE)
> >> per_cpu__stack_canary = __per_cpu_start + CANARY_OFFSET;
> >>
> >
> > The thing I don't like about the prealloc method is that it puts the
> > page-aligned variables at the end. This leaves a gap which is
> > unavailable for dynamic allocations. Stealing 48 bytes from the
> > bottom of the irqstack (which is 16k) keeps the page-aligned section
> > at the start. It's really no different than how the thread_info
> > structure sits at the bottom of the process stack.
> >
> > How about something like:
> > union irq_stack_union {
> > char irq_stack[IRQSTACKSIZE];
> > struct {
> > char pad[40];
> > unsigned long stack_canary;
> > }
> > };
> >
> > That documents the overlay better, and avoids having to touch the
> > linker script.
>
> I have no objection as long as it's sufficiently documented.
There is another advantage from Brian's trick of reusing the IRQ stack
bottom: if we ever overflow the IRQ stack the kernel will likely stomp on
the canary and overwrite it, and then (if the user runs on a
stackprotector kernel) we will get an instantaneous assert and backtrace,
exactly where the overflow happened.
Small overflows are otherwise rather hard to catch right on the spot so
this is a bonus.
Ingo
Hello, Ingo, Brian.
Ingo Molnar wrote:
>> Sure, that's an option too but it might as well be done when making
>> those percpu variables. I don't really see why those two changes should
>> be separate. There isn't any reason why they shouldn't be separate
>> either but if you're already mucking around every user...
>
> yes, we should do the rename in this same patch as both patches have no
> impact on the actual kernel image. (sans source code line and string
> related deltas in the image.) Feel free to do the rename in his patch -
> that's the fastest way.
>
> I'd suggest to rename from:
>
> DECLARE_PER_CPU(char, irqstack[IRQSTACKSIZE]);
> DECLARE_PER_CPU(char *, irqstackptr);
>
> to:
>
> DECLARE_PER_CPU(char, irq_stack[IRQSTACKSIZE]);
> DECLARE_PER_CPU(char *, irq_stack_ptr);
>
> In other cases, lets get Brian's patches in without further latencies so
> that we drain his pending-patches pipeline and get a consolidated base
> everyone can work from. Brian already had an unfortunate (and
> time-consuming) rebase/conflict-resolution pass due to us moving the
> percpu code from under him. We can do non-critical followups in separate
> delta patches.
Alright, there are other places where renaming would be nice -
kernelstack, oldrsp, irqcount, irqstackptr and isidle. I'll rename
them as I add the patches. Here's the plan.
01-04 : w/ renames
05 : will mark voyager broken
06-12 : w/ renames
13-17 : will wait for update
If anyone doesn't like it, please scream.
Thanks.
--
tejun
* Tejun Heo <[email protected]> wrote:
> Hello, Ingo, Brian.
>
> Ingo Molnar wrote:
> >> Sure, that's an option too but it might as well be done when making
> >> those percpu variables. I don't really see why those two changes should
> >> be separate. There isn't any reason why they shouldn't be separate
> >> either but if you're already mucking around every user...
> >
> > yes, we should do the rename in this same patch as both patches have no
> > impact on the actual kernel image. (sans source code line and string
> > related deltas in the image.) Feel free to do the rename in his patch -
> > that's the fastest way.
> >
> > I'd suggest to rename from:
> >
> > DECLARE_PER_CPU(char, irqstack[IRQSTACKSIZE]);
> > DECLARE_PER_CPU(char *, irqstackptr);
> >
> > to:
> >
> > DECLARE_PER_CPU(char, irq_stack[IRQSTACKSIZE]);
> > DECLARE_PER_CPU(char *, irq_stack_ptr);
> >
> > In other cases, lets get Brian's patches in without further latencies so
> > that we drain his pending-patches pipeline and get a consolidated base
> > everyone can work from. Brian already had an unfortunate (and
> > time-consuming) rebase/conflict-resolution pass due to us moving the
> > percpu code from under him. We can do non-critical followups in separate
> > delta patches.
>
> Alright, there are other places where renaming would be nice -
> kernelstack, oldrsp, irqcount, irqstackptr and isidle. I'll rename
> them as I add the patches. Here's the plan.
yeah. Please also rename IRQSTACKSIZE to the muchmorereadable
IRQ_STACK_SIZE ;-) Some keyboard must have been missing the underline key
when this was added, it reads awful.
> 01-04 : w/ renames
> 05 : will mark voyage broken
(Please hold this one until James has had a chance to react.)
> 06-12 : w/ renames
> 13-17 : will wait for update
>
> If anyone doesn't like it. Please scream.
Sounds good!
Ingo
On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
> * Brian Gerst <[email protected]> wrote:
>
> > On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
> > > Brian Gerst wrote:
> > >> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
> > >>> Hello,
> > >>>
> > >>>> --- a/arch/x86/kernel/setup_percpu.c
> > >>>> +++ b/arch/x86/kernel/setup_percpu.c
> > >>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> > >>>> #endif
> > >>>> EXPORT_SYMBOL(__per_cpu_offset);
> > >>>>
> > >>>> +DEFINE_PER_CPU(int, cpu_number);
> > >>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
> > >>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
> > >>> be unhappy with this change.
> > >>
> > >> Is there any specific reason Voyager doesn't use the x86
> > >> setup_per_cpu_areas() function? I don't see anything on a quick
> > >> glance that would not work. The x86 code is pretty much a superset of
> > >> the default code in init/main.c.
> > >
> > > I have no idea at all. Given that not many people can test it, I
> > > figured just leaving it alone would be the best course but if it can
> > > be merged, all the better.
> >
> > Unfortunately Voyager doesn't compile currently for unrelated reasons.
> > I'll take a look at incorporating it into these patches, but I can't
> > even do a compile test right now.
What are "unrelated reasons"?, 2.6.28 compiles and boots for me, except
some of the compile fixes (which are regressions, by the way) aren't
included in spite of being sent several times.
I've put them up here:
git://git.kernel.org/pub/scm/linux/kernel/git/jejb/voyager-2.6.git
I haven't included the cpumask fixes (so it won't compile on 2.6.29-rc2
yet) because I'll have to try to polish them to fit in with whatever's
going on. Plus there's some type of initramfs boot failure that I need
to investigate. However, usually I wait until the x86 churn is
finished, which is a lot later into the -rc cycle than this before
fixing up all the breakage.
> Peter/James, what's the current status of x86/Voyager cleanups?
The only outstanding problem I can see in 2.6.29 is a cpumask screw up
caused by Mike Travis ... it looks easily fixable, he just forgot to
convert voyager.
I have to say that putting the SMP CPU definitions in cpu/common.c
hedged around with ifdefs for type looks really to be the wrong thing to
do. We already have compile selected files with these types, the
definition should be in there.
> A couple of months ago i made a few suggestions about how to convert
> Voyager to the cleaner x86_quirks 'quirks HAL' (from the current fragile
> and hard and expensive to maintain 'compile time HAL'), but it didnt seem
> to go anywhere. See the discussion of this timeframe:
>
> http://lkml.org/lkml/2008/11/3/53
>
> The VisWS subarch (which was a similarly eccentric design that was only a
> PC in terms of having Intel CPUs) has been converted to CONFIG_X86_VISWS
> already, with arch/x86/kernel/visws_quirks.c holding the optional quirk
> handlers.
>
> The desired end result would be to have a CONFIG_X86_VOYAGER=y build mode
> that adds the quirk handlers to an otherwise generic kernel, with most of
> the quirks concentrated into a single arch/x86/kernel/voyager_quirks.c
> file - instead of having a full subarch for x86/Voyager. Both
> arch/x86/mach-voyager/ and arch/x86/include/asm/mach-voyager/ would go
> away in the end - because all functionality is merged into the generic
> code and the quirks would be in voyager_quirks.c.
You appear to have forgotten that we already had this discussion here:
http://marc.info/?t=122539020300002
But to precis, the bottom line is that I'm concerned about the damage to
mainline x86 this would cause because voyager is a vastly different
beast. We'd be doubling at least the number of function pointer
indirections, plus the current quirk stuff is inadequate: voyager needs
boot time separation to handle the unique SUS maps and other things, so
there'd be a big intrusion into the boot system as well.
> I'd be glad to lend a helping hand both with the patches and with testing
> on non-Voyager - especially the SMP bits probably need extensions on the
> x86_quirks side. (And i'm sure the other x86 maintainers would we glad to
> help out with this process too.)
>
> x86/Voyager is the last holdout in this area, and with an active kernel
> developer like James behind it it ought to be fixable - should James have
> the time/interest.
But no-one's yet made any argument for why it's a worthwhile thing to be
doing.
> If there's no time/interest in that then we can temporarily mark Voyager
> CONFIG_BROKEN until cleanup/fix patches arrive.
It's not broken and I've already sent you the cleanup/fix patches ... I
can send them directly to Linus as voyager maintainer if you prefer.
James
* Brian Gerst <[email protected]> wrote:
> On Sat, Jan 17, 2009 at 11:22 PM, Tejun Heo <[email protected]> wrote:
>
> > Hello, Brian.
> >
> > Brian Gerst wrote:
> > > Accessing memory through %gs should not use rip-relative addressing.
> > > Adding a P prefix for the argument tells gcc to not add (%rip) to
> > > the memory references.
> >
> > Nice catch. I didn't know about the P prefix thing. It also is used
> > in other places too. Hmmm... I can't find anything about the P
> > argument prefix in the gcc info page (4.3). Any ideas where I can
> > find some information about it? It's a bit weird that it's not a
> > constraint prefix but an argument one.
>
> The only place I could confirm that it works is in the gcc source
> itself, and even there it's not well documented.
does %P support go back as far as gcc 3.2 (the earliest GCC we still
support)?
Ingo
* Ingo Molnar <[email protected]> wrote:
>
> * Brian Gerst <[email protected]> wrote:
>
> > On Sat, Jan 17, 2009 at 11:22 PM, Tejun Heo <[email protected]> wrote:
> >
> > > Hello, Brian.
> > >
> > > Brian Gerst wrote:
> > > > Accessing memory through %gs should not use rip-relative addressing.
> > > > Adding a P prefix for the argument tells gcc to not add (%rip) to
> > > > the memory references.
> > >
> > > Nice catch. I dind't know about the P prefix thing. It also is used
> > > in other places too. Hmmm... I can't find anything about the P
> > > argument prefix in the gcc info page (4.3). Any ideas where I can
> > > find some information about it? It's a bit weird that it's not a
> > > constraint prefix but an argument one.
> >
> > The only place I could confirm that it works is in the gcc source
> > itself, and even there it's not well documented.
>
> does %P support go back as far as gcc 3.2 (the earliest GCC we still
> support)?
update: Brian pointed it out off-list that switch_to() already uses %P, so
we already rely on it.
Ingo
On Sun, Jan 18, 2009 at 11:41 AM, James Bottomley
<[email protected]> wrote:
> On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
>> * Brian Gerst <[email protected]> wrote:
>>
>> > On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
>> > > Brian Gerst wrote:
>> > >> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
>> > >>> Hello,
>> > >>>
>> > >>>> --- a/arch/x86/kernel/setup_percpu.c
>> > >>>> +++ b/arch/x86/kernel/setup_percpu.c
>> > >>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>> > >>>> #endif
>> > >>>> EXPORT_SYMBOL(__per_cpu_offset);
>> > >>>>
>> > >>>> +DEFINE_PER_CPU(int, cpu_number);
>> > >>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
>> > >>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
>> > >>> be unhappy with this change.
>> > >>
>> > >> Is there any specific reason Voyager doesn't use the x86
>> > >> setup_per_cpu_areas() function? I don't see anything on a quick
>> > >> glance that would not work. The x86 code is pretty much a superset of
>> > >> the default code in init/main.c.
>> > >
>> > > I have no idea at all. Given that not many people can test it, I
>> > > figured just leaving it alone would be the best course but if it can
>> > > be merged, all the better.
>> >
>> > Unfortunately Voyager doesn't compile currently for unrelated reasons.
>> > I'll take a look at incorporating it into these patches, but I can't
>> > even do a compile test right now.
>
> What are "unrelated reasons"?, 2.6.28 compiles and boots for me, except
> some of the compile fixes (which are regressions, by the way) aren't
> included in spite of being sent several times.
>
> I've put them up here:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/jejb/voyager-2.6.git
>
> I haven't included the cpumask fixes (so it won't compile on 2.6.29-rc2
> yet) because I'll have to try to polish them to fit in with whatever's
> going on. Plus there's some type of initramfs boot failure that I need
> to investigate. However, usually I wait until the x86 churn is
> finished, which is a lot later into the -rc cycle than this before
> fixing up all the breakage.
>
>> Peter/James, what's the current status of x86/Voyager cleanups?
>
> The only outstanding problem I can see in 2.6.29 is a cpumask screw up
> caused by Mike Travis ... it looks easily fixable, he just forgot to
> convert voyager.
>
> I have to say that putting the SMP CPU definitions in cpu/common.c
> hedged around with ifdefs for type looks really to be the wrong thing to
> do. We already have compile selected files with these types, the
> definition should be in there.
>
>> A couple of months ago i made a few suggestions about how to convert
>> Voyager to the cleaner x86_quirks 'quirks HAL' (from the current fragile
>> and hard and expensive to maintain 'compile time HAL'), but it didnt seem
>> to go anywhere. See the discussion of this timeframe:
>>
>> http://lkml.org/lkml/2008/11/3/53
>>
>> The VisWS subarch (which was a similarly excentric design that was only a
>> PC in terms of having Intel CPUs) has been converted to CONFIG_X86_VISWS
>> already, with arch/x86/kernel/visws_quirks.c holding the optional quirk
>> handlers.
>>
>> The desired end result would be to have a CONFIG_X86_VOYAGER=y build mode
>> that adds the quirk handlers to an otherwise generic kernel, with most of
>> the quirks concentrated into a single arch/x86/kernel/voyager_quirks.c
>> file - instead of having a full subarch for x86/Voyager. Both
>> arch/x86/mach-voyager/ and arch/x86/include/asm/mach-voyager/ would go
>> away in the end - because all functionality is merged into the generic
>> code and the quirks would be in voyager_quirks.c.
>
> You appear to have forgotten that we already had this discussion here:
>
> http://marc.info/?t=122539020300002
>
> But to precis, the bottom line is that I'm concerned about the damage to
> mainline x86 this would cause because voyager is a vastly different
> beast. We'd be doubling at least the number of function pointer
> indirections, plus the current quirk stuff is inadequate: voyager needs
> boot time separation to handle the unique SUS maps and other things, so
> there'd be a big intrusion into the boot system as well.
>
>> I'd be glad to lend a helping hand both with the patches and with testing
>> on non-Voyager - especially the SMP bits probably need extensions on the
>> x86_quirks side. (And i'm sure the other x86 maintainers would we glad to
>> help out with this process too.)
>>
>> x86/Voyager is the last holdout in this area, and with an active kernel
>> developer like James behind it it ought to be fixable - should James have
>> the time/interest.
>
> But no-one's yet made any argument for why it's a worthwhile thing to be
> doing.
>
>> If there's no time/interest in that then we can temporarily mark Voyager
>> CONFIG_BROKEN until cleanup/fix patches arrive.
>
> It's not broken and I've already sent you the cleanup/fix patches ... I
> can send them directly to Linus as voyager maintainer if you prefer.
The build breakage was due to the cpumask changes I believe, inherited
from -tip.
There is a lot of duplicated code in voyager_smp.c that is making it
difficult for me to work on the per-cpu changes. Do you see any
reason that Voyager can't use the normal x86 setup_per_cpu_areas()
code?
--
Brian Gerst
On Sun, 2009-01-18 at 12:41 -0500, Brian Gerst wrote:
> On Sun, Jan 18, 2009 at 11:41 AM, James Bottomley
> <[email protected]> wrote:
> > On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
> >> If there's no time/interest in that then we can temporarily mark Voyager
> >> CONFIG_BROKEN until cleanup/fix patches arrive.
> >
> > It's not broken and I've already sent you the cleanup/fix patches ... I
> > can send them directly to Linus as voyager maintainer if you prefer.
>
> The build breakage was due to the cpumask changes I believe, inherited
> from -tip.
>
> There is alot of duplicated code in voyager_smp.c that is making it
> difficult for me to work on the per-cpu changes.
Actually, there's very little duplicated code, but what there is we can
unify. The reason it duplicates the API is because it has to provide a
different implementation for voyager ... it's not a PC x86 architecture.
> Do you see any
> reason that Voyager can't use the normal x86 setup_per_cpu_areas()
> code?
Er, well, yes. Current setup_per_cpu_areas is setting up the cpu<->apic
maps. Voyager has no apics, so it has no use for any of the arrays
being set up in there.
If you're proposing to add arrays that would actually be useful to
voyager, then sure we can use it ... it's just at the moment there's no
need. What is it you want to add in there?
James
* James Bottomley <[email protected]> wrote:
> On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
> > * Brian Gerst <[email protected]> wrote:
> >
> > > On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
> > > > Brian Gerst wrote:
> > > >> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
> > > >>> Hello,
> > > >>>
> > > >>>> --- a/arch/x86/kernel/setup_percpu.c
> > > >>>> +++ b/arch/x86/kernel/setup_percpu.c
> > > >>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> > > >>>> #endif
> > > >>>> EXPORT_SYMBOL(__per_cpu_offset);
> > > >>>>
> > > >>>> +DEFINE_PER_CPU(int, cpu_number);
> > > >>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
> > > >>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
> > > >>> be unhappy with this change.
> > > >>
> > > >> Is there any specific reason Voyager doesn't use the x86
> > > >> setup_per_cpu_areas() function? I don't see anything on a quick
> > > >> glance that would not work. The x86 code is pretty much a superset of
> > > >> the default code in init/main.c.
> > > >
> > > > I have no idea at all. Given that not many people can test it, I
> > > > figured just leaving it alone would be the best course but if it can
> > > > be merged, all the better.
> > >
> > > Unfortunately Voyager doesn't compile currently for unrelated reasons.
> > > I'll take a look at incorporating it into these patches, but I can't
> > > even do a compile test right now.
>
> What are "unrelated reasons"?, 2.6.28 compiles and boots for me, except
> some of the compile fixes (which are regressions, by the way) aren't
> included in spite of being sent several times.
>
> I've put them up here:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/jejb/voyager-2.6.git
These are not complete as you did not implement the cleanups that we
suggested - and hence they are not acceptable (i.e. consider this a NAK).
They just prolong the pain of subarchitectures.
Your previous round of fixes were problematic: i remember them breaking
the normal x86 build at least twice - showing the collateral cost of
subarchitectures. If we had the x86/Voyager complications isolated in a
single arch/x86/kernel/voyager_quirks.c module, via an x86_quirks and
similar mechanisms the end result would be far more maintainable.
> I haven't included the cpumask fixes (so it won't compile on 2.6.29-rc2
> yet) because I'll have to try to polish them to fit in with whatever's
> going on. Plus there's some type of initramfs boot failure that I need
> to investigate. However, usually I wait until the x86 churn is
> finished, which is a lot later into the -rc cycle than this before
> fixing up all the breakage.
>
> > Peter/James, what's the current status of x86/Voyager cleanups?
>
> The only outstanding problem I can see in 2.6.29 is a cpumask screw up
> caused by Mike Travis ... it looks easily fixable, he just forgot to
> convert voyager.
>
> I have to say that putting the SMP CPU definitions in cpu/common.c
> hedged around with ifdefs for type looks really to be the wrong thing to
> do. We already have compile selected files with these types, the
> definition should be in there.
>
> > A couple of months ago i made a few suggestions about how to convert
> > Voyager to the cleaner x86_quirks 'quirks HAL' (from the current
> > fragile and hard and expensive to maintain 'compile time HAL'), but it
> > didnt seem to go anywhere. See the discussion of this timeframe:
> >
> > http://lkml.org/lkml/2008/11/3/53
> >
> > The VisWS subarch (which was a similarly excentric design that was
> > only a PC in terms of having Intel CPUs) has been converted to
> > CONFIG_X86_VISWS already, with arch/x86/kernel/visws_quirks.c holding
> > the optional quirk handlers.
> >
> > The desired end result would be to have a CONFIG_X86_VOYAGER=y build
> > mode that adds the quirk handlers to an otherwise generic kernel, with
> > most of the quirks concentrated into a single
> > arch/x86/kernel/voyager_quirks.c file - instead of having a full
> > subarch for x86/Voyager. Both arch/x86/mach-voyager/ and
> > arch/x86/include/asm/mach-voyager/ would go away in the end - because
> > all functionality is merged into the generic code and the quirks would
> > be in voyager_quirks.c.
>
> You appear to have forgotten that we already had this discussion here:
>
> http://marc.info/?t=122539020300002
Why would i have forgotten that? We asked you to do those cleanups and
offered help. AFAICS you have not submitted patches to that effect and you
did not address the review feedback we gave you on your patches. If you
have sent patches that implement my x86_quirks suggestions then please
show me the URIs.
> But to precis, the bottom line is that I'm concerned about the damage to
> mainline x86 this would cause because voyager is a vastly different
> beast. We'd be doubling at least the number of function pointer
> indirections, plus the current quirk stuff is inadequate: voyager needs
> boot time separation to handle the unique SUS maps and other things, so
> there'd be a big intrusion into the boot system as well.
>
> > I'd be glad to lend a helping hand both with the patches and with testing
> > on non-Voyager - especially the SMP bits probably need extensions on the
> > x86_quirks side. (And i'm sure the other x86 maintainers would we glad to
> > help out with this process too.)
> >
> > x86/Voyager is the last holdout in this area, and with an active kernel
> > developer like James behind it it ought to be fixable - should James have
> > the time/interest.
>
> But no-one's yet made any argument for why it's a worthwhile thing to be
> doing.
Because the sub-arch code is butt-ugly.
x86 subarchitectures are a hack that should never have gone upstream - and
we are now reversing that braindamage, step by step. Subarchitectures are
a compile-time "HAL", but a highly non-transparent one at that. They
complicates the x86 architecture in a couple of key structures and very
fundamentally so - and that results in continued complications in critical
areas of the x86 code.
One example where this shows up in full force in the Kconfig space. For
example in arch/x86/Kconfig we have _more than 20_ VOYAGER quirks:
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
depends on !SMP || !X86_VOYAGER
depends on !X86_VOYAGER
depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
depends on (X86_32 && !X86_VOYAGER) || X86_64
depends on !X86_VOYAGER
depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
depends on X86_VOYAGER
depends on X86_MPPARSE || X86_VOYAGER
depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
depends on !X86_VOYAGER
depends on !X86_VOYAGER
depends on !X86_VOYAGER
depends on !X86_VOYAGER
depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
depends on !X86_VOYAGER
depends on SMP && HOTPLUG && !X86_VOYAGER
depends on !X86_VOYAGER
depends on !X86_VOYAGER
bool "MCA support" if !X86_VOYAGER
default y if X86_VOYAGER
depends on !X86_VOYAGER
The VISWS code was in a similar situation not so long ago. It was a quirky
subarchitecture similar to Voyager: it had no standard PC compatibility at
all, other than the use of Intel CPUs.
VISWS had about 20 quirks in arch/x86/Kconfig that needlessly complicated
the picture there. It caused a number of build breakages and complicated
development for many cycles.
After we merged the VISWS subarch into the generic code its quirk count in
arch/x86/Kconfig went down to _one_ only. The whole VISWS impact has
dwindled down to almost zero. The difference is significant, and we'd like
to see the same kind of cleanup happen with x86/Voyager too.
There are countless other areas where the elimination of subarchitectures
simplifies the code and shrinks x86 maintenance costs.
> > If there's no time/interest in that then we can temporarily mark
> > Voyager CONFIG_BROKEN until cleanup/fix patches arrive.
>
> It's not broken and I've already sent you the cleanup/fix patches ... I
> can send them directly to Linus as voyager maintainer if you prefer.
Well, i'm NAK-ing those patches in their current form - x86/Voyager should
be restructured like the other ex subarchitectures were done - or we'll
have to mark it CONFIG_BROKEN until the right kind of patches arrive.
Please send patches to the x86 maintainers and implement the cleanups we
have asked for - or let us know if you dont have time/interest in doing
so.
Thanks,
Ingo
On Sun, Jan 18, 2009 at 1:04 PM, James Bottomley
<[email protected]> wrote:
> On Sun, 2009-01-18 at 12:41 -0500, Brian Gerst wrote:
>> On Sun, Jan 18, 2009 at 11:41 AM, James Bottomley
>> <[email protected]> wrote:
>> > On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
>> >> If there's no time/interest in that then we can temporarily mark Voyager
>> >> CONFIG_BROKEN until cleanup/fix patches arrive.
>> >
>> > It's not broken and I've already sent you the cleanup/fix patches ... I
>> > can send them directly to Linus as voyager maintainer if you prefer.
>>
>> The build breakage was due to the cpumask changes I believe, inherited
>> from -tip.
>>
>> There is alot of duplicated code in voyager_smp.c that is making it
>> difficult for me to work on the per-cpu changes.
>
> Actually, there's very little duplicated code, but what there is we can
> unify. The reason it duplicates the API is because it has to provide a
> different implementation for voyager ... it's not a PC x86 architecture.
>
>> Do you see any
>> reason that Voyager can't use the normal x86 setup_per_cpu_areas()
>> code?
>
> Er, well, yes. Current setup_per_cpu_areas is setting up the cpu<->apic
> maps. Voyager has no apics, so it has no use for any of the arrays
> being set up in there.
>
> If you're proposing to add arrays that would actually be useful to
> voyager, then sure we can use it ... it's just at the moment there's no
> need. What is it you want to add in there?
The apic code can be ifdef'ed out for voyager. The reason I want to
use the x86 setup_per_cpu_areas() is that I want to consolidate
initializing per-cpu variables in one place. Voyager currently has
sprinkled in various places setting this_cpu_off, cpu_number, etc.
--
Brian Gerst
On Sun, 2009-01-18 at 19:17 +0100, Ingo Molnar wrote:
> * James Bottomley <[email protected]> wrote:
>
> > On Sun, 2009-01-18 at 08:14 +0100, Ingo Molnar wrote:
> > > * Brian Gerst <[email protected]> wrote:
> > >
> > > > On Sun, Jan 18, 2009 at 12:59 AM, Tejun Heo <[email protected]> wrote:
> > > > > Brian Gerst wrote:
> > > > >> On Sun, Jan 18, 2009 at 12:05 AM, Tejun Heo <[email protected]> wrote:
> > > > >>> Hello,
> > > > >>>
> > > > >>>> --- a/arch/x86/kernel/setup_percpu.c
> > > > >>>> +++ b/arch/x86/kernel/setup_percpu.c
> > > > >>>> @@ -147,6 +147,9 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> > > > >>>> #endif
> > > > >>>> EXPORT_SYMBOL(__per_cpu_offset);
> > > > >>>>
> > > > >>>> +DEFINE_PER_CPU(int, cpu_number);
> > > > >>>> +EXPORT_PER_CPU_SYMBOL(cpu_number);
> > > > >>> This is inside CONFIG_HAVE_SETUP_PER_CPU_AREA. I think voyage would
> > > > >>> be unhappy with this change.
> > > > >>
> > > > >> Is there any specific reason Voyager doesn't use the x86
> > > > >> setup_per_cpu_areas() function? I don't see anything on a quick
> > > > >> glance that would not work. The x86 code is pretty much a superset of
> > > > >> the default code in init/main.c.
> > > > >
> > > > > I have no idea at all. Given that not many people can test it, I
> > > > > figured just leaving it alone would be the best course but if it can
> > > > > be merged, all the better.
> > > >
> > > > Unfortunately Voyager doesn't compile currently for unrelated reasons.
> > > > I'll take a look at incorporating it into these patches, but I can't
> > > > even do a compile test right now.
> >
> > What are "unrelated reasons"?, 2.6.28 compiles and boots for me, except
> > some of the compile fixes (which are regressions, by the way) aren't
> > included in spite of being sent several times.
> >
> > I've put them up here:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/jejb/voyager-2.6.git
>
> These are not complete as you did not implement the cleanups that we
> suggested - and hence they are not acceptable (i.e. consider this a NAK).
They were build fixes for a -rc kernel. Even if I agreed with your new
feature work, that's inappropriate for the -rc stage (and actually would
take much longer). Holding regression fixes hostage to new feature
work is wrong.
> They just prolong the pain of subarchitectures.
The pain being where you break it and I fix it?
> Your previous round of fixes were problematic: i remember them breaking
> the normal x86 build at least twice - showing the collateral cost of
> subarchitectures.
I thought I fixed everything: it works for me (tm) on all my PC like x86
boxes ... if it's broken on something more esoteric, you'll actually
need to send me details so I can fix it. What exactly is broken?
> If we had the x86/Voyager complications isolated in a
> single arch/x86/kernel/voyager_quirks.c module, via an x86_quirks and
> similar mechanisms the end result would be far more maintainable.
>
> > I haven't included the cpumask fixes (so it won't compile on 2.6.29-rc2
> > yet) because I'll have to try to polish them to fit in with whatever's
> > going on. Plus there's some type of initramfs boot failure that I need
> > to investigate. However, usually I wait until the x86 churn is
> > finished, which is a lot later into the -rc cycle than this before
> > fixing up all the breakage.
> >
> > > Peter/James, what's the current status of x86/Voyager cleanups?
> >
> > The only outstanding problem I can see in 2.6.29 is a cpumask screw up
> > caused by Mike Travis ... it looks easily fixable, he just forgot to
> > convert voyager.
> >
> > I have to say that putting the SMP CPU definitions in cpu/common.c
> > hedged around with ifdefs for type looks really to be the wrong thing to
> > do. We already have compile selected files with these types, the
> > definition should be in there.
> >
> > > A couple of months ago i made a few suggestions about how to convert
> > > Voyager to the cleaner x86_quirks 'quirks HAL' (from the current
> > > fragile and hard and expensive to maintain 'compile time HAL'), but it
> > > didnt seem to go anywhere. See the discussion of this timeframe:
> > >
> > > http://lkml.org/lkml/2008/11/3/53
> > >
> > > The VisWS subarch (which was a similarly eccentric design that was
> > > only a PC in terms of having Intel CPUs) has been converted to
> > > CONFIG_X86_VISWS already, with arch/x86/kernel/visws_quirks.c holding
> > > the optional quirk handlers.
> > >
> > > The desired end result would be to have a CONFIG_X86_VOYAGER=y build
> > > mode that adds the quirk handlers to an otherwise generic kernel, with
> > > most of the quirks concentrated into a single
> > > arch/x86/kernel/voyager_quirks.c file - instead of having a full
> > > subarch for x86/Voyager. Both arch/x86/mach-voyager/ and
> > > arch/x86/include/asm/mach-voyager/ would go away in the end - because
> > > all functionality is merged into the generic code and the quirks would
> > > be in voyager_quirks.c.
> >
> > You appear to have forgotten that we already had this discussion here:
> >
> > http://marc.info/?t=122539020300002
>
> Why would i have forgotten that? We asked you to do those cleanups and
> offered help. AFAICS you have not submitted patches to that effect and you
> did not address the review feedback we gave you on your patches. If you
> have sent patches that implement my x86_quirks suggestions then please
> show me the URIs.
I addressed all the review issues ... even that thread you quote has my
reply as its last entry.
> > But to precis, the bottom line is that I'm concerned about the damage to
> > mainline x86 this would cause because voyager is a vastly different
> > beast. We'd be doubling at least the number of function pointer
> > indirections, plus the current quirk stuff is inadequate: voyager needs
> > boot time separation to handle the unique SUS maps and other things, so
> > there'd be a big intrusion into the boot system as well.
> >
> > > I'd be glad to lend a helping hand both with the patches and with testing
> > > on non-Voyager - especially the SMP bits probably need extensions on the
> > > x86_quirks side. (And i'm sure the other x86 maintainers would be glad to
> > > help out with this process too.)
> > >
> > > x86/Voyager is the last holdout in this area, and with an active kernel
> > > developer like James behind it it ought to be fixable - should James have
> > > the time/interest.
> >
> > But no-one's yet made any argument for why it's a worthwhile thing to be
> > doing.
>
> Because the sub-arch code is butt-ugly.
>
> x86 subarchitectures are a hack that should never have gone upstream - and
> we are now reversing that braindamage, step by step. Subarchitectures are
> a compile-time "HAL", but a highly non-transparent one at that. They
> complicate the x86 architecture in a couple of key structures and very
> fundamentally so - and that results in continued complications in critical
> areas of the x86 code.
Right: it's a compile time HAL. Your "cleanup" is to convert it to a
runtime one, which is a lot more complex because voyager still has to
influence the x86 path in those locations regardless: it's not an x86 PC
architecture and so is never going to be able to boot through an APIC/MP
table SMP boot sequence.
I'm not actually bothered by the complexity, though ... it would be cool
for me to have a kernel that boots on both voyager and a PC. What
worries me is the cost (both in terms of execution time and maintenance
burden) this would impose on the standard PC path.
The other thing I will point out is that if you think a runtime HAL
simplification could be applied to the current kernel, a compile time
HAL could equally well be done.
> One example where this shows up in full force in the Kconfig space. For
> example in arch/x86/Kconfig we have _more than 20_ VOYAGER quirks:
>
> select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
> select HAVE_ARCH_KGDB if !X86_VOYAGER
> def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
> depends on !SMP || !X86_VOYAGER
> depends on !X86_VOYAGER
> depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
> depends on (X86_32 && !X86_VOYAGER) || X86_64
> depends on !X86_VOYAGER
> depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
> depends on X86_VOYAGER
> depends on X86_MPPARSE || X86_VOYAGER
> depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
> depends on !X86_VOYAGER
> depends on !X86_VOYAGER
> depends on !X86_VOYAGER
> depends on !X86_VOYAGER
> depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
> depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
> depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
> depends on !X86_VOYAGER
> depends on SMP && HOTPLUG && !X86_VOYAGER
> depends on !X86_VOYAGER
> depends on !X86_VOYAGER
> bool "MCA support" if !X86_VOYAGER
> default y if X86_VOYAGER
> depends on !X86_VOYAGER
>
> The VISWS code was in a similar situation not so long ago. It was a quirky
> subarchitecture similar to Voyager: it had no standard PC compatibility at
> all, other than the use of Intel CPUs.
Hmm, but what that shows is that the PC configuration path is complex
anyway. Voyager doesn't really make that much more so (as shown by the
fact that most of those depends have things other than voyager in them).
> VISWS had about 20 quirks in arch/x86/Kconfig that needlessly complicated
> the picture there. It caused a number of build breakages and complicated
> development for many cycles.
>
> After we merged the VISWS subarch into the generic code its quirk count in
> arch/x86/Kconfig went down to _one_ only. The whole VISWS impact has
> dwindled down to almost zero. The difference is significant, and we'd like
> to see the same kind of cleanup happen with x86/Voyager too.
>
> There's countless of other areas where the elimination of subarchitectures
> simplifies the code and shrinks x86 maintenance costs.
Visws isn't very different from a standard PC ... it's really only about
how the apics get included and a few SGI specific things (like mem
configuration, reboot and power off).
I think the runtime piece is probably doable ... largely because
virtualisation has already cost us hugely via function pointers in this
area, so the extra additions to pull in the few voyager differences
might not be that noticeable.
I really don't see how the boot path can be simplified by this ... in
fact, I think it will grow in complexity.
So ... the probable way to get this to work is to boot x86 as a UP
system and then enable SMP later on (voyager can boot a standard UP
kernel because the SUS can emulate a UP PC) ... basically switch to SMP
using alternatives and then bring the CPUs up via hotplug. This
approach is really very different from the way x86 works today, where
the SMP structures are sprayed throughout the architecture specific
directory.
It can work, though ... hotplug bringup is how we boot parisc SMP as
well ... I just think it will be a lot of perturbation to x86.
> > > If there's no time/interest in that then we can temporarily mark
> > > Voyager CONFIG_BROKEN until cleanup/fix patches arrive.
> >
> > It's not broken and I've already sent you the cleanup/fix patches ... I
> > can send them directly to Linus as voyager maintainer if you prefer.
>
> Well, i'm NAK-ing those patches in their current form - x86/Voyager should
> be restructured like the other ex subarchitectures were done - or we'll
> have to mark it CONFIG_BROKEN until the right kind of patches arrive.
>
> Please send patches to the x86 maintainers and implement the cleanups we
> have asked for - or let us know if you don't have time/interest in doing
> so.
Look, I've already told you your quirks don't work because of the way
voyager boots.
However, voyager can make use of the current quirks you have ... it's
just there'll be a lot left over when that's all over, plus some pieces
of the infrastructure itself will need adjusting. So why don't we do
this: Fix what's broken now (or show me what needs adjusting about the
fixes) and I'll update voyager to use the current quirks infrastructure
(and update the infrastructure). Then we can both look at what's left
over when that's finished?
James
* James Bottomley <[email protected]> wrote:
> > > But no-one's yet made any argument for why it's a worthwhile thing
> > > to be doing.
> >
> > Because the sub-arch code is butt-ugly.
> >
> > x86 subarchitectures are a hack that should never have gone upstream -
> > and we are now reversing that braindamage, step by step.
> > Subarchitectures are a compile-time "HAL", but a highly
> > non-transparent one at that. They complicate the x86 architecture in
> > a couple of key structures and very fundamentally so - and that
> > results in continued complications in critical areas of the x86 code.
>
> Right: it's a compile time HAL. Your "cleanup" is to convert it to a
> runtime one, which is a lot more complex because voyager still has to
> influence the x86 path in those locations regardless: it's not an x86 PC
> architecture and so is never going to be able to boot through an APIC/MP
> table SMP boot sequence.
>
> I'm not actually bothered by the complexity, though ... it would be cool
> for me to have a kernel that boots on both voyager and a PC. What
> worries me is the cost (both in terms of execution time and maintenance
> burden) this would impose on the standard PC path.
That cost is negligible. Voyager already uses smp_ops which solves most of
the callbacks. The remaining non-slowpath (non-init, non-shutdown, etc.)
bits are, roughly:
flush_tlb_[all|current_task|mm|page]()
smp_apic_timer_interrupt
Which can either reuse PARAVIRT with a Voyager-specific custom template
for these methods, or, if that's easier, we can add x86_quirk handlers as
well.
There is near zero overhead for normal PCs, ~5 functions will have:
if (unlikely(x86_quirks.smp_apic_timer_interrupt)) {
... quirk-path ...
}
type of constructs.
> The other thing I will point out is that if you think a runtime HAL
> simplification could be applied to the current kernel, a compile time
> HAL could equally well be done.
There are numerous advantages. Just a few of them, from the top of my
head:
- The point is to not split the build space on such a fundamental level -
testing is way too complex already.
- Runtime quirks tend to be tested far more than build-time quirks: for
example because a test kernel can just include all the runtime quirks
all at once - while it cannot possibly include all the
subarchitectures.
- It is far more apparent from the source code what happens if quirks are
out and visible. A compile-time 'HAL' is far less transparent: it is
very easy to miss that a function is present in multiple copies and
behaves in different ways, dependent on which subarch we are building
for. Developers tend to concentrate on a single piece of code, so
consolidating code flows is a development advantage.
- Runtime quirks tend to be much more usable. A distro kernel can include
all the quirks with near zero impact. Now in the specific case of
x86/Voyager this is probably not a big factor: as you seem to be owning
the last two (one?) working Voyager box in existence that runs
development kernels? But it is a factor in the general push to
eliminate subarchitectures.
(For similarly good reasons most of the hw quirks in the normal Linux
driver space are done via runtime constructs and not via build-time
constructs.)
Anyway, none of this is really new. We eliminated the ES7000, the VISWS
and the RDC321X subarchitectures already - Voyager is the holdout.
Ingo