2008-06-04 00:31:40

by Mike Travis

[permalink] [raw]
Subject: [PATCH 3/4] x86_64: Fold pda into per cpu area

* Declare the pda as a per cpu variable.

* Make the x86_64 per cpu area start at zero.

* Since the pda is now the first element of the per_cpu area, cpu_pda()
is no longer needed and per_cpu() can be used instead. This also makes
the _cpu_pda[] table obsolete.

* Since %gs is pointing to the pda, it will then also point to the per cpu
variables and can be accessed thusly:

%gs:[&per_cpu_xxxx - __per_cpu_start]

Based on linux-2.6.tip

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Mike Travis <[email protected]>
---
arch/x86/Kconfig | 3 +
arch/x86/kernel/head64.c | 34 ++++++--------
arch/x86/kernel/irq_64.c | 36 ++++++++-------
arch/x86/kernel/setup.c | 90 ++++++++++++---------------------------
arch/x86/kernel/setup64.c | 5 --
arch/x86/kernel/smpboot.c | 51 ----------------------
arch/x86/kernel/traps_64.c | 11 +++-
arch/x86/kernel/vmlinux_64.lds.S | 1
include/asm-x86/percpu.h | 48 ++++++--------------
9 files changed, 89 insertions(+), 190 deletions(-)

--- linux-2.6.tip.orig/arch/x86/Kconfig
+++ linux-2.6.tip/arch/x86/Kconfig
@@ -129,6 +129,9 @@ config HAVE_SETUP_PER_CPU_AREA
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP

+config HAVE_ZERO_BASED_PER_CPU
+ def_bool X86_64_SMP
+
config ARCH_HIBERNATION_POSSIBLE
def_bool y
depends on !SMP || !X86_VOYAGER
--- linux-2.6.tip.orig/arch/x86/kernel/head64.c
+++ linux-2.6.tip/arch/x86/kernel/head64.c
@@ -25,20 +25,6 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -159,6 +145,20 @@ void __init x86_64_start_kernel(char * r
/* Cleanup the over mapped high alias */
cleanup_highmap();

+ /* point to boot pda which is the first element in the percpu area */
+ {
+ struct x8664_pda *pda;
+#ifdef CONFIG_SMP
+ pda = (struct x8664_pda *)__per_cpu_load;
+ pda->data_offset = per_cpu_offset(0) = (unsigned long)pda;
+#else
+ pda = &per_cpu(pda, 0);
+ pda->data_offset = (unsigned long)pda;
+#endif
+ }
+ /* initialize boot cpu_pda data */
+ pda_init(0);
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -170,12 +170,6 @@ void __init x86_64_start_kernel(char * r

early_printk("Kernel alive\n");

- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-
- early_printk("Kernel really alive\n");
-
copy_bootdata(__va(real_mode_data));

reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
--- linux-2.6.tip.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6.tip/arch/x86/kernel/irq_64.c
@@ -115,39 +115,43 @@ skip:
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
+ seq_printf(p, "%10u ", per_cpu(pda.__nmi_count, j));
seq_printf(p, " Non-maskable interrupts\n");
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
+ seq_printf(p, "%10u ", per_cpu(pda.apic_timer_irqs, j));
seq_printf(p, " Local timer interrupts\n");
#ifdef CONFIG_SMP
seq_printf(p, "RES: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_resched_count, j));
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
+ seq_printf(p, "%10u ", per_cpu(pda.irq_call_count, j));
seq_printf(p, " function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
+ seq_printf(p, "%10u ", per_cpu(pda.irq_tlb_count, j));
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_thermal_count, j));
seq_printf(p, " Thermal event interrupts\n");
seq_printf(p, "THR: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_threshold_count, j));
seq_printf(p, " Threshold APIC interrupts\n");
#endif
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_spurious_count, j));
seq_printf(p, " Spurious interrupts\n");
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
}
@@ -159,19 +163,19 @@ skip:
*/
u64 arch_irq_stat_cpu(unsigned int cpu)
{
- u64 sum = cpu_pda(cpu)->__nmi_count;
+ u64 sum = per_cpu(pda.__nmi_count, cpu);

- sum += cpu_pda(cpu)->apic_timer_irqs;
+ sum += per_cpu(pda.apic_timer_irqs, cpu);
#ifdef CONFIG_SMP
- sum += cpu_pda(cpu)->irq_resched_count;
- sum += cpu_pda(cpu)->irq_call_count;
- sum += cpu_pda(cpu)->irq_tlb_count;
+ sum += per_cpu(pda.irq_resched_count, cpu);
+ sum += per_cpu(pda.irq_call_count, cpu);
+ sum += per_cpu(pda.irq_tlb_count, cpu);
#endif
#ifdef CONFIG_X86_MCE
- sum += cpu_pda(cpu)->irq_thermal_count;
- sum += cpu_pda(cpu)->irq_threshold_count;
+ sum += per_cpu(pda.irq_thermal_count, cpu);
+ sum += per_cpu(pda.irq_threshold_count, cpu);
#endif
- sum += cpu_pda(cpu)->irq_spurious_count;
+ sum += per_cpu(pda.irq_spurious_count, cpu);
return sum;
}

--- linux-2.6.tip.orig/arch/x86/kernel/setup.c
+++ linux-2.6.tip/arch/x86/kernel/setup.c
@@ -29,6 +29,11 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_a
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);

+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU_FIRST(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
+#endif
+
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
#define X86_64_NUMA 1

@@ -47,7 +52,7 @@ static void __init setup_node_to_cpumask
static inline void setup_node_to_cpumask_map(void) { }
#endif

-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_SMP)
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Copy data used in early init routines from the initial arrays to the
* per cpu data areas. These arrays then become expendable and the
@@ -94,64 +99,9 @@ static void __init setup_cpumask_of_cpu(
static inline void setup_cpumask_of_cpu(void) { }
#endif

-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }

- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
-#endif
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
void __init setup_per_cpu_areas(void)
{
ssize_t size = PERCPU_ENOUGH_ROOM;
@@ -164,9 +114,6 @@ void __init setup_per_cpu_areas(void)
nr_cpu_ids = num_processors;
#endif

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
size = PERCPU_ENOUGH_ROOM;
printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
@@ -186,9 +133,28 @@ void __init setup_per_cpu_areas(void)
else
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
#endif
+ /* Initialize each cpu's per_cpu area and save pointer */
+ memcpy(ptr, __per_cpu_load, __per_cpu_size);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);

+#ifdef CONFIG_X86_64
+	/*
+	 * Note the boot cpu has been using the static per_cpu load
+	 * area for its pda.  We need to zero out the pdas for the
+	 * other cpus that are coming online.
+	 */
+ {
+ /* we rely on the fact that pda is the first element */
+ struct x8664_pda *pda = (struct x8664_pda *)ptr;
+
+ if (cpu)
+ memset(pda, 0, sizeof(struct x8664_pda));
+ else
+ pda_init(0);
+
+ pda->data_offset = (unsigned long)ptr;
+ }
+#endif
}

printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -240,8 +206,8 @@ void __cpuinit numa_set_node(int cpu, in
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ if (per_cpu_offset(cpu))
+ per_cpu(pda.nodenumber, cpu) = node;

if (cpu_to_node_map)
cpu_to_node_map[cpu] = node;
--- linux-2.6.tip.orig/arch/x86/kernel/setup64.c
+++ linux-2.6.tip/arch/x86/kernel/setup64.c
@@ -35,9 +35,6 @@ struct boot_params boot_params;

cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
@@ -89,7 +86,7 @@ __setup("noexec32=", nonx32_setup);

void pda_init(int cpu)
{
- struct x8664_pda *pda = cpu_pda(cpu);
+ struct x8664_pda *pda = &per_cpu(pda, cpu);

/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
--- linux-2.6.tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6.tip/arch/x86/kernel/smpboot.c
@@ -798,45 +798,6 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-static int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, size);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -860,14 +821,6 @@ static int __cpuinit do_boot_cpu(int api
printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
return -1;
}
-
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
#endif

alternatives_smp_switch(1);
@@ -908,7 +861,7 @@ do_rest:
stack_start.sp = (void *) c_idle.idle->thread.sp;
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
+ per_cpu(pda.pcurrent, cpu) = c_idle.idle;
init_rsp = c_idle.idle->thread.sp;
load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
initial_code = (unsigned long)start_secondary;
@@ -985,8 +938,6 @@ do_rest:
}
}

-restore_state:
-
if (boot_error) {
/* Try to put things back the way they were before ... */
unmap_cpu_to_logical_apicid(cpu);
--- linux-2.6.tip.orig/arch/x86/kernel/traps_64.c
+++ linux-2.6.tip/arch/x86/kernel/traps_64.c
@@ -265,7 +265,8 @@ void dump_trace(struct task_struct *tsk,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end =
+ (unsigned long*)per_cpu(pda.irqstackptr, cpu);
unsigned used = 0;
struct thread_info *tinfo;

@@ -399,8 +400,10 @@ _show_stack(struct task_struct *tsk, str
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end =
+ (unsigned long *)per_cpu(pda.irqstackptr, cpu);
+ unsigned long *irqstack =
+ (unsigned long *)(per_cpu(pda.irqstackptr, cpu) - IRQSTACKSIZE);

// debugging aid: "show_stack(NULL, NULL);" prints the
// back trace for this cpu.
@@ -464,7 +467,7 @@ void show_registers(struct pt_regs *regs
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = __get_cpu_var(pda.pcurrent);
u8 *ip;
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;
--- linux-2.6.tip.orig/arch/x86/kernel/vmlinux_64.lds.S
+++ linux-2.6.tip/arch/x86/kernel/vmlinux_64.lds.S
@@ -16,6 +16,7 @@ jiffies_64 = jiffies;
_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
+ percpu PT_LOAD FLAGS(4); /* R__ */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -3,26 +3,20 @@

#ifdef CONFIG_X86_64
#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
+#ifdef CONFIG_SMP
+#define __my_cpu_offset (x86_read_percpu(pda.data_offset))
+#define __percpu_seg "%%gs:"
+#else
+#define __percpu_seg ""
#endif
+
#include <asm-generic/percpu.h>

DECLARE_PER_CPU(struct x8664_pda, pda);

-#else /* CONFIG_X86_64 */
+#else /* !CONFIG_X86_64 */

#ifdef __ASSEMBLY__

@@ -51,36 +45,23 @@ DECLARE_PER_CPU(struct x8664_pda, pda);

#else /* ...!ASSEMBLY */

-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- * var - variable name
- * cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
- */
#ifdef CONFIG_SMP
-
#define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
-
-#else /* !SMP */
-
+#else
#define __percpu_seg ""
-
-#endif /* SMP */
+#endif

#include <asm-generic/percpu.h>

/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -215,7 +196,6 @@ do { \
percpu_cmpxchg_op(per_cpu_var(var), old, new)

#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP


--


2008-06-04 13:00:45

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> * Declare the pda as a per cpu variable.
>
> * Make the x86_64 per cpu area start at zero.
>
> * Since the pda is now the first element of the per_cpu area, cpu_pda()
> is no longer needed and per_cpu() can be used instead. This also makes
> the _cpu_pda[] table obsolete.
>
> * Since %gs is pointing to the pda, it will then also point to the per cpu
> variables and can be accessed thusly:
>
> %gs:[&per_cpu_xxxx - __per_cpu_start]
>

Unfortunately that doesn't actually work, because you can't have a reloc
with two variables.

In something like:

mov %gs:per_cpu__foo - 12345, %rax
mov %gs:per_cpu__foo, %rax
mov %gs:per_cpu__foo - 12345(%rip), %rax
mov %gs:per_cpu__foo(%rip), %rax
mov %gs:per_cpu__foo - __per_cpu_start, %rax
mov %gs:per_cpu__foo - __per_cpu_start(%rip), %rax

the last two lines will not assemble:

t.S:5: Error: can't resolve `per_cpu__foo' {*UND* section} - `__per_cpu_start' {*UND* section}
t.S:6: Error: can't resolve `per_cpu__foo' {*UND* section} - `__per_cpu_start' {*UND* section}

Unfortunately, the only way I can think of fixing this is to compute the
offset into a temp register, then use that:

lea per_cpu__foo(%rip), %rax
mov %gs:__per_cpu_offset(%rax), %rax

(where __per_cpu_offset is defined in the linker script as
-__per_cpu_start).

This seems to be a general problem with zero-offset per-cpu. And its
unfortunate, because no-register access to per-cpu variables is nice to
have.

The other alternative - and I have no idea whether this is practical or
possible - is to define a complete set of pre-offset per_cpu symbols.

J

2008-06-04 13:48:44

by Mike Travis

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> * Declare the pda as a per cpu variable.
>>
>> * Make the x86_64 per cpu area start at zero.
>>
>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>> is no longer needed and per_cpu() can be used instead. This also
>> makes
>> the _cpu_pda[] table obsolete.
>>
>> * Since %gs is pointing to the pda, it will then also point to the
>> per cpu
>> variables and can be accessed thusly:
>>
>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>


The above is only a partial story (I folded the two patches but didn't
update the comments correctly.) The variables are already offset from
__per_cpu_start by virtue of the .data.percpu section being based at
zero. Therefore only the %gs register needs to be set to the base of
each cpu's percpu section to resolve the target address:

%gs:&per_cpu_xxxx

And the .data.percpu.first forces the pda percpu variable to the front.


>
> Unfortunately that doesn't actually work, because you can't have a reloc
> with two variables.
>
> In something like:
>
> mov %gs:per_cpu__foo - 12345, %rax
> mov %gs:per_cpu__foo, %rax
> mov %gs:per_cpu__foo - 12345(%rip), %rax
> mov %gs:per_cpu__foo(%rip), %rax
> mov %gs:per_cpu__foo - __per_cpu_start, %rax
> mov %gs:per_cpu__foo - __per_cpu_start(%rip), %rax
>
> the last two lines will not assemble:
>
> t.S:5: Error: can't resolve `per_cpu__foo' {*UND* section} -
> `__per_cpu_start' {*UND* section}
> t.S:6: Error: can't resolve `per_cpu__foo' {*UND* section} -
> `__per_cpu_start' {*UND* section}
>
> Unfortunately, the only way I can think of fixing this is to compute the
> offset into a temp register, then use that:
>
> lea per_cpu__foo(%rip), %rax
> mov %gs:__per_cpu_offset(%rax), %rax
>
> (where __per_cpu_offset is defined in the linker script as
> -__per_cpu_start).
>
> This seems to be a general problem with zero-offset per-cpu. And its
> unfortunate, because no-register access to per-cpu variables is nice to
> have.
>
> The other alternative - and I have no idea whether this is practical or
> possible - is to define a complete set of pre-offset per_cpu symbols.
>
> J

2008-06-04 13:58:59

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
>
>> Mike Travis wrote:
>>
>>> * Declare the pda as a per cpu variable.
>>>
>>> * Make the x86_64 per cpu area start at zero.
>>>
>>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>>> is no longer needed and per_cpu() can be used instead. This also
>>> makes
>>> the _cpu_pda[] table obsolete.
>>>
>>> * Since %gs is pointing to the pda, it will then also point to the
>>> per cpu
>>> variables and can be accessed thusly:
>>>
>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>
>>>
>
>
> The above is only a partial story (I folded the two patches but didn't
> update the comments correctly.] The variables are already offset from
> __per_cpu_start by virtue of the .data.percpu section being based at
> zero. Therefore only the %gs register needs to be set to the base of
> each cpu's percpu section to resolve the target address:
>
> %gs:&per_cpu_xxxx
>

Oh, good. I'd played with trying to make that work at one point, and
got lost in linker bugs and/or random version-specific strangeness.

J

2008-06-04 14:17:20

by Mike Travis

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> Jeremy Fitzhardinge wrote:
>>
>>> Mike Travis wrote:
>>>
>>>> * Declare the pda as a per cpu variable.
>>>>
>>>> * Make the x86_64 per cpu area start at zero.
>>>>
>>>> * Since the pda is now the first element of the per_cpu area,
>>>> cpu_pda()
>>>> is no longer needed and per_cpu() can be used instead. This also
>>>> makes
>>>> the _cpu_pda[] table obsolete.
>>>>
>>>> * Since %gs is pointing to the pda, it will then also point to the
>>>> per cpu
>>>> variables and can be accessed thusly:
>>>>
>>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>>
>>>>
>>
>> The above is only a partial story (I folded the two patches but didn't
>> update the comments correctly.] The variables are already offset from
>> __per_cpu_start by virtue of the .data.percpu section being based at
>> zero. Therefore only the %gs register needs to be set to the base of
>> each cpu's percpu section to resolve the target address:
>>
>> %gs:&per_cpu_xxxx
>>
>
> Oh, good. I'd played with trying to make that work at one point, and
> got lost in linker bugs and/or random version-specific strangeness.
> J

Incidentally, this is why the following load is needed in x86_64_start_kernel():

pda = (struct x8664_pda *)__per_cpu_load;
pda->data_offset = per_cpu_offset(0) = (unsigned long)pda;

/* initialize boot cpu_pda data */
pda_init(0);

pda_init() loads the %gs reg so early accesses to the static per_cpu section
can be executed before the percpu areas are allocated.

2008-06-05 10:23:16

by Ingo Molnar

[permalink] [raw]
Subject: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area


* Mike Travis <[email protected]> wrote:

> * Declare the pda as a per cpu variable.
>
> * Make the x86_64 per cpu area start at zero.
>
> * Since the pda is now the first element of the per_cpu area, cpu_pda()
> is no longer needed and per_cpu() can be used instead. This also makes
> the _cpu_pda[] table obsolete.
>
> * Since %gs is pointing to the pda, it will then also point to the per cpu
> variables and can be accessed thusly:
>
> %gs:[&per_cpu_xxxx - __per_cpu_start]
>
> Based on linux-2.6.tip

-tip testing found an instantaneous reboot crash on 64-bit x86, with
this config:

http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad

there is no boot log as the instantaneous reboot happens before anything
is printed to the (early-) serial console. I have bisected it down to:

| 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
| commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
| Author: Mike Travis <[email protected]>
| Date: Tue Jun 3 17:30:21 2008 -0700
|
| x86_64: Fold pda into per cpu area

the big problem is not just this crash, but that the patch is _way_ too
big:

arch/x86/Kconfig | 3 +
arch/x86/kernel/head64.c | 34 ++++++--------
arch/x86/kernel/irq_64.c | 36 ++++++++-------
arch/x86/kernel/setup.c | 90 ++++++++++++---------------------------
arch/x86/kernel/setup64.c | 5 --
arch/x86/kernel/smpboot.c | 51 ----------------------
arch/x86/kernel/traps_64.c | 11 +++-
arch/x86/kernel/vmlinux_64.lds.S | 1
include/asm-x86/percpu.h | 48 ++++++--------------
9 files changed, 89 insertions(+), 190 deletions(-)

considering the danger involved, this is just way too large, and there's
no reasonable debugging i can do in the bisection to narrow it down any
further.

Please resubmit with the bug fixed and with a proper splitup, the more
patches you manage to create, the better. For a dangerous code area like
this, with a track record of frequent breakages in the past, i would not
mind a "one line of code changed per patch" splitup either. (Feel free
to send a git tree link for us to try as well.)

Ingo

2008-06-05 16:02:29

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Ingo Molnar wrote:
> * Mike Travis <[email protected]> wrote:
>
>> * Declare the pda as a per cpu variable.
>>
>> * Make the x86_64 per cpu area start at zero.
>>
>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>> is no longer needed and per_cpu() can be used instead. This also makes
>> the _cpu_pda[] table obsolete.
>>
>> * Since %gs is pointing to the pda, it will then also point to the per cpu
>> variables and can be accessed thusly:
>>
>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>
>> Based on linux-2.6.tip
>
> -tip testing found an instantaneous reboot crash on 64-bit x86, with
> this config:
>
> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>
> there is no boot log as the instantaneous reboot happens before anything
> is printed to the (early-) serial console. I have bisected it down to:
>
> | 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
> | commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
> | Author: Mike Travis <[email protected]>
> | Date: Tue Jun 3 17:30:21 2008 -0700
> |
> | x86_64: Fold pda into per cpu area
>
> the big problem is not just this crash, but that the patch is _way_ too
> big:
>
> arch/x86/Kconfig | 3 +
> arch/x86/kernel/head64.c | 34 ++++++--------
> arch/x86/kernel/irq_64.c | 36 ++++++++-------
> arch/x86/kernel/setup.c | 90 ++++++++++++---------------------------
> arch/x86/kernel/setup64.c | 5 --
> arch/x86/kernel/smpboot.c | 51 ----------------------
> arch/x86/kernel/traps_64.c | 11 +++-
> arch/x86/kernel/vmlinux_64.lds.S | 1
> include/asm-x86/percpu.h | 48 ++++++--------------
> 9 files changed, 89 insertions(+), 190 deletions(-)
>
> considering the danger involved, this is just way too large, and there's
> no reasonable debugging i can do in the bisection to narrow it down any
> further.
>
> Please resubmit with the bug fixed and with a proper splitup, the more
> patches you manage to create, the better. For a dangerous code area like
> this, with a track record of frequent breakages in the past, i would not
> mind a "one line of code changed per patch" splitup either. (Feel free
> to send a git tree link for us to try as well.)
>
> Ingo

Thanks for the feedback Ingo. I'll test the above config and look at
splitting up the patch. The difficulty is making each patch independently
compilable and testable.

Mike

2008-06-06 08:30:17

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Ingo Molnar wrote:
>
>> * Mike Travis <[email protected]> wrote:
>>
>>
>>> * Declare the pda as a per cpu variable.
>>>
>>> * Make the x86_64 per cpu area start at zero.
>>>
>>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>>> is no longer needed and per_cpu() can be used instead. This also makes
>>> the _cpu_pda[] table obsolete.
>>>
>>> * Since %gs is pointing to the pda, it will then also point to the per cpu
>>> variables and can be accessed thusly:
>>>
>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>
>>> Based on linux-2.6.tip
>>>
>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>> this config:
>>
>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>
>> there is no boot log as the instantaneous reboot happens before anything
>> is printed to the (early-) serial console. I have bisected it down to:
>>
>> | 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
>> | commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
>> | Author: Mike Travis <[email protected]>
>> | Date: Tue Jun 3 17:30:21 2008 -0700
>> |
>> | x86_64: Fold pda into per cpu area
>>
>> the big problem is not just this crash, but that the patch is _way_ too
>> big:
>>
>> arch/x86/Kconfig | 3 +
>> arch/x86/kernel/head64.c | 34 ++++++--------
>> arch/x86/kernel/irq_64.c | 36 ++++++++-------
>> arch/x86/kernel/setup.c | 90 ++++++++++++---------------------------
>> arch/x86/kernel/setup64.c | 5 --
>> arch/x86/kernel/smpboot.c | 51 ----------------------
>> arch/x86/kernel/traps_64.c | 11 +++-
>> arch/x86/kernel/vmlinux_64.lds.S | 1
>> include/asm-x86/percpu.h | 48 ++++++--------------
>> 9 files changed, 89 insertions(+), 190 deletions(-)
>>
>> considering the danger involved, this is just way too large, and there's
>> no reasonable debugging i can do in the bisection to narrow it down any
>> further.
>>
>> Please resubmit with the bug fixed and with a proper splitup, the more
>> patches you manage to create, the better. For a dangerous code area like
>> this, with a track record of frequent breakages in the past, i would not
>> mind a "one line of code changed per patch" splitup either. (Feel free
>> to send a git tree link for us to try as well.)
>>
>> Ingo
>>
>
> Thanks for the feedback Ingo. I'll test the above config and look at
> splitting up the patch. The difficulty is making each patch independently
> compilable and testable.

FWIW, I'm getting past the "crashes very, very early" stage with this
series applied when booting under Xen. Then it crashes pretty early,
but that's not your fault...

J

2008-06-06 13:15:33

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> Ingo Molnar wrote:
>>
>>> * Mike Travis <[email protected]> wrote:
>>>
>>>
>>>> * Declare the pda as a per cpu variable.
>>>>
>>>> * Make the x86_64 per cpu area start at zero.
>>>>
>>>> * Since the pda is now the first element of the per_cpu area,
>>>> cpu_pda()
>>>> is no longer needed and per_cpu() can be used instead. This
>>>> also makes
>>>> the _cpu_pda[] table obsolete.
>>>>
>>>> * Since %gs is pointing to the pda, it will then also point to the
>>>> per cpu
>>>> variables and can be accessed thusly:
>>>>
>>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>>
>>>> Based on linux-2.6.tip
>>>>
>>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>>> this config:
>>>
>>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>>
>>> there is no boot log as the instantaneous reboot happens before
>>> anything is printed to the (early-) serial console. I have bisected
>>> it down to:
>>>
>>> | 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
>>> | commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
>>> | Author: Mike Travis <[email protected]>
>>> | Date: Tue Jun 3 17:30:21 2008 -0700
>>> |
>>> | x86_64: Fold pda into per cpu area
>>>
>>> the big problem is not just this crash, but that the patch is _way_
>>> too big:
>>>
>>> arch/x86/Kconfig | 3 +
>>> arch/x86/kernel/head64.c | 34 ++++++--------
>>> arch/x86/kernel/irq_64.c | 36 ++++++++-------
>>> arch/x86/kernel/setup.c | 90
>>> ++++++++++++---------------------------
>>> arch/x86/kernel/setup64.c | 5 --
>>> arch/x86/kernel/smpboot.c | 51 ----------------------
>>> arch/x86/kernel/traps_64.c | 11 +++-
>>> arch/x86/kernel/vmlinux_64.lds.S | 1
>>> include/asm-x86/percpu.h | 48 ++++++--------------
>>> 9 files changed, 89 insertions(+), 190 deletions(-)
>>>
>>> considering the danger involved, this is just way too large, and
>>> there's no reasonable debugging i can do in the bisection to narrow
>>> it down any further.
>>>
>>> Please resubmit with the bug fixed and with a proper splitup, the
>>> more patches you manage to create, the better. For a dangerous code
>>> area like this, with a track record of frequent breakages in the
>>> past, i would not mind a "one line of code changed per patch" splitup
>>> either. (Feel free to send a git tree link for us to try as well.)
>>>
>>> Ingo
>>>
>>
>> Thanks for the feedback Ingo. I'll test the above config and look at
>> splitting up the patch. The difficulty is making each patch
>> independently
>> compilable and testable.
>
> FWIW, I'm getting past the "crashes very, very early" stage with this
> series applied when booting under Xen. Then it crashes pretty early,
> but that's not your fault...
>
> J

Hi Jeremy,

Yes we have a simulator for Nehalem that also breezes past the boot up
problem (actually makes it to the kernel login prompt.) Weirdly, the
problem doesn't exist in an earlier code base so my changes are tickling
something else newly introduced. I'm attempting to see if I can use
GRUB 2 with the GDB stubs to track it down (which is time-consuming in
itself to set up.)

It is definitely related to basing percpu variable offsets from %gs and
(I think) interrupts.

Thanks,
Mike

2008-06-09 23:18:43

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

On Wed, 4 Jun 2008, Jeremy Fitzhardinge wrote:

> > %gs:[&per_cpu_xxxx - __per_cpu_start]
> >
>
> Unfortunately that doesn't actually work, because you can't have a reloc with
> two variables.

That is just a conceptual discussion. __per_cpu_start is 0 with the zero
based patch. And thus this reduces to

%gs[&per_cpu_xxx]

2008-06-10 21:31:27

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Ingo Molnar wrote:
> * Mike Travis <[email protected]> wrote:
>
>> * Declare the pda as a per cpu variable.
>>
>> * Make the x86_64 per cpu area start at zero.
>>
>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>> is no longer needed and per_cpu() can be used instead. This also makes
>> the _cpu_pda[] table obsolete.
>>
>> * Since %gs is pointing to the pda, it will then also point to the per cpu
>> variables and can be accessed thusly:
>>
>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>
>> Based on linux-2.6.tip
>
> -tip testing found an instantaneous reboot crash on 64-bit x86, with
> this config:
>
> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad

I'm still stuck on this one. One new development is that the current -tip
branch without the patches boots to the kernel prompt then hangs after a few
moments and then reboots. It seems you can tickle it using ^C to abort a
process.

-Mike

>
> there is no boot log as the instantaneous reboot happens before anything
> is printed to the (early-) serial console. I have bisected it down to:
>
> | 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
> | commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
> | Author: Mike Travis <[email protected]>
> | Date: Tue Jun 3 17:30:21 2008 -0700
> |
> | x86_64: Fold pda into per cpu area
>
> the big problem is not just this crash, but that the patch is _way_ too
> big:
>
> arch/x86/Kconfig | 3 +
> arch/x86/kernel/head64.c | 34 ++++++--------
> arch/x86/kernel/irq_64.c | 36 ++++++++-------
> arch/x86/kernel/setup.c | 90 ++++++++++++---------------------------
> arch/x86/kernel/setup64.c | 5 --
> arch/x86/kernel/smpboot.c | 51 ----------------------
> arch/x86/kernel/traps_64.c | 11 +++-
> arch/x86/kernel/vmlinux_64.lds.S | 1
> include/asm-x86/percpu.h | 48 ++++++--------------
> 9 files changed, 89 insertions(+), 190 deletions(-)
>
> considering the danger involved, this is just way too large, and there's
> no reasonable debugging i can do in the bisection to narrow it down any
> further.
>
> Please resubmit with the bug fixed and with a proper splitup, the more
> patches you manage to create, the better. For a dangerous code area like
> this, with a track record of frequent breakages in the past, i would not
> mind a "one line of code changed per patch" splitup either. (Feel free
> to send a git tree link for us to try as well.)
>
> Ingo

2008-06-18 05:35:39

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
>
>> Mike Travis wrote:
>>
>>> Ingo Molnar wrote:
>>>
>>>
>>>> * Mike Travis <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>> * Declare the pda as a per cpu variable.
>>>>>
>>>>> * Make the x86_64 per cpu area start at zero.
>>>>>
>>>>> * Since the pda is now the first element of the per_cpu area,
>>>>> cpu_pda()
>>>>> is no longer needed and per_cpu() can be used instead. This
>>>>> also makes
>>>>> the _cpu_pda[] table obsolete.
>>>>>
>>>>> * Since %gs is pointing to the pda, it will then also point to the
>>>>> per cpu
>>>>> variables and can be accessed thusly:
>>>>>
>>>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>>>
>>>>> Based on linux-2.6.tip
>>>>>
>>>>>
>>>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>>>> this config:
>>>>
>>>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>>>
>>>> there is no boot log as the instantaneous reboot happens before
>>>> anything is printed to the (early-) serial console. I have bisected
>>>> it down to:
>>>>
>>>> | 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f is first bad commit
>>>> | commit 7670dc09e89a2b151a1cf49eccebc07c41c2ce9f
>>>> | Author: Mike Travis <[email protected]>
>>>> | Date: Tue Jun 3 17:30:21 2008 -0700
>>>> |
>>>> | x86_64: Fold pda into per cpu area
>>>>
>>>> the big problem is not just this crash, but that the patch is _way_
>>>> too big:
>>>>
>>>> arch/x86/Kconfig | 3 +
>>>> arch/x86/kernel/head64.c | 34 ++++++--------
>>>> arch/x86/kernel/irq_64.c | 36 ++++++++-------
>>>> arch/x86/kernel/setup.c | 90
>>>> ++++++++++++---------------------------
>>>> arch/x86/kernel/setup64.c | 5 --
>>>> arch/x86/kernel/smpboot.c | 51 ----------------------
>>>> arch/x86/kernel/traps_64.c | 11 +++-
>>>> arch/x86/kernel/vmlinux_64.lds.S | 1
>>>> include/asm-x86/percpu.h | 48 ++++++--------------
>>>> 9 files changed, 89 insertions(+), 190 deletions(-)
>>>>
>>>> considering the danger involved, this is just way too large, and
>>>> there's no reasonable debugging i can do in the bisection to narrow
>>>> it down any further.
>>>>
>>>> Please resubmit with the bug fixed and with a proper splitup, the
>>>> more patches you manage to create, the better. For a dangerous code
>>>> area like this, with a track record of frequent breakages in the
>>>> past, i would not mind a "one line of code changed per patch" splitup
>>>> either. (Feel free to send a git tree link for us to try as well.)
>>>>
>>>> Ingo
>>>>
>>>>
>>> Thanks for the feedback Ingo. I'll test the above config and look at
>>> splitting up the patch. The difficulty is making each patch
>>> independently
>>> compilable and testable.
>>>
>> FWIW, I'm getting past the "crashes very, very early" stage with this
>> series applied when booting under Xen. Then it crashes pretty early,
>> but that's not your fault...
>>
>> J
>>
>
> Hi Jeremy,
>
> Yes we have a simulator for Nehalem that also breezes past the boot up
> problem (actually makes it to the kernel login prompt.) Weirdly, the
> problem doesn't exist in an earlier code base so my changes are tickling
> something else newly introduced. I'm attempting to see if I can use
> GRUB 2 with the GDB stubs to track it down (which is time consuming in
> itself to setup.)
>
> It is definitely related to basing percpu variable offsets from %gs and
> (I think) interrupts.
>

Hi Mike,

Have you made any progress on this? I'm bumping up against it when I
run on native hardware (as opposed to under Xen).

J

2008-06-18 17:37:32

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Ingo Molnar wrote:
>
>> * Mike Travis <[email protected]> wrote:
>>
>>
>>> * Declare the pda as a per cpu variable.
>>>
>>> * Make the x86_64 per cpu area start at zero.
>>>
>>> * Since the pda is now the first element of the per_cpu area, cpu_pda()
>>> is no longer needed and per_cpu() can be used instead. This also makes
>>> the _cpu_pda[] table obsolete.
>>>
>>> * Since %gs is pointing to the pda, it will then also point to the per cpu
>>> variables and can be accessed thusly:
>>>
>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>
>>> Based on linux-2.6.tip
>>>
>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>> this config:
>>
>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>
>
> I'm still stuck on this one. One new development is that the current -tip
> branch without the patches boots to the kernel prompt then hangs after a few
> moments and then reboots. It seems you can tickle it using ^C to abort a
> process.

Hi Mike,

I added some instrumentation to Xen to print the cpu state on
triple-fault, which highlights an obvious-looking problem.

(XEN) hvm.c:767:d1 Triple fault on VCPU0 - invoking HVM system reset.
(XEN) ----[ Xen-3.3-unstable x86_64 debug=y Not tainted ]----
(XEN) CPU: 1
(XEN) RIP: 0010:[<ffffffff80200160>]
(XEN) RFLAGS: 0000000000010002 CONTEXT: hvm
(XEN) rax: 0000000000000018 rbx: 0000000000000000 rcx: 00000000c0000080
(XEN) rdx: 0000000000000000 rsi: 0000000000092f40 rdi: 0000000020100800
(XEN) rbp: 0000000000000000 rsp: ffffffff807dfff8 r8: 0000000000208000
(XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 00000000000000de
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 0000000080050033 cr4: 00000000000000a0
(XEN) cr3: 0000000000201000 cr2: 0000000000000000
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: 0010

The rip is:

(gdb) x/i 0xffffffff80200160
0xffffffff80200160 <secondary_startup_64+96>: movl %eax,%ds

which is:

lgdt early_gdt_descr(%rip)

/* set up data segments. actually 0 would do too */
movl $__KERNEL_DS,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es

And early_gdt_descr is:

.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
.quad per_cpu__gdt_page

and per_cpu__gdt_page is zero-based, and therefore not a directly
addressable symbol.

I tried this patch, but it didn't work. Perhaps I'm missing something.

diff -r bf5a46e13f78 arch/x86/kernel/head_64.S
--- a/arch/x86/kernel/head_64.S Tue Jun 17 22:10:51 2008 -0700
+++ b/arch/x86/kernel/head_64.S Wed Jun 18 10:34:24 2008 -0700
@@ -94,6 +94,8 @@

addq %rbp, level2_fixmap_pgt + (506*8)(%rip)

+ addq $__per_cpu_load, early_gdt_descr+2(%rip)
+
/* Add an Identity mapping if I am above 1G */
leaq _text(%rip), %rdi
andq $PMD_PAGE_MASK, %rdi


J

2008-06-18 18:17:18

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> Ingo Molnar wrote:
>>
>>> * Mike Travis <[email protected]> wrote:
>>>
>>>
>>>> * Declare the pda as a per cpu variable.
>>>>
>>>> * Make the x86_64 per cpu area start at zero.
>>>>
>>>> * Since the pda is now the first element of the per_cpu area,
>>>> cpu_pda()
>>>> is no longer needed and per_cpu() can be used instead. This
>>>> also makes
>>>> the _cpu_pda[] table obsolete.
>>>>
>>>> * Since %gs is pointing to the pda, it will then also point to the
>>>> per cpu
>>>> variables and can be accessed thusly:
>>>>
>>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>>
>>>> Based on linux-2.6.tip
>>>>
>>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>>> this config:
>>>
>>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>>
>>
>> I'm still stuck on this one. One new development is that the current
>> -tip
>> branch without the patches boots to the kernel prompt then hangs after
>> a few
>> moments and then reboots. It seems you can tickle it using ^C to abort a
>> process.
>
> Hi Mike,
>
> I added some instrumentation to Xen to print the cpu state on
> triple-fault, which highlights an obvious-looking problem.
>
> (XEN) hvm.c:767:d1 Triple fault on VCPU0 - invoking HVM system reset.
> (XEN) ----[ Xen-3.3-unstable x86_64 debug=y Not tainted ]----
> (XEN) CPU: 1
> (XEN) RIP: 0010:[<ffffffff80200160>]
> (XEN) RFLAGS: 0000000000010002 CONTEXT: hvm
> (XEN) rax: 0000000000000018 rbx: 0000000000000000 rcx: 00000000c0000080
> (XEN) rdx: 0000000000000000 rsi: 0000000000092f40 rdi: 0000000020100800
> (XEN) rbp: 0000000000000000 rsp: ffffffff807dfff8 r8: 0000000000208000
> (XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 00000000000000de
> (XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
> (XEN) r15: 0000000000000000 cr0: 0000000080050033 cr4: 00000000000000a0
> (XEN) cr3: 0000000000201000 cr2: 0000000000000000
> (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: 0010
>
> The rip is:
>
> (gdb) x/i 0xffffffff80200160
> 0xffffffff80200160 <secondary_startup_64+96>: movl %eax,%ds
>
> which is:
>
> lgdt early_gdt_descr(%rip)
>
> /* set up data segments. actually 0 would do too */
> movl $__KERNEL_DS,%eax
> movl %eax,%ds
> movl %eax,%ss
> movl %eax,%es
>
> And early_gdt_descr is:
>
> .globl early_gdt_descr
> early_gdt_descr:
> .word GDT_ENTRIES*8-1
> .quad per_cpu__gdt_page
>
> and per_cpu__gdt_page is zero-based, and therefore not a directly
> addressable symbol.
>
> I tried this patch, but it didn't work. Perhaps I'm missing something.
>
> diff -r bf5a46e13f78 arch/x86/kernel/head_64.S
> --- a/arch/x86/kernel/head_64.S Tue Jun 17 22:10:51 2008 -0700
> +++ b/arch/x86/kernel/head_64.S Wed Jun 18 10:34:24 2008 -0700
> @@ -94,6 +94,8 @@
>
> addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
>
> + addq $__per_cpu_load, early_gdt_descr+2(%rip)
> +
> /* Add an Identity mapping if I am above 1G */
> leaq _text(%rip), %rdi
> andq $PMD_PAGE_MASK, %rdi
>
>
> J

Hi Jeremy,

I'm not finding that code in the tip/latest or linux-next branches... ?

I can send you my latest version of the patch which is better than
the previous but still is having problems with the config file that
Ingo sent out. (It also has a weird quirk that it will hang and
reboot after about 30 seconds with or without my patch.)

Thanks,
Mike

2008-06-18 18:35:05

by Ingo Molnar

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area


* Mike Travis <[email protected]> wrote:

> Hi Jeremy,
>
> I'm not finding that code in the tip/latest or linux-next branches...
> ?
>
> I can send you my latest version of the patch which is better than the
> previous but still is having problems with the config file that Ingo
> sent out. (It also has a weird quirk that it will hang and reboot
> after about 30 seconds with or without my patch.)

the patch is not in -tip yet because we dont keep known-broken patches
applied unless there's some really strong reason to do so.

Ingo

2008-06-18 19:34:35

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
>
>> Mike Travis wrote:
>>
>>> Ingo Molnar wrote:
>>>
>>>
>>>> * Mike Travis <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>> * Declare the pda as a per cpu variable.
>>>>>
>>>>> * Make the x86_64 per cpu area start at zero.
>>>>>
>>>>> * Since the pda is now the first element of the per_cpu area,
>>>>> cpu_pda()
>>>>> is no longer needed and per_cpu() can be used instead. This
>>>>> also makes
>>>>> the _cpu_pda[] table obsolete.
>>>>>
>>>>> * Since %gs is pointing to the pda, it will then also point to the
>>>>> per cpu
>>>>> variables and can be accessed thusly:
>>>>>
>>>>> %gs:[&per_cpu_xxxx - __per_cpu_start]
>>>>>
>>>>> Based on linux-2.6.tip
>>>>>
>>>>>
>>>> -tip testing found an instantaneous reboot crash on 64-bit x86, with
>>>> this config:
>>>>
>>>> http://redhat.com/~mingo/misc/config-Thu_Jun__5_11_43_51_CEST_2008.bad
>>>>
>>>>
>>> I'm still stuck on this one. One new development is that the current
>>> -tip
>>> branch without the patches boots to the kernel prompt then hangs after
>>> a few
>>> moments and then reboots. It seems you can tickle it using ^C to abort a
>>> process.
>>>
>> Hi Mike,
>>
>> I added some instrumentation to Xen to print the cpu state on
>> triple-fault, which highlights an obvious-looking problem.
>>
>> (XEN) hvm.c:767:d1 Triple fault on VCPU0 - invoking HVM system reset.
>> (XEN) ----[ Xen-3.3-unstable x86_64 debug=y Not tainted ]----
>> (XEN) CPU: 1
>> (XEN) RIP: 0010:[<ffffffff80200160>]
>> (XEN) RFLAGS: 0000000000010002 CONTEXT: hvm
>> (XEN) rax: 0000000000000018 rbx: 0000000000000000 rcx: 00000000c0000080
>> (XEN) rdx: 0000000000000000 rsi: 0000000000092f40 rdi: 0000000020100800
>> (XEN) rbp: 0000000000000000 rsp: ffffffff807dfff8 r8: 0000000000208000
>> (XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 00000000000000de
>> (XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
>> (XEN) r15: 0000000000000000 cr0: 0000000080050033 cr4: 00000000000000a0
>> (XEN) cr3: 0000000000201000 cr2: 0000000000000000
>> (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: 0010
>>
>> The rip is:
>>
>> (gdb) x/i 0xffffffff80200160
>> 0xffffffff80200160 <secondary_startup_64+96>: movl %eax,%ds
>>
>> which is:
>>
>> lgdt early_gdt_descr(%rip)
>>
>> /* set up data segments. actually 0 would do too */
>> movl $__KERNEL_DS,%eax
>> movl %eax,%ds
>> movl %eax,%ss
>> movl %eax,%es
>>
>> And early_gdt_descr is:
>>
>> .globl early_gdt_descr
>> early_gdt_descr:
>> .word GDT_ENTRIES*8-1
>> .quad per_cpu__gdt_page
>>
>> and per_cpu__gdt_page is zero-based, and therefore not a directly
>> addressable symbol.
>>
>> I tried this patch, but it didn't work. Perhaps I'm missing something.
>>
>> diff -r bf5a46e13f78 arch/x86/kernel/head_64.S
>> --- a/arch/x86/kernel/head_64.S Tue Jun 17 22:10:51 2008 -0700
>> +++ b/arch/x86/kernel/head_64.S Wed Jun 18 10:34:24 2008 -0700
>> @@ -94,6 +94,8 @@
>>
>> addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
>>
>> + addq $__per_cpu_load, early_gdt_descr+2(%rip)
>> +
>> /* Add an Identity mapping if I am above 1G */
>> leaq _text(%rip), %rdi
>> andq $PMD_PAGE_MASK, %rdi
>>
>>
>> J
>>
>
> Hi Jeremy,
>
> I'm not finding that code in the tip/latest or linux-next branches... ?
>

You mean your percpu/pda code? No, I'm carrying it locally because I
need it as a base for my Xen work. Xen bypasses these early boot
stages, so I haven't seen any problems so far.
But I'd also like to make sure that my Xen changes don't break native
boots, too...

> I can send you my latest version of the patch which is better than
> the previous but still is having problems with the config file that
> Ingo sent out. (It also has a weird quirk that it will hang and
> reboot after about 30 seconds with or without my patch.)
>

Yes, keep me uptodate with the percpu work.

J

2008-06-19 21:37:29

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Oh yeah, is it alright to re-use the pda in the static percpu load area
> for each startup cpu, or should it be adjusted to use the areas allocated
> by setup_per_cpu_areas()? pda_init() is called in x86_64_start_kernel
> so it would only be for anything that occurs before then. (And I moved
> the call to pda_init() to before the early_idt_handlers are setup.)
>

Why not use the real pda for all cpus?

Do you move the boot-cpu's per-cpu data? (Please don't) If not, you can
just use percpu__pda from the start without having to do anything else,
and then set up %gs pointing to the pda base for each secondary cpu.

64-bit inherits 32-bit's use of per-cpu gdts, though it's mostly useless
on 64-bit.

More important is to have a:

startup_percpu_base: .quad __per_cpu_load

which you stick the processor's initial %gs into, and then load that
from in startup_secondary_64:

mov $X86_MSR_GSBASE, %ecx
mov startup_percpu_base, %eax
mov startup_percpu_base+4, %edx
wrmsr

and put

startup_percpu_base = new_cpus_percpu_base;

in do_cpu_boot().

> I hadn't realized that this code is executed for cpus other than the
> boot cpu. Is there a way to find out if this is the boot cpu (and/or
> the initial execution)?
>

Don't think so. If you want something to happen only at boot time, do
it in startup_64.

> If it's the boot cpu, then this would work for the gdt, yes?
>
> leaq early_gdt_descr_base(%rip), %edi
> movq 0(%edi), %rax
> addq $__per_cpu_load, %rax
> movq %rax, 0(%edi)
> lgdt early_gdt_descr(%rip)
>

As I mentioned in my other mail, a simple add should be enough.

> But it should only be executed for the boot because do_boot_cpu()
> does this:
>
> early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
>
> static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
> {
> return per_cpu(gdt_page, cpu).gdt;
> }
>

Right, do it in startup_64.

> Btw, I've only been testing on an x86_64 system. I'm sure I've got
> things to fix up for i386.
>

It should be possible to share almost everything, at least in C.

J

2008-06-19 21:55:07

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> 64-bit inherits 32-bit's use of per-cpu gdts, though it's mostly
> useless on 64-bit.

Note to self: Not really true; they're still needed for 32-on-64.

J

2008-06-19 22:13:52

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:

>
> Why not use the real pda for all cpus?

Yeah, I figured that out after doing some more thinking... ;-)

>
> Do you move the boot-cpu's per-cpu data? (Please don't) If not, you can
> just use percpu__pda from the start without having to do anything else,
> and then set up %gs pointing to the pda base for each secondary cpu.

The problem is that the static percpu area is removed as it lies
in the initdata section, so the pda is removed as well.

But I took your suggestion to move the fixup to before secondary_startup.

Below is a revised version. It builds but I'll have to test it tomorrow.
Note the addition of:

+ initial_pda = (unsigned long)get_percpu_pda(cpu);

in do_boot_cpu.

I'm not sure yet what to put into acpi_save_state_mem:

initial_code = (unsigned long)wakeup_long64;
+ /* ZZZ initial_pda = (unsigned long)?; */

Thanks again for your help!

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Mike Travis <[email protected]>
---
arch/x86/Kconfig | 3 +
arch/x86/kernel/acpi/sleep.c | 1
arch/x86/kernel/head64.c | 34 ++++++---------
arch/x86/kernel/head_64.S | 13 +++++
arch/x86/kernel/setup.c | 86 +++++++++++----------------------------
arch/x86/kernel/setup64.c | 3 -
arch/x86/kernel/smpboot.c | 52 -----------------------
arch/x86/kernel/vmlinux_64.lds.S | 1
include/asm-x86/desc.h | 5 ++
include/asm-x86/pda.h | 3 -
include/asm-x86/percpu.h | 46 +++++---------------
include/asm-x86/trampoline.h | 1
12 files changed, 78 insertions(+), 170 deletions(-)

--- linux-2.6.tip.orig/arch/x86/Kconfig
+++ linux-2.6.tip/arch/x86/Kconfig
@@ -129,6 +129,9 @@ config HAVE_SETUP_PER_CPU_AREA
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP

+config HAVE_ZERO_BASED_PER_CPU
+ def_bool X86_64_SMP
+
config ARCH_HIBERNATION_POSSIBLE
def_bool y
depends on !SMP || !X86_VOYAGER
--- linux-2.6.tip.orig/arch/x86/kernel/acpi/sleep.c
+++ linux-2.6.tip/arch/x86/kernel/acpi/sleep.c
@@ -76,6 +76,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + 4096;
#endif
initial_code = (unsigned long)wakeup_long64;
+ /* ZZZ initial_pda = (unsigned long)?; */
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */

--- linux-2.6.tip.orig/arch/x86/kernel/head64.c
+++ linux-2.6.tip/arch/x86/kernel/head64.c
@@ -25,20 +25,6 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -91,6 +77,20 @@ void __init x86_64_start_kernel(char * r
/* Cleanup the over mapped high alias */
cleanup_highmap();

+ /* point to boot pda which is the first element in the percpu area */
+ {
+ struct x8664_pda *pda;
+#ifdef CONFIG_SMP
+ pda = (struct x8664_pda *)__per_cpu_load;
+ pda->data_offset = per_cpu_offset(0) = (unsigned long)pda;
+#else
+ pda = &per_cpu(pda, 0);
+ pda->data_offset = (unsigned long)pda;
+#endif
+ }
+ /* initialize boot cpu_pda data */
+ pda_init(0);
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -102,12 +102,6 @@ void __init x86_64_start_kernel(char * r

early_printk("Kernel alive\n");

- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-
- early_printk("Kernel really alive\n");
-
copy_bootdata(__va(real_mode_data));

reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
--- linux-2.6.tip.orig/arch/x86/kernel/head_64.S
+++ linux-2.6.tip/arch/x86/kernel/head_64.S
@@ -12,6 +12,7 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
+#include <asm/asm-offsets.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
@@ -132,6 +133,12 @@ ident_complete:
#ifdef CONFIG_SMP
addq %rbp, trampoline_level4_pgt + 0(%rip)
addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
+
+ /*
+ * Fix up per_cpu__gdt_page offset when basing percpu
+ * variables at zero. This is only needed for the boot cpu.
+ */
+ addq $__per_cpu_load, early_gdt_descr_base
#endif

/* Due to ENTRY(), sometimes the empty space gets filled with
@@ -224,10 +231,11 @@ ENTRY(secondary_startup_64)
* that does in_interrupt()
*/
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_pda(%rip), %rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
+ movq %rax,%gs:pda_data_offset

/* esi is pointer to real mode structure with interesting info.
pass it to C */
@@ -250,6 +258,8 @@ ENTRY(secondary_startup_64)
.align 8
ENTRY(initial_code)
.quad x86_64_start_kernel
+ ENTRY(initial_pda)
+ .quad __per_cpu_load
__FINITDATA

ENTRY(stack_start)
@@ -394,6 +404,7 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
+early_gdt_descr_base:
.quad per_cpu__gdt_page

ENTRY(phys_base)
--- linux-2.6.tip.orig/arch/x86/kernel/setup.c
+++ linux-2.6.tip/arch/x86/kernel/setup.c
@@ -30,6 +30,11 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_a
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);

+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU_FIRST(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
+#endif
+
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
#define X86_64_NUMA 1

@@ -48,7 +53,7 @@ static void __init setup_node_to_cpumask
static inline void setup_node_to_cpumask_map(void) { }
#endif

-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_SMP)
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Copy data used in early init routines from the initial arrays to the
* per cpu data areas. These arrays then become expendable and the
@@ -95,64 +100,9 @@ static void __init setup_cpumask_of_cpu(
static inline void setup_cpumask_of_cpu(void) { }
#endif

-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
-#endif

-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
void __init setup_per_cpu_areas(void)
{
ssize_t size = PERCPU_ENOUGH_ROOM;
@@ -165,9 +115,6 @@ void __init setup_per_cpu_areas(void)
nr_cpu_ids = num_processors;
#endif

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
size = PERCPU_ENOUGH_ROOM;
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
@@ -187,9 +134,28 @@ void __init setup_per_cpu_areas(void)
else
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
#endif
+ /* Initialize each cpu's per_cpu area and save pointer */
+ memcpy(ptr, __per_cpu_load, __per_cpu_size);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);

+#ifdef CONFIG_X86_64
+ /*
+ * Note the boot cpu has been using the static per_cpu load
+ * area for it's pda. We need to zero out the pda's for the
+ * other cpu's that are coming online.
+ */
+ {
+ /* we rely on the fact that pda is the first element */
+ struct x8664_pda *pda = (struct x8664_pda *)ptr;
+
+ if (cpu)
+ memset(pda, 0, sizeof(struct x8664_pda));
+ else
+ pda_init(0);
+
+ pda->data_offset = (unsigned long)ptr;
+ }
+#endif
}

printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
--- linux-2.6.tip.orig/arch/x86/kernel/setup64.c
+++ linux-2.6.tip/arch/x86/kernel/setup64.c
@@ -35,9 +35,6 @@ struct boot_params boot_params;

cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
--- linux-2.6.tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6.tip/arch/x86/kernel/smpboot.c
@@ -762,45 +762,6 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-static int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, size);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -818,16 +779,6 @@ static int __cpuinit do_boot_cpu(int api
};
INIT_WORK(&c_idle.work, do_fork_idle);

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);
@@ -865,6 +816,7 @@ do_rest:
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ initial_pda = (unsigned long)get_percpu_pda(cpu);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
@@ -940,8 +892,6 @@ do_rest:
}
}

-restore_state:
-
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
--- linux-2.6.tip.orig/arch/x86/kernel/vmlinux_64.lds.S
+++ linux-2.6.tip/arch/x86/kernel/vmlinux_64.lds.S
@@ -16,6 +16,7 @@ jiffies_64 = jiffies;
_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
+ percpu PT_LOAD FLAGS(7); /* RWE */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
--- linux-2.6.tip.orig/include/asm-x86/desc.h
+++ linux-2.6.tip/include/asm-x86/desc.h
@@ -41,6 +41,11 @@ static inline struct desc_struct *get_cp

#ifdef CONFIG_X86_64

+static inline struct x8664_pda *get_percpu_pda(unsigned int cpu)
+{
+ return &per_cpu(pda, cpu);
+}
+
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
--- linux-2.6.tip.orig/include/asm-x86/pda.h
+++ linux-2.6.tip/include/asm-x86/pda.h
@@ -37,10 +37,9 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda **_cpu_pda;
extern void pda_init(int);

-#define cpu_pda(i) (_cpu_pda[i])
+#define cpu_pda(i) (&per_cpu(pda, i))

/*
* There is no fast way to get the base address of the PDA, all the accesses
--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -3,26 +3,20 @@

#ifdef CONFIG_X86_64
#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+#ifdef CONFIG_SMP
#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
+#define __percpu_seg "%%gs:"
+#else
+#define __percpu_seg ""
#endif
+
#include <asm-generic/percpu.h>

DECLARE_PER_CPU(struct x8664_pda, pda);

-#else /* CONFIG_X86_64 */
+#else /* !CONFIG_X86_64 */

#ifdef __ASSEMBLY__

@@ -51,36 +45,23 @@ DECLARE_PER_CPU(struct x8664_pda, pda);

#else /* ...!ASSEMBLY */

-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- * var - variable name
- * cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
- */
#ifdef CONFIG_SMP
-
#define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
-
-#else /* !SMP */
-
+#else
#define __percpu_seg ""
-
-#endif /* SMP */
+#endif

#include <asm-generic/percpu.h>

/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -215,7 +196,6 @@ do { \
percpu_cmpxchg_op(per_cpu_var(var), old, new)

#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP

--- linux-2.6.tip.orig/include/asm-x86/trampoline.h
+++ linux-2.6.tip/include/asm-x86/trampoline.h
@@ -12,6 +12,7 @@ extern unsigned char *trampoline_base;

extern unsigned long init_rsp;
extern unsigned long initial_code;
+extern unsigned long initial_pda;

#define TRAMPOLINE_BASE 0x6000
extern unsigned long setup_trampoline(void);

2008-06-19 22:21:56

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
>
>
>> Why not use the real pda for all cpus?
>>
>
> Yeah, I figured that out after doing some more thinking... ;-)
>
>
>> Do you move the boot-cpu's per-cpu data? (Please don't) If not, you can
>> just use percpu__pda from the start without having to do anything else,
>> and then set up %gs pointing to the pda base for each secondary cpu.
>>
>
> The problem is that the static percpu area is removed as it lies
> in the initdata section, so the pda is removed as well.
>

Well, that's easy to fix...

> But I took your suggestion to move the fixup to before secondary_startup.
>
> Below is a revised version.

(Incremental diffs are easier to review and work with.)

> It builds but I'll have to test it tomorrow.
> Note the addition of:
>
> + initial_pda = (unsigned long)get_percpu_pda(cpu);
>
> in do_boot_cpu.
>
> I'm not sure yet what to put into acpi_save_state_mem:
>
> initial_code = (unsigned long)wakeup_long64;
> + /* ZZZ initial_pda = (unsigned long)?; */
>

You'll need to change wakeup_long64 to load the right value into the
GS_BASE msr anyway.

J

2008-06-19 22:24:38

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> @@ -132,6 +133,12 @@ ident_complete:
> #ifdef CONFIG_SMP
> addq %rbp, trampoline_level4_pgt + 0(%rip)
> addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
> +
> + /*
> + * Fix up per_cpu__gdt_page offset when basing percpu
> + * variables at zero. This is only needed for the boot cpu.
> + */
> + addq $__per_cpu_load, early_gdt_descr_base
>

This needs to be rip-relative. An absolute reference here will fail
because you're still running in physical addresses.

J

2008-06-20 17:25:43

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
>
>> Mike Travis wrote:
>>
>>> @@ -132,6 +133,12 @@ ident_complete:
>>> #ifdef CONFIG_SMP
>>> addq %rbp, trampoline_level4_pgt + 0(%rip)
>>> addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
>>> +
>>> + /*
>>> + * Fix up per_cpu__gdt_page offset when basing percpu
>>> + * variables at zero. This is only needed for the boot cpu.
>>> + */
>>> + addq $__per_cpu_load, early_gdt_descr_base
>>>
>>>
>> This needs to be rip-relative. An absolute reference here will fail
>> because you're still running in physical addresses.
>>
>> J
>>
>
> Still bombs right at boot up... ;-(
>

Yep. I see the triple-fault at the "mov %eax,%ds", which means it's
having trouble with the gdt. Either 1) the lgdt pointed to a bad
address, or 2) there's something wrong with the descriptor there.

The dump is:

(XEN) hvm.c:767:d14 Triple fault on VCPU0 - invoking HVM system reset.
(XEN) ----[ Xen-3.3-unstable x86_64 debug=n Not tainted ]----
(XEN) CPU: 0
(XEN) RIP: 0010:[<ffffffff80200167>]
(XEN) RFLAGS: 0000000000010002 CONTEXT: hvm
(XEN) rax: 0000000000000018 rbx: 0000000000000000 rcx: ffffffff808d6000
(XEN) rdx: 0000000000000000 rsi: 0000000000092f40 rdi: 0000000020100800
(XEN) rbp: 0000000000000000 rsp: ffffffff80827ff8 r8: 0000000000208000
(XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 00000000000000d8
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 0000000080050033 cr4: 00000000000000a0
(XEN) cr3: 0000000000201000 cr2: 0000000000000000
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: 0000 cs: 0010


I loaded early_gdt_descr+2 into %rcx, which looks reasonable. Hm, but
loading the __KERNEL_DS descriptor into %rdx, which is all zero.

So it seems the problem is that the pre-initialized gdt_page is being
lost and replaced with zero. Linker script bug?

J

--- a/arch/x86/kernel/head_64.S Fri Jun 20 09:50:02 2008 -0700
+++ b/arch/x86/kernel/head_64.S Fri Jun 20 10:19:20 2008 -0700
@@ -213,6 +213,8 @@
* because in 32bit we couldn't load a 64bit linear address.
*/
lgdt early_gdt_descr(%rip)
+ movq early_gdt_descr+2(%rip), %rcx
+ movq __KERNEL_DS(%rcx), %rdx

/* set up data segments. actually 0 would do too */
movl $__KERNEL_DS,%eax

2008-06-20 17:48:19

by Christoph Lameter

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:

> So it seems the problem is that the pre-initialized gdt_page is being lost and
> replaced with zero. Linker script bug?

Is the pre initialized gdt page in the per cpu area? Does not look like
it. The loader setup for the percpu section changes with zero basing.
Maybe that has bad side effects?

2008-06-20 18:30:35

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Christoph Lameter wrote:
> On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:
>
>> So it seems the problem is that the pre-initialized gdt_page is being lost and
>> replaced with zero. Linker script bug?
>
> Is the pre initialized gdt page in the per cpu area? Does not look like
> it. The loader setup for the percpu section changes with zero basing.
> Maybe that has bad side effects?

Yes, it is... The fixup logic is this:

0000000000004000 D per_cpu__gdt_page
ffffffff81911000 A __per_cpu_load

arch/x86/kernel/cpu/common.c:

DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
...


arch/x86/kernel/head_64.S:

startup_64:
...
/*
* Fix up per_cpu__gdt_page offset when basing percpu
* variables at zero. This is only needed for the boot cpu.
*/
addq $__per_cpu_load, early_gdt_descr_base(%rip)

ENTRY(secondary_startup_64)
...
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
lgdt early_gdt_descr(%rip)
...
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
early_gdt_descr_base:
.quad per_cpu__gdt_page

2008-06-20 18:37:53

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Christoph Lameter wrote:
> On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:
>
>
>> So it seems the problem is that the pre-initialized gdt_page is being lost and
>> replaced with zero. Linker script bug?
>>
>
> Is the pre initialized gdt page in the per cpu area? Does not look like
> it.

Yes, it should be. arch/x86/kernel/setup_64.c:

DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
} };


> The loader setup for the percpu section changes with zero basing.
> Maybe that has bad side effects

How does it work? The symbols in the percpu segment are 0-based, but
where does the data for the sections which correspond to that segment go?

The vmlinux looks like it has the gdt_page data in it.
per_cpu__gdt_page is 0x5000, and offset 0x5000 in .data.percpu has the
right stuff:

5000 00000000 00000000 ffff0000 009bcf00 ................
5010 ffff0000 009baf00 ffff0000 0093cf00 ................
5020 ffff0000 00fbcf00 ffff0000 00f3cf00 ................
5030 ffff0000 00fbaf00 00000000 00000000 ................
5040 00000000 00000000 00000000 00000000 ................
5050 00000000 00000000 00000000 00000000 ................
5060 00000000 00000000 00000000 00000000 ................


So the question is what kernel virtual address is it being loaded to?
__per_cpu_load is ffffffff808d1000, so ffffffff808d6000 is what you'd
expect...

Hm, but what happens when this gets converted to bzImage? Hm, looks OK,
I think.

BTW, I think __per_cpu_load will cause trouble if you make a relocatable
kernel, being an absolute symbol. But I have relocation off at the moment.

J

2008-06-20 18:40:46

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Yes, it is... The fixup logic is this:
>
> 0000000000004000 D per_cpu__gdt_page
> ffffffff81911000 A __per_cpu_load
>
> arch/x86/kernel/cpu/common.c:
>
> DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
> [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
>

Aha, you fell into the trap! "common" in this case means "common to
32-bit x86". Or something. But setup_64.c is what you want to be
looking at.

J

2008-06-20 18:51:20

by Christoph Lameter

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:

> > The loader setup for the percpu section changes with zero basing. Maybe that
> > has bad side effects
>
> How does it work? The symbols in the percpu segment are 0-based, but where
> does the data for the sections which correspond to that segment go?

Its loaded at __per_cpu_load but the symbols have addresses starting at 0.

> So the question is what kernel virtual address is it being loaded to?
> __per_cpu_load is ffffffff808d1000, so ffffffff808d6000 is what you'd
> expect...

Correct.

> Hm, but what happens when this gets converted to bzImage? Hm, looks OK, I
> think.
>
> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
> kernel, being an absolute symbol. But I have relocation off at the moment.

Hmmm.... we could add the relocation offset to __per_cpu_load?
__per_cpu_load is used very sparingly. Basically only useful during early
boot and when a new per cpu area has to be setup. In that case we want to
copy from __per_cpu_load to the newly allocated percpu area.

2008-06-20 19:05:10

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Christoph Lameter wrote:
> On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:
>
>
>>> The loader setup for the percpu section changes with zero basing. Maybe that
>>> has bad side effects
>>>
>> How does it work? The symbols in the percpu segment are 0-based, but where
>> does the data for the sections which correspond to that segment go?
>>
>
> Its loaded at __per_cpu_load but the symbols have addresses starting at 0.
>

Yes, which leads to an odd-looking ELF file where the Phdrs aren't
sorted by virtual address order. I'm wondering what would happen if a
bootloader that actually understood ELF files tried to load it as an
actual ELF file...

>> So the question is what kernel virtual address is it being loaded to?
>> __per_cpu_load is ffffffff808d1000, so ffffffff808d6000 is what you'd
>> expect...
>>
>
> Correct.
>

Well, reading back from that address got zeros, so something is amiss.

>> Hm, but what happens when this gets converted to bzImage? Hm, looks OK, I
>> think.
>>
>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>
>
> Hmmm.... we could add the relocation offset to __per_cpu_load?
> __per_cpu_load is used very sparingly. Basically only useful during early
> boot and when a new per cpu area has to be setup. In that case we want to
> copy from __per_cpu_load to the newly allocated percpu area.
>

Yes, it should be fairly easy to manually relocate it by applying the
(load - link) offset to it.


J

2008-06-20 19:06:54

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
>
>
> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
> kernel, being an absolute symbol. But I have relocation off at the moment.
>
...
Here's where it's defined (in include/asm-generic/vmlinux.lds.h):

#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
#define PERCPU(align) \
. = ALIGN(align); \
percpu : { } :percpu \
__per_cpu_load = .; \
.data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
*(.data.percpu.first) \
*(.data.percpu.shared_aligned) \
*(.data.percpu) \
*(.data.percpu.page_aligned) \
____per_cpu_size = .; \
} \
. = __per_cpu_load + ____per_cpu_size; \
data : { } :data
#else

Can we generate a new symbol which would account for LOAD_OFFSET?

2008-06-20 19:25:04

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
>>
>> Its loaded at __per_cpu_load but the symbols have addresses starting
>> at 0.
>
> Yes, which leads to an odd-looking ELF file where the Phdrs aren't
> sorted by virtual address order. I'm wondering what would happen if a
> bootloader that actually understood ELF files tried to load it as an
> actual ELF file...
>

If it is implemented correctly, it will work. It might trigger bugs in
such loaders, however.

>> Hmmm.... we could add the relocation offset to __per_cpu_load?
>> __per_cpu_load is used very sparingly. Basically only useful during
>> early boot and when a new per cpu area has to be setup. In that case
>> we want to copy from __per_cpu_load to the newly allocated percpu area.
>
> Yes, it should be fairly easy to manually relocate it by applying the
> (load - link) offset to it.

Seems easy enough, and as already stated, this is not
performance-critical, so a few extra instructions are pretty much a non-issue.

-hpa

2008-06-20 19:51:21

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge <[email protected]> writes:

> Christoph Lameter wrote:
>> On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:
>>
>>
>>>> The loader setup for the percpu section changes with zero basing. Maybe that
>>>> has bad side effects
>>>>
>>> How does it work? The symbols in the percpu segment are 0-based, but where
>>> does the data for the sections which correspond to that segment go?
>>>
>>
>> Its loaded at __per_cpu_load but the symbols have addresses starting at 0.
>>
>
> Yes, which leads to an odd-looking ELF file where the Phdrs aren't sorted by
> virtual address order. I'm wondering what would happen if a bootloader that
> actually understood ELF files tried to load it as an actual ELF
> file...

Well /sbin/kexec looks at the physical addresses not the virtual ones
so that may not be a problem.

>>> So the question is what kernel virtual address is it being loaded to?
>>> __per_cpu_load is ffffffff808d1000, so ffffffff808d6000 is what you'd
>>> expect...
>>>
>>
>> Correct.
>>
>
> Well, reading back from that address got zeros, so something is
> amiss.

Weird.

>>> Hm, but what happens when this gets converted to bzImage? Hm, looks OK, I
>>> think.
>>>
>>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>>
>>
>> Hmmm.... we could add the relocation offset to __per_cpu_load? __per_cpu_load
>> is used very sparingly. Basically only useful during early boot and when a new
>> per cpu area has to be setup. In that case we want to copy from __per_cpu_load
>> to the newly allocated percpu area.
>>
>
> Yes, it should be fairly easy to manually relocate it by applying the (load -
> link) offset to it.

For x86_64 all kernels are built relocatable as the only cost was
changing the physical addresses in the initial page tables. The
virtual address always remain the same but the physical addresses
change. So that could be part of what is going on.

Is this a change that only got tested on x86_32?

As long as we are not changing the way the kernel virtual address
are actually being used we should be ok with a change to make the pda
0 based. Still it is an area you need to be especially careful with.


Eric

2008-06-20 20:04:40

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Jeremy Fitzhardinge <[email protected]> writes:
>
>> Christoph Lameter wrote:
>>> On Fri, 20 Jun 2008, Jeremy Fitzhardinge wrote:
>>>
>>>
>>>>> The loader setup for the percpu section changes with zero basing. Maybe that
>>>>> has bad side effects
>>>>>
>>>> How does it work? The symbols in the percpu segment are 0-based, but where
>>>> does the data for the sections which correspond to that segment go?
>>>>
>>> Its loaded at __per_cpu_load but the symbols have addresses starting at 0.
>>>
>> Yes, which leads to an odd-looking ELF file where the Phdrs aren't sorted by
>> virtual address order. I'm wondering what would happen if a bootloader that
>> actually understood ELF files tried to load it as an actual ELF
>> file...
>
> Well /sbin/kexec looks at the physical addresses not the virtual ones
> so that may not be a problem.
>
>>>> So the question is what kernel virtual address is it being loaded to?
>>>> __per_cpu_load is ffffffff808d1000, so ffffffff808d6000 is what you'd
>>>> expect...
>>>>
>>> Correct.
>>>
>> Well, reading back from that address got zeros, so something is
>> amiss.
>
> Weird.
>
>>>> Hm, but what happens when this gets converted to bzImage? Hm, looks OK, I
>>>> think.
>>>>
>>>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>>>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>>>
>>> Hmmm.... we could add the relocation offset to __per_cpu_load? __per_cpu_load
>>> is used very sparingly. Basically only useful during early boot and when a new
>>> per cpu area has to be setup. In that case we want to copy from __per_cpu_load
>>> to the newly allocated percpu area.
>>>
>> Yes, it should be fairly easy to manually relocate it by applying the (load -
>> link) offset to it.
>
> For x86_64 all kernels are built relocatable as the only cost was
> changing the physical addresses in the initial page tables. The
> virtual address always remain the same but the physical addresses
> change. So that could be part of what is going on.
>
> Is this a change that only got tested on x86_32?

I'm only testing this on x86_64. The zero-based percpu/pda changes worked fine
up until just recently. At first it was one of Ingo's "randconfig" config files
that was tripping it up, but lately it's not working on any config.

>
> As long as we are not changing the way the kernel virtual address
> are actually being used we should be ok with a change to make the pda
> 0 based. Still it is an area you need to be especially careful with.

The major gotchas seem to be in referencing the per_cpu symbol directly, though
I've examined them all and nothing seems amiss.

>
>
> Eric

Thanks,
Mike

2008-06-20 20:46:12

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> Jeremy Fitzhardinge wrote:
>>
>>
>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>
> ...
> Here's where it's defined (in include/asm-generic/vmlinux.lds.h):
>
> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
> #define PERCPU(align) \
> . = ALIGN(align); \
> percpu : { } :percpu \
> __per_cpu_load = .; \
> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
> *(.data.percpu.first) \
> *(.data.percpu.shared_aligned) \
> *(.data.percpu) \
> *(.data.percpu.page_aligned) \
> ____per_cpu_size = .; \
> } \
> . = __per_cpu_load + ____per_cpu_size; \
> data : { } :data
> #else
>
> Can we generate a new symbol which would account for LOAD_OFFSET?

Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
with the relocatable kernel, although we have had similar absolute logic
for the last year. With __per_cpu_start and __per_cpu_end so it may
not be a problem.

To initialize the percpu data you do want to talk to the virtual address
at __per_cpu_load. But it is absolute. Ugh.

It might be worth saying something like.
.data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
DATA(0)
. = ALIGN(align);
__per_cpu_load = . ;
}
To make __per_cpu_load a relative symbol. ld has a bad habit of taking
symbols out of empty sections and making them absolute. Which is why
I added the DATA(0).

Still I don't think that would be the 64bit problem.

Eric

2008-06-20 20:48:00

by Christoph Lameter

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

On Fri, 20 Jun 2008, Mike Travis wrote:

> > Is this a change that only got tested on x86_32?
>
> I'm only testing this on x86_64. The zero-based percpu/pda changes worked fine
> up until just recently. At first it was one of Ingo's "randconfig" config files
> that was tripping it up, but lately it's not working on any config.

x86_32 does not need the zero basing since it does not have a pda.

2008-06-20 20:55:26

by Christoph Lameter

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

On Fri, 20 Jun 2008, Eric W. Biederman wrote:

> at __per_cpu_load. But it is absolute. Ugh.
>
> It might be worth saying something like.
> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
> DATA(0)
> . = ALIGN(align);
> __per_cpu_load = . ;
> }
> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
> symbols out of empty sections and making them absolute. Which is why
> I added the DATA(0).
>
> Still I don't think that would be the 64bit problem.

Ahh.. Good idea. I had a long fight with the loader before it did the
right thing.

2008-06-23 16:55:23

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>> Jeremy Fitzhardinge wrote:
>>>
>>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>>
>> ...
>> Here's where it's defined (in include/asm-generic/vmlinux.lds.h):
>>
>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>> #define PERCPU(align) \
>> . = ALIGN(align); \
>> percpu : { } :percpu \
>> __per_cpu_load = .; \
>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
>> *(.data.percpu.first) \
>> *(.data.percpu.shared_aligned) \
>> *(.data.percpu) \
>> *(.data.percpu.page_aligned) \
>> ____per_cpu_size = .; \
>> } \
>> . = __per_cpu_load + ____per_cpu_size; \
>> data : { } :data
>> #else
>>
>> Can we generate a new symbol which would account for LOAD_OFFSET?
>
> Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
> with the relocatable kernel, although we have had similar absolute logic
> for the last year. With __per_cpu_start and __per_cpu_end so it may
> not be a problem.
>
> To initialize the percpu data you do want to talk to the virtual address
> at __per_cpu_load. But it is absolute. Ugh.
>
> It might be worth saying something like.
> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
> DATA(0)
> . = ALIGN(align);
> __per_cpu_load = . ;
> }
> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
> symbols out of empty sections and making them absolute. Which is why
> I added the DATA(0).
>
> Still I don't think that would be the 64bit problem.
>
> Eric

I'm not sure I understand the linker lingo enough to fill in the rest
of the blanks... I've tried various versions around this framework and
none have been accepted yet.

#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
#define PERCPU(align) \
.data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) { \
DATA(0) \
. = ALIGN(align); \
__per_cpu_load = .; \
*(.data.percpu.first) \
*(.data.percpu.shared_aligned) \
*(.data.percpu) \
*(.data.percpu.page_aligned) \
____per_cpu_size = . - __per_cpu_load \
} \
#else

Thanks,
Mike

2008-06-23 17:34:17

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Eric W. Biederman wrote:
>
>> Mike Travis <[email protected]> writes:
>>
>>
>>> Jeremy Fitzhardinge wrote:
>>>
>>>> BTW, I think __per_cpu_load will cause trouble if you make a relocatable
>>>> kernel, being an absolute symbol. But I have relocation off at the moment.
>>>>
>>>>
>>> ...
>>> Here's where it's defined (in include/asm-generic/vmlinux.lds.h):
>>>
>>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>>> #define PERCPU(align) \
>>> . = ALIGN(align); \
>>> percpu : { } :percpu \
>>> __per_cpu_load = .; \
>>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
>>> *(.data.percpu.first) \
>>> *(.data.percpu.shared_aligned) \
>>> *(.data.percpu) \
>>> *(.data.percpu.page_aligned) \
>>> ____per_cpu_size = .; \
>>> } \
>>> . = __per_cpu_load + ____per_cpu_size; \
>>> data : { } :data
>>> #else
>>>
>>> Can we generate a new symbol which would account for LOAD_OFFSET?
>>>
>> Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
>> with the relocatable kernel, although we have had similar absolute logic
>> for the last year. With __per_cpu_start and __per_cpu_end so it may
>> not be a problem.
>>
>> To initialize the percpu data you do want to talk to the virtual address
>> at __per_cpu_load. But it is absolute. Ugh.
>>
>> It might be worth saying something like.
>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
>> DATA(0)
>> . = ALIGN(align);
>> __per_cpu_load = . ;
>> }
>> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
>> symbols out of empty sections and making them absolute. Which is why
>> I added the DATA(0).
>>
>> Still I don't think that would be the 64bit problem.
>>
>> Eric
>>
>
> I'm not sure I understand the linker lingo enough to fill in the rest
> of the blanks... I've tried various versions around this framework and
> none have been accepted yet.
>
> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
> #define PERCPU(align) \
> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) { \
> DATA(0) \
> . = ALIGN(align); \
> __per_cpu_load = .; \
> *(.data.percpu.first) \
> *(.data.percpu.shared_aligned) \
> *(.data.percpu) \
> *(.data.percpu.page_aligned) \
> ____per_cpu_size = . - __per_cpu_load \
> } \
> #else
>

That looks OK to me. Does it work?

J

2008-06-23 18:05:12

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> Eric W. Biederman wrote:
>>
>>> Mike Travis <[email protected]> writes:
>>>
>>>
>>>> Jeremy Fitzhardinge wrote:
>>>>
>>>>> BTW, I think __per_cpu_load will cause trouble if you make a
>>>>> relocatable
>>>>> kernel, being an absolute symbol. But I have relocation off at the
>>>>> moment.
>>>>>
>>>>>
>>>> ...
>>>> Here's where it's defined (in include/asm-generic/vmlinux.lds.h):
>>>>
>>>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>>>> #define
>>>> PERCPU(align) \
>>>> . =
>>>> ALIGN(align); \
>>>> percpu : { }
>>>> :percpu \
>>>> __per_cpu_load =
>>>> .; \
>>>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET)
>>>> { \
>>>>
>>>> *(.data.percpu.first) \
>>>>
>>>> *(.data.percpu.shared_aligned) \
>>>>
>>>> *(.data.percpu) \
>>>>
>>>> *(.data.percpu.page_aligned) \
>>>> ____per_cpu_size =
>>>> .; \
>>>>
>>>> } \
>>>> . = __per_cpu_load +
>>>> ____per_cpu_size; \
>>>> data : { } :data
>>>> #else
>>>>
>>>> Can we generate a new symbol which would account for LOAD_OFFSET?
>>>>
>>> Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
>>> with the relocatable kernel, although we have had similar absolute logic
>>> for the last year. With __per_cpu_start and __per_cpu_end so it may
>>> not be a problem.
>>>
>>> To initialize the percpu data you do want to talk to the virtual address
>>> at __per_cpu_load. But it is absolute. Ugh.
>>> It might be worth saying something like.
>>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
>>> DATA(0) . = ALIGN(align);
>>> __per_cpu_load = . ; }
>>> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
>>> symbols out of empty sections and making them absolute. Which is why
>>> I added the DATA(0).
>>>
>>> Still I don't think that would be the 64bit problem.
>>>
>>> Eric
>>>
>>
>> I'm not sure I understand the linker lingo enough to fill in the rest
>> of the blanks... I've tried various versions around this framework and
>> none have been accepted yet.
>>
>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>> #define PERCPU(align) \
>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) { \
>> DATA(0) \
>> . = ALIGN(align); \
>> __per_cpu_load = .; \
>> *(.data.percpu.first) \
>> *(.data.percpu.shared_aligned) \
>> *(.data.percpu) \
>> *(.data.percpu.page_aligned) \
>> ____per_cpu_size = . - __per_cpu_load \
>> } \
>> #else
>>
>
> That looks OK to me. Does it work?
>
> J

Nope, fighting undefines and/or syntax errors in the linker.

2008-06-23 18:36:37

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area


>>>>
>>>> To initialize the percpu data you do want to talk to the virtual address
>>>> at __per_cpu_load. But it is absolute. Ugh.
>>>> It might be worth saying something like.
>>>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
>>>> DATA(0) . = ALIGN(align);
>>>> __per_cpu_load = . ; }
>>>> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
>>>> symbols out of empty sections and making them absolute. Which is why
>>>> I added the DATA(0).

The syntax error is at this "DATA(0)" statement. I don't find this as a
linker script command or a macro. What is it we're trying to do with this?

Thanks,
Mike
...
>>> I'm not sure I understand the linker lingo enough to fill in the rest
>>> of the blanks... I've tried various versions around this framework and
>>> none have been accepted yet.
>>>
>>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>>> #define PERCPU(align) \
>>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) { \
>>> DATA(0) \
>>> . = ALIGN(align); \
>>> __per_cpu_load = .; \
>>> *(.data.percpu.first) \
>>> *(.data.percpu.shared_aligned) \
>>> *(.data.percpu) \
>>> *(.data.percpu.page_aligned) \
>>> ____per_cpu_size = . - __per_cpu_load \
>>> } \
>>> #else

2008-06-23 19:41:46

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> The syntax error is at this "DATA(0)" statement. I don't find this as a
> linker script command or a macro. What is it we're trying to do with this?
>

In Eric's sample, it's intended to prevent there being an empty section,
which can cause linker bugs. In your case it probably isn't necessary,
since you're also putting the percpu data in that section.

"DATA" is probably a typo. It should be "LONG" or something like that.
(See "3.6.5 Output Section Data" in the linker manual.)

J

2008-06-24 00:02:19

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> The syntax error is at this "DATA(0)" statement. I don't find this as a
>> linker script command or a macro. What is it we're trying to do with
>> this?
>>
>
> In Eric's sample, it's intended to prevent there being an empty section,
> which can cause linker bugs. In your case it probably isn't necessary,
> since you're also putting the percpu data in that section.
>
> "DATA" is probably a typo. It should be "LONG" or something like that.
> (See "3.6.5 Output Section Data" in the linker manual.)
>
> J

Yes, thanks I did find that.

I now have the version below which seems to have what we need... but it
hasn't had an effect on the boot startup panic. I'm back to verifying
that the assembler effective addresses are correct in the loaded object.

ffffffff81911000 D __per_cpu_load
0000000000000000 D per_cpu__pda
0000000000000080 D per_cpu__init_tss
.
.
000000000000a2d0 d per_cpu__cookie_scratch
000000000000a470 d per_cpu__cookie_scratch
000000000000a604 D ____per_cpu_size

Btw, the "percpu : { } :percpu" below removes a linker warning about an empty
section.

#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
#define PERCPU(align) \
.data.percpu.abs = .; \
percpu : { } :percpu \
.data.percpu.header : AT(.data.percpu.abs - LOAD_OFFSET) { \
BYTE(0) \
. = ALIGN(align); \
__per_cpu_load = .; \
} \
.data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
*(.data.percpu.first) \
*(.data.percpu.shared_aligned) \
*(.data.percpu) \
*(.data.percpu.page_aligned) \
____per_cpu_size = .; \
} \
. = __per_cpu_load + ____per_cpu_size;

2008-06-30 17:07:28

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
...
>> Can we generate a new symbol which would account for LOAD_OFFSET?
>
> Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
> with the relocatable kernel, although we have had similar absolute logic
> for the last year. With __per_cpu_start and __per_cpu_end so it may
> not be a problem.
>
> To initialize the percpu data you do want to talk to the virtual address
> at __per_cpu_load. But it is absolute. Ugh.
>
> It might be worth saying something like.
> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
> DATA(0)
> . = ALIGN(align);
> __per_cpu_load = . ;
> }
> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
> symbols out of empty sections and making them absolute. Which is why
> I added the DATA(0).
>
> Still I don't think that would be the 64bit problem.
>
> Eric

FYI, I did try this out and it caused the bootloader to scramble the
loaded data. The first corruption I found was the .x86cpuvendor.init
section contained all zeroes.

Mike

2008-06-30 17:21:46

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
>
> FYI, I did try this out and it caused the bootloader to scramble the
> loaded data. The first corruption I found was the .x86cpuvendor.init
> section contained all zeroes.
>

Explain what you mean with "the bootloader" in this context.

-hpa

2008-06-30 17:43:18

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Eric W. Biederman wrote:
>
>> Mike Travis <[email protected]> writes:
>>
> ...
>
>>> Can we generate a new symbol which would account for LOAD_OFFSET?
>>>
>> Ouch. Absolute symbols indeed. On the 32bit kernel that may play havoc
>> with the relocatable kernel, although we have had similar absolute logic
>> for the last year. With __per_cpu_start and __per_cpu_end so it may
>> not be a problem.
>>
>> To initialize the percpu data you do want to talk to the virtual address
>> at __per_cpu_load. But it is absolute. Ugh.
>>
>> It might be worth saying something like.
>> .data.percpu.start : AT(.data.percpu.dummy - LOAD_OFFSET) {
>> DATA(0)
>> . = ALIGN(align);
>> __per_cpu_load = . ;
>> }
>> To make __per_cpu_load a relative symbol. ld has a bad habit of taking
>> symbols out of empty sections and making them absolute. Which is why
>> I added the DATA(0).
>>
>> Still I don't think that would be the 64bit problem.
>>
>> Eric
>>
>
> FYI, I did try this out and it caused the bootloader to scramble the
> loaded data. The first corruption I found was the .x86cpuvendor.init
> section contained all zeroes.

Well, that's what appeared to be happening with the pre-initialized GDT
as well, so I'm not sure that's a new symptom.

J

2008-06-30 17:50:12

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
...
>> I'm not sure yet what to put into acpi_save_state_mem:
>>
>> initial_code = (unsigned long)wakeup_long64;
>> + /* ZZZ initial_pda = (unsigned long)?; */
>>
>
> You'll need to change wakeup_long64 to load the right value into the
> GS_BASE msr anyway.
>
> J

I'm afraid I don't quite understand the transitioning of the
ACPI states to figure out the correct thing to do. My first
inclination would be to: [sorry, cut and pasted]

--- linux-2.6.tip.orig/arch/x86/kernel/acpi/sleep.c
+++ linux-2.6.tip/arch/x86/kernel/acpi/sleep.c
@@ -89,6 +89,8 @@ int acpi_save_state_mem(void)
#ifdef CONFIG_SMP
stack_start.sp = temp_stack + 4096;
#endif
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ initial_pda = (unsigned long)get_cpu_pda(cpu);
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */


But I'd like some confirmation that this is right thing to do...

[This mimics what smpboot.c:do_boot_cpu() does.]

Len - I'll cc you on the full patch submission shortly.

Thanks,
Mike

2008-06-30 17:57:49

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

H. Peter Anvin wrote:
> Mike Travis wrote:
>>
>> FYI, I did try this out and it caused the bootloader to scramble the
>> loaded data. The first corruption I found was the .x86cpuvendor.init
>> section contained all zeroes.
>>
>
> Explain what you mean with "the bootloader" in this context.
>
> -hpa


After the code was loaded (the compressed code, it seems that my GRUB
doesn't support uncompressed loading), the above section contained
zeroes. I snapped it fairly early, around secondary_startup_64, and
then printed it in x86_64_start_kernel.

The object file had the correct data (as displayed by objdump) so I'm
assuming that the bootloading process didn't load the section correctly.

Below was the linker script I used:

--- linux-2.6.tip.orig/include/asm-generic/vmlinux.lds.h
+++ linux-2.6.tip/include/asm-generic/vmlinux.lds.h
@@ -373,9 +373,13 @@

#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
#define PERCPU(align) \
- . = ALIGN(align); \
+ .data.percpu.abs = .; \
percpu : { } :percpu \
- __per_cpu_load = .; \
+ .data.percpu.rel : AT(.data.percpu.abs - LOAD_OFFSET) { \
+ BYTE(0) \
+ . = ALIGN(align); \
+ __per_cpu_load = .; \
+ } \
.data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
*(.data.percpu.first) \
*(.data.percpu.shared_aligned) \
@@ -383,8 +387,8 @@
*(.data.percpu.page_aligned) \
____per_cpu_size = .; \
} \
- . = __per_cpu_load + ____per_cpu_size; \
- data : { } :data
+ . = __per_cpu_load + ____per_cpu_size;
+
#else
#define PERCPU(align) \
. = ALIGN(align); \

It showed all the correct address in the map and __per_cpu_load was a
relative symbol (which was the objective.)

Btw, our simulator, which only loads uncompressed code, had the data correct,
so it *may* only be a result of the code being compressed.

Thanks,
Mike

2008-06-30 20:51:45

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> H. Peter Anvin wrote:
>> Mike Travis wrote:
>>>
>>> FYI, I did try this out and it caused the bootloader to scramble the
>>> loaded data. The first corruption I found was the .x86cpuvendor.init
>>> section contained all zeroes.
>>>
>>
>> Explain what you mean with "the bootloader" in this context.
>>
>> -hpa
>
>
> After the code was loaded (the compressed code, it seems that my GRUB
> doesn't support uncompressed loading), the above section contained
> zeroes. I snapped it fairly early, around secondary_startup_64, and
> then printed it in x86_64_start_kernel.
>
> The object file had the correct data (as displayed by objdump) so I'm
> assuming that the bootloading process didn't load the section correctly.
>
> Below was the linker script I used:
>
> --- linux-2.6.tip.orig/include/asm-generic/vmlinux.lds.h
> +++ linux-2.6.tip/include/asm-generic/vmlinux.lds.h
> @@ -373,9 +373,13 @@
>
> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
> #define PERCPU(align) \
> - . = ALIGN(align); \
> + .data.percpu.abs = .; \
> percpu : { } :percpu \
> - __per_cpu_load = .; \
> + .data.percpu.rel : AT(.data.percpu.abs - LOAD_OFFSET) { \
> + BYTE(0) \
> + . = ALIGN(align); \
> + __per_cpu_load = .; \
> + } \
> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
> *(.data.percpu.first) \
> *(.data.percpu.shared_aligned) \
> @@ -383,8 +387,8 @@
> *(.data.percpu.page_aligned) \
> ____per_cpu_size = .; \
> } \
> - . = __per_cpu_load + ____per_cpu_size; \
> - data : { } :data
> + . = __per_cpu_load + ____per_cpu_size;
> +
> #else
> #define PERCPU(align) \
> . = ALIGN(align); \
>
> It showed all the correct address in the map and __per_cpu_load was a
> relative symbol (which was the objective.)
>
> Btw, our simulator, which only loads uncompressed code, had the data correct,
> so it *may* only be a result of the code being compressed.

Weird. Grub doesn't get involved in the decompression the kernel does it
all itself so we should be able to track where things go bad.

Last I looked the compressed code was formed by essentially.
objcopy vmlinux -O binary vmlinux.bin
gzip vmlinux.bin
And then we tack on a magic header to the gzip compressed file.

Are things only bad with the change above?

Eric

2008-06-30 21:08:30

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>
>> H. Peter Anvin wrote:
>>
>>> Mike Travis wrote:
>>>
>>>> FYI, I did try this out and it caused the bootloader to scramble the
>>>> loaded data. The first corruption I found was the .x86cpuvendor.init
>>>> section contained all zeroes.
>>>>
>>>>
>>> Explain what you mean with "the bootloader" in this context.
>>>
>>> -hpa
>>>
>> After the code was loaded (the compressed code, it seems that my GRUB
>> doesn't support uncompressed loading), the above section contained
>> zeroes. I snapped it fairly early, around secondary_startup_64, and
>> then printed it in x86_64_start_kernel.
>>
>> The object file had the correct data (as displayed by objdump) so I'm
>> assuming that the bootloading process didn't load the section correctly.
>>
>> Below was the linker script I used:
>>
>> --- linux-2.6.tip.orig/include/asm-generic/vmlinux.lds.h
>> +++ linux-2.6.tip/include/asm-generic/vmlinux.lds.h
>> @@ -373,9 +373,13 @@
>>
>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>> #define PERCPU(align) \
>> - . = ALIGN(align); \
>> + .data.percpu.abs = .; \
>> percpu : { } :percpu \
>> - __per_cpu_load = .; \
>> + .data.percpu.rel : AT(.data.percpu.abs - LOAD_OFFSET) { \
>> + BYTE(0) \
>> + . = ALIGN(align); \
>> + __per_cpu_load = .; \
>> + } \
>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
>> *(.data.percpu.first) \
>> *(.data.percpu.shared_aligned) \
>> @@ -383,8 +387,8 @@
>> *(.data.percpu.page_aligned) \
>> ____per_cpu_size = .; \
>> } \
>> - . = __per_cpu_load + ____per_cpu_size; \
>> - data : { } :data
>> + . = __per_cpu_load + ____per_cpu_size;
>> +
>> #else
>> #define PERCPU(align) \
>> . = ALIGN(align); \
>>
>> It showed all the correct address in the map and __per_cpu_load was a
>> relative symbol (which was the objective.)
>>
>> Btw, our simulator, which only loads uncompressed code, had the data correct,
>> so it *may* only be a result of the code being compressed.
>>
>
> Weird. Grub doesn't get involved in the decompression the kernel does it
> all itself so we should be able to track where things go bad.
>
> Last I looked the compressed code was formed by essentially.
> objcopy vmlinux -O binary vmlinux.bin
> gzip vmlinux.bin
> And then we tack on a magic header to the gzip compressed file.
>
> Are things only bad with the change above?

No, the original crash being discussed was a GP fault in head_64.S as it
tries to initialize the kernel segments. The cause was that the
prototype GDT is all zero, even though it's an initialized variable, and
inspection of vmlinux shows that it has the right contents. But somehow
it's either 1) getting zeroed on load, or 2) is loaded to the wrong place.

The zero-based PDA mechanism requires the introduction of a new ELF
segment based at vaddr 0 which is sufficiently unusual that it wouldn't
surprise me if its triggering some toolchain bug.

Mike: what would happen if the PDA were based at 4k rather than 0? The
stack canary would still be at its small offset (0x20?), but it doesn't
need to be initialized. I'm not sure if doing so would fix anything,
however.

J

2008-07-01 08:41:44

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge <[email protected]> writes:

> No, the original crash being discussed was a GP fault in head_64.S as it tries
> to initialize the kernel segments. The cause was that the prototype GDT is all
> zero, even though it's an initialized variable, and inspection of vmlinux shows
> that it has the right contents. But somehow it's either 1) getting zeroed on
> load, or 2) is loaded to the wrong place.
>
> The zero-based PDA mechanism requires the introduction of a new ELF segment
> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me if
> its triggering some toolchain bug.

Agreed. Given the previous description my hunch is that the bug is occurring
during objcopy. If vmlinux is good and the compressed kernel is bad.

It should be possible to look at vmlinux.bin and see if that was generated
properly.

> Mike: what would happen if the PDA were based at 4k rather than 0? The stack
> canary would still be at its small offset (0x20?), but it doesn't need to be
> initialized. I'm not sure if doing so would fix anything, however.

I'm dense today. Why are we doing a zero based pda? That seems the most
likely culprit of linker trouble, and we should be able to put a smaller
offset in the segment register to allow for everything to work as expected.

Eric

2008-07-01 11:49:54

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>> H. Peter Anvin wrote:
>>> Mike Travis wrote:
>>>> FYI, I did try this out and it caused the bootloader to scramble the
>>>> loaded data. The first corruption I found was the .x86cpuvendor.init
>>>> section contained all zeroes.
>>>>
>>> Explain what you mean with "the bootloader" in this context.
>>>
>>> -hpa
>>
>> After the code was loaded (the compressed code, it seems that my GRUB
>> doesn't support uncompressed loading), the above section contained
>> zeroes. I snapped it fairly early, around secondary_startup_64, and
>> then printed it in x86_64_start_kernel.
>>
>> The object file had the correct data (as displayed by objdump) so I'm
>> assuming that the bootloading process didn't load the section correctly.
>>
>> Below was the linker script I used:
>>
>> --- linux-2.6.tip.orig/include/asm-generic/vmlinux.lds.h
>> +++ linux-2.6.tip/include/asm-generic/vmlinux.lds.h
>> @@ -373,9 +373,13 @@
>>
>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>> #define PERCPU(align) \
>> - . = ALIGN(align); \
>> + .data.percpu.abs = .; \
>> percpu : { } :percpu \
>> - __per_cpu_load = .; \
>> + .data.percpu.rel : AT(.data.percpu.abs - LOAD_OFFSET) { \
>> + BYTE(0) \
>> + . = ALIGN(align); \
>> + __per_cpu_load = .; \
>> + } \
>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \
>> *(.data.percpu.first) \
>> *(.data.percpu.shared_aligned) \
>> @@ -383,8 +387,8 @@
>> *(.data.percpu.page_aligned) \
>> ____per_cpu_size = .; \
>> } \
>> - . = __per_cpu_load + ____per_cpu_size; \
>> - data : { } :data
>> + . = __per_cpu_load + ____per_cpu_size;
>> +
>> #else
>> #define PERCPU(align) \
>> . = ALIGN(align); \
>>
>> It showed all the correct address in the map and __per_cpu_load was a
>> relative symbol (which was the objective.)
>>
>> Btw, our simulator, which only loads uncompressed code, had the data correct,
>> so it *may* only be a result of the code being compressed.
>
> Weird. Grub doesn't get involved in the decompression the kernel does it
> all itself so we should be able to track where things go bad.
>
> Last I looked the compressed code was formed by essentially.
> objcopy vmlinux -O binary vmlinux.bin
> gzip vmlinux.bin
> And then we tack on a magic header to the gzip compressed file.
>
> Are things only bad with the change above?
>
> Eric

Yes. The failure was "Unsupported CPU" (or some such) which clued me into
the vendor section.

I was able to get the zero-based variables working well for standard
configs. It's getting tripped up now by some of Ingo's random configs,
in very unusual places... And once again, it only fails on real h/w, not
on our simulator, so catching the elusive bugger is tricky.

Thanks,
Mike

2008-07-01 12:09:59

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Eric W. Biederman wrote:
>> Mike Travis <[email protected]> writes:
>>
>>
>>> H. Peter Anvin wrote:
>>>
>>>> Mike Travis wrote:
>>>>
>>>>> FYI, I did try this out and it caused the bootloader to scramble the
>>>>> loaded data. The first corruption I found was the .x86cpuvendor.init
>>>>> section contained all zeroes.
>>>>>
>>>>>
>>>> Explain what you mean with "the bootloader" in this context.
>>>>
>>>> -hpa
>>>>
>>> After the code was loaded (the compressed code, it seems that my GRUB
>>> doesn't support uncompressed loading), the above section contained
>>> zeroes. I snapped it fairly early, around secondary_startup_64, and
>>> then printed it in x86_64_start_kernel.
>>>
>>> The object file had the correct data (as displayed by objdump) so I'm
>>> assuming that the bootloading process didn't load the section correctly.
>>>
>>> Below was the linker script I used:
>>>
>>> --- linux-2.6.tip.orig/include/asm-generic/vmlinux.lds.h
>>> +++ linux-2.6.tip/include/asm-generic/vmlinux.lds.h
>>> @@ -373,9 +373,13 @@
>>>
>>> #ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>>> #define
>>> PERCPU(align) \
>>> - . =
>>> ALIGN(align); \
>>> + .data.percpu.abs =
>>> .; \
>>> percpu : { }
>>> :percpu \
>>> - __per_cpu_load =
>>> .; \
>>> + .data.percpu.rel : AT(.data.percpu.abs - LOAD_OFFSET)
>>> { \
>>> +
>>> BYTE(0) \
>>> + . =
>>> ALIGN(align); \
>>> + __per_cpu_load =
>>> .; \
>>> +
>>> } \
>>> .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET)
>>> { \
>>>
>>> *(.data.percpu.first) \
>>>
>>> *(.data.percpu.shared_aligned) \
>>> @@ -383,8 +387,8 @@
>>>
>>> *(.data.percpu.page_aligned) \
>>> ____per_cpu_size =
>>> .; \
>>>
>>> } \
>>> - . = __per_cpu_load +
>>> ____per_cpu_size; \
>>> - data : { } :data
>>> + . = __per_cpu_load + ____per_cpu_size;
>>> +
>>> #else
>>> #define
>>> PERCPU(align) \
>>> . =
>>> ALIGN(align); \
>>>
>>> It showed all the correct address in the map and __per_cpu_load was a
>>> relative symbol (which was the objective.)
>>>
>>> Btw, our simulator, which only loads uncompressed code, had the data
>>> correct,
>>> so it *may* only be a result of the code being compressed.
>>>
>>
>> Weird. Grub doesn't get involved in the decompression the kernel does it
>> all itself so we should be able to track where things go bad.
>>
>> Last I looked the compressed code was formed by essentially.
>> objcopy vmlinux -O binary vmlinux.bin
>> gzip vmlinux.bin
>> And then we tack on a magic header to the gzip compressed file.
>>
>> Are things only bad with the change above?
>
> No, the original crash being discussed was a GP fault in head_64.S as it
> tries to initialize the kernel segments. The cause was that the
> prototype GDT is all zero, even though it's an initialized variable, and
> inspection of vmlinux shows that it has the right contents. But somehow
> it's either 1) getting zeroed on load, or 2) is loaded to the wrong place.
>
> The zero-based PDA mechanism requires the introduction of a new ELF
> segment based at vaddr 0 which is sufficiently unusual that it wouldn't
> surprise me if its triggering some toolchain bug.
>
> Mike: what would happen if the PDA were based at 4k rather than 0? The
> stack canary would still be at its small offset (0x20?), but it doesn't
> need to be initialized. I'm not sure if doing so would fix anything,
> however.
>
> J

I don't know that the basing at 0 or 4k would matter. I'll post the patch
in its current form (as an RFC?) to show what was needed to initialize the
pda and gdt page pointer.

Thanks,
Mike

2008-07-01 16:28:26

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Jeremy Fitzhardinge <[email protected]> writes:
>
>
>> No, the original crash being discussed was a GP fault in head_64.S as it tries
>> to initialize the kernel segments. The cause was that the prototype GDT is all
>> zero, even though it's an initialized variable, and inspection of vmlinux shows
>> that it has the right contents. But somehow it's either 1) getting zeroed on
>> load, or 2) is loaded to the wrong place.
>>
>> The zero-based PDA mechanism requires the introduction of a new ELF segment
>> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me if
>> its triggering some toolchain bug.
>>
>
> Agreed. Given the previous description my hunch is that the bug is occurring
> during objcopy. If vmlinux is good and the compressed kernel is bad.
>
> It should be possible to look at vmlinux.bin and see if that was generated
> properly.
>
>
>> Mike: what would happen if the PDA were based at 4k rather than 0? The stack
>> canary would still be at its small offset (0x20?), but it doesn't need to be
>> initialized. I'm not sure if doing so would fix anything, however.
>>
>
> I'm dense today. Why are we doing a zero based pda? That seems the most
> likely culprit of linker trouble, and we should be able to put a smaller
> offset in the segment register to allow for everything to work as expected.
>

The only reason we need to do a zero-based PDA is because of the
boneheaded gcc/x86_64 ABI decision to put the stack canary at a fixed
offset from %gs (all they had to do was define it as a weak symbol we
could override). If we want to support stack-protector and unify the
handling of per-cpu variables, we need to rebase the per-cpu area at
zero, starting with the PDA.

My own inclination would be to drop stack-protector support until gcc
gets fixed, rather than letting it prevent us from unifying an area
which is in need of unification...

J

2008-07-01 16:55:27

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Eric W. Biederman wrote:
>> Jeremy Fitzhardinge <[email protected]> writes:
>>
>>
>>> No, the original crash being discussed was a GP fault in head_64.S as
>>> it tries
>>> to initialize the kernel segments. The cause was that the prototype
>>> GDT is all
>>> zero, even though it's an initialized variable, and inspection of
>>> vmlinux shows
>>> that it has the right contents. But somehow it's either 1) getting
>>> zeroed on
>>> load, or 2) is loaded to the wrong place.
>>>
>>> The zero-based PDA mechanism requires the introduction of a new ELF
>>> segment
>>> based at vaddr 0 which is sufficiently unusual that it wouldn't
>>> surprise me if
>>> its triggering some toolchain bug.
>>>
>>
>> Agreed. Given the previous description my hunch is that the bug is
>> occurring
>> during objcopy. If vmlinux is good and the compressed kernel is bad.
>>
>> It should be possible to look at vmlinux.bin and see if that was
>> generated
>> properly.
>>
>>
>>> Mike: what would happen if the PDA were based at 4k rather than 0?
>>> The stack
>>> canary would still be at its small offset (0x20?), but it doesn't
>>> need to be
>>> initialized. I'm not sure if doing so would fix anything, however.
>>>
>>
>> I'm dense today. Why are we doing a zero based pda? That seems the most
>> likely culprit of linker trouble, and we should be able to put a smaller
>> offset in the segment register to allow for everything to work as
>> expected.
>>
>
> The only reason we need to do a zero-based PDA is because of the
> boneheaded gcc/x86_64 ABI decision to put the stack canary at a fixed
> offset from %gs (all they had to do was define it as a weak symbol we
> could override). If we want to support stack-protector and unify the
> handling of per-cpu variables, we need to rebase the per-cpu area at
> zero, starting with the PDA.
>
> My own inclination would be to drop stack-protector support until gcc
> gets fixed, rather than letting it prevent us from unifying an area
> which is in need of unification...
>
> J

I might be inclined to agree except most of the past few months of
finding problems caused by NR_CPUS=4096 has been stack overflow. So
any help detecting this condition is very useful. I can get static
stacksizes (of course), but there's not a lot of help determining
call chains except via actually executing the code.

Thanks,
Mike

2008-07-01 16:59:33

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
>>
>> The zero-based PDA mechanism requires the introduction of a new ELF segment
>> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me if
>> its triggering some toolchain bug.
>
> Agreed. Given the previous description my hunch is that the bug is occurring
> during objcopy. If vmlinux is good and the compressed kernel is bad.
>

Actually, it's not all that unusual... it's pretty common in various
restricted environments. That being said, it's probably uncommon for
*64-bit* code.

-hpa

2008-07-01 17:27:15

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

H. Peter Anvin wrote:
> Eric W. Biederman wrote:
>>>
>>> The zero-based PDA mechanism requires the introduction of a new ELF
>>> segment
>>> based at vaddr 0 which is sufficiently unusual that it wouldn't
>>> surprise me if
>>> its triggering some toolchain bug.
>>
>> Agreed. Given the previous description my hunch is that the bug is
>> occurring
>> during objcopy. If vmlinux is good and the compressed kernel is bad.
>>
>
> Actually, it's not all that unusual... it's pretty common in various
> restricted environments. That being said, it's probably uncommon for
> *64-bit* code.

Well, it's also unusual because 1) it's vaddr 0, but paddr <high>, and
2) the PHDRs are not sorted by vaddr order. 2) might actually be a bug.

J

2008-07-01 18:51:46

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

"H. Peter Anvin" <[email protected]> writes:

> Eric W. Biederman wrote:
>>>
>>> The zero-based PDA mechanism requires the introduction of a new ELF segment
>>> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me
> if
>>> its triggering some toolchain bug.
>>
>> Agreed. Given the previous description my hunch is that the bug is occurring
>> during objcopy. If vmlinux is good and the compressed kernel is bad.
>>
>
> Actually, it's not all that unusual... it's pretty common in various restricted
> environments. That being said, it's probably uncommon for *64-bit* code.

It is a sensible thing to expect to work. By unusual I mean it isn't triggered
by normal userspace code. In general I find that ld features if they aren't used
in userspace and they aren't used in the kernel don't work reliably across versions.

Eric

2008-07-01 20:41:55

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge <[email protected]> writes:

> H. Peter Anvin wrote:
>> Eric W. Biederman wrote:
>>>>
>>>> The zero-based PDA mechanism requires the introduction of a new ELF segment
>>>> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me
>>>> if
>>>> its triggering some toolchain bug.
>>>
>>> Agreed. Given the previous description my hunch is that the bug is occurring
>>> during objcopy. If vmlinux is good and the compressed kernel is bad.
>>>
>>
>> Actually, it's not all that unusual... it's pretty common in various
>> restricted environments. That being said, it's probably uncommon for *64-bit*
>> code.
>
> Well, it's also unusual because 1) it's vaddr 0, but paddr <high>, and 2) the
> PHDRs are not sorted by vaddr order. 2) might actually be a bug.

I just looked and gcc does not use this technique for thread local data.

My initial concern about all of this was not making symbols section relative
is relieved as this all appears to be a 64bit arch thing where that doesn't
matter.

Has anyone investigated using the technique gcc uses for thread local storage?
http://people.redhat.com/drepper/tls.pdf

In particular using the local exec model so we can say:
movq %fs:x@tpoff,%rax

To load the contents of a per cpu variable x into %rax ?

If we can use that model it should make it easier to interface with things like
the stack protector code. Although we would still need to be very careful
about thread switches.

Eric

2008-07-01 21:10:53

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Jeremy Fitzhardinge <[email protected]> writes:
>
>
>> H. Peter Anvin wrote:
>>
>>> Eric W. Biederman wrote:
>>>
>>>>> The zero-based PDA mechanism requires the introduction of a new ELF segment
>>>>> based at vaddr 0 which is sufficiently unusual that it wouldn't surprise me
>>>>> if
>>>>> its triggering some toolchain bug.
>>>>>
>>>> Agreed. Given the previous description my hunch is that the bug is occurring
>>>> during objcopy. If vmlinux is good and the compressed kernel is bad.
>>>>
>>>>
>>> Actually, it's not all that unusual... it's pretty common in various
>>> restricted environments. That being said, it's probably uncommon for *64-bit*
>>> code.
>>>
>> Well, it's also unusual because 1) it's vaddr 0, but paddr <high>, and 2) the
>> PHDRs are not sorted by vaddr order. 2) might actually be a bug.
>>
>
> I just looked and gcc does not use this technique for thread local data.
>

Which technique? It does assume you put the thread-local data near %gs
(%fs in userspace), and it uses a small offset (positive or negative) to
reach it.

At present, the x86-64 only uses %gs-relative addressing to reach the
pda, which are always small positive offsets. It always accesses
per-cpu data in a two-step process of getting the base of per-cpu data,
then offsetting to find the particular variable.

x86-32 has no pda, and arranges %fs so that %fs:variable gets the percpu
variant of variable. The offsets are always quite large.

> My initial concern about all of this was not making symbols section relative
> is relieved as this all appears to be a 64bit arch thing where that doesn't
> matter.
>

Why's that? I thought you cared particularly about making the x86-64
kernel relocatable for kdump, and that using non-absolute symbols was
part of that?

> Has anyone investigated using the technique gcc uses for thread local storage?
> http://people.redhat.com/drepper/tls.pdf
>

The powerpc guys tried using gcc-level thread-local storage, but it
doesn't work well. per-cpu data and per-thread data have different
constraints, and its hard to tell gcc about them. For example, if you
have a section of preemptable code in your function, it's hard to tell
gcc not to cache a "thread-local" variable across it, even though we
could have switched CPUs in the meantime.

> In particular using the local exec model so we can say:
> movq %fs:x@tpoff,%rax
>
> To load the contents of a per cpu variable x into %rax ?
>
> If we can use that model it should make it easier to interface with things like
> the stack protector code. Although we would still need to be very careful
> about thread switches.
>

You mean cpu switches? We don't really have a notion of thread-local
data in the kernel, other than things hanging off the kernel stack.

J

2008-07-01 21:11:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

[email protected] (Eric W. Biederman) writes:
>
> Has anyone investigated using the technique gcc uses for thread local storage?

I investigated a long time ago (given when the binutils/gcc support
was much more primitive) and my conclusion back then was that doing
the same for kernel module (negative addresses) would need
new relocation types. And the pain of a binutils change didn't
seem worth it.

-Andi

2008-07-01 21:41:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge <[email protected]> writes:

>> I just looked and gcc does not use this technique for thread local data.
>>
>
> Which technique?

A section located at 0.
> It does assume you put the thread-local data near %gs (%fs in
> userspace), and it uses a small offset (positive or negative) to
> reach it.

Nope. It achieves that effect with a magic set of relocations instead
of linker magic.

> At present, the x86-64 only uses %gs-relative addressing to reach the pda, which
> are always small positive offsets. It always accesses per-cpu data in a
> two-step process of getting the base of per-cpu data, then offsetting to find
> the particular variable.
>
> x86-32 has no pda, and arranges %fs so that %fs:variable gets the percpu variant
> of variable. The offsets are always quite large.

As a practical matter I like that approach (except for extra code size
of the offsets).

>> My initial concern about all of this was not making symbols section relative
>> is relieved as this all appears to be a 64bit arch thing where that doesn't
>> matter.
>>
>
> Why's that? I thought you cared particularly about making the x86-64 kernel
> relocatable for kdump, and that using non-absolute symbols was part of that?

That is all true but unconnected.

For x86_64 the kernel lives at a fixed virtual address. So absolute or
non absolute symbols don't matter. Only __pa and a little bit of code
in head64.S that sets up the initial page tables has to be aware of it.
So relocation on x86_64 is practically free.

For i386 since virtual address space is precious and because there were
concerns about putting code in __pa we actually relocate the kernel symbols
during load right after decompression. When we do relocations absolute
symbols are a killer.

>> Has anyone investigated using the technique gcc uses for thread local storage?
>> http://people.redhat.com/drepper/tls.pdf
>>
>
> The powerpc guys tried using gcc-level thread-local storage, but it doesn't work
> well. per-cpu data and per-thread data have different constraints, and its hard
> to tell gcc about them. For example, if you have a section of preemptable code
> in your function, it's hard to tell gcc not to cache a "thread-local" variable
> across it, even though we could have switched CPUs in the meantime.

Yes, I completely agree with that. It doesn't mean however that we
can't keep gcc ignorant and generate the same code manually.


>> In particular using the local exec model so we can say:
>> movq %fs:x@tpoff,%rax
>>
>> To load the contents of a per cpu variable x into %rax ?
>>
>> If we can use that model it should make it easier to interface with things
> like
>> the stack protector code. Although we would still need to be very careful
>> about thread switches.
>>
>
> You mean cpu switches? We don't really have a notion of thread-local data in
> the kernel, other than things hanging off the kernel stack.

Well I was thinking threads switching on a cpu having the kinds of problems you
described when it was tried on ppc.


Eric

2008-07-01 21:51:43

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Andi Kleen <[email protected]> writes:

> [email protected] (Eric W. Biederman) writes:
>>
>> Has anyone investigated using the technique gcc uses for thread local storage?
>
> I investigated a long time ago (given when the binutils/gcc support
> was much more primitive) and my conclusion back then was that doing
> the same for kernel module (negative addresses) would need
> new relocation types. And the pain of a binutils change didn't
> seem worth it.

Thanks. That does seem to be the fly in the ointment of using the
builtin linker support. The kernel lives at negative addresses which
is necessary but weird.

If the @tpoff relocation doesn't work for us we clearly can't use
the support. I will have to look and see if usable relocation types
are generated from @tpoff.

Eric

2008-07-01 21:53:05

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Nope. It achieves that effect with a magic set of relocations instead
> of linker magic.
>

Well, the code gcc generates for -fstack-protector emits a literal
"%gs:40", so there's no relocations at all.

>> At present, the x86-64 only uses %gs-relative addressing to reach the pda, which
>> are always small positive offsets. It always accesses per-cpu data in a
>> two-step process of getting the base of per-cpu data, then offsetting to find
>> the particular variable.
>>
>> x86-32 has no pda, and arranges %fs so that %fs:variable gets the percpu variant
>> of variable. The offsets are always quite large.
>>
>
> As a practical matter I like that approach (except for extra code size
> of the offsets).
>

Yes, and there's no reason we couldn't do the same on 64-bit, aside from
the stack-protector's use of %gs:40. There's no code-size cost in large
offsets, since they're always 32-bits anyway (there's no short absolute
addressing mode).

>> The powerpc guys tried using gcc-level thread-local storage, but it doesn't work
>> well. per-cpu data and per-thread data have different constraints, and its hard
>> to tell gcc about them. For example, if you have a section of preemptable code
>> in your function, it's hard to tell gcc not to cache a "thread-local" variable
>> across it, even though we could have switched CPUs in the meantime.
>>
>
> Yes, I completely agree with that. It doesn't mean however that we
> can't keep gcc ignorant and generate the same code manually.
>

Yes, I see. I haven't looked at that specifically, but I think both
Rusty and Andi have, and it gets tricky with modules and -ve kernel
addresses, or something.

> Well I was thinking threads switching on a cpu having the kinds of problems you
> described when it was tried on ppc.

Uh, I think we're having a nomenclature imprecision here. Strictly
speaking, the kernel doesn't have threads, only tasks and CPUs. We only
care about per-cpu data, not per-task data, so the concern is not
"threads switching on a CPU" but "CPUs switching on (under) a task".
But I think we understand each other regardless ;)

If we manually generate %gs-relative references to percpu data, then
it's no different to what we do with 32-bit, whether it be a specific
symbol address or using the TLS relocations.

J

2008-07-02 00:24:20

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
>
> Yes, and there's no reason we couldn't do the same on 64-bit, aside from
> the stack-protector's use of %gs:40. There's no code-size cost in large
> offsets, since they're always 32-bits anyway (there's no short absolute
> addressing mode).
>
> If we manually generate %gs-relative references to percpu data, then
> it's no different to what we do with 32-bit, whether it be a specific
> symbol address or using the TLS relocations.
>

If we think the problem is the zero-basing triggering linker bugs, we
should probably just use a small offset, like 64 (put a small dummy
section before the .percpu.data section to occupy this section.)

I'm going to play with this a bit and see if I come up with something
sanish.

-hpa

2008-07-02 01:15:55

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

H. Peter Anvin wrote:
> Jeremy Fitzhardinge wrote:
>>
>> Yes, and there's no reason we couldn't do the same on 64-bit, aside
>> from the stack-protector's use of %gs:40. There's no code-size cost
>> in large offsets, since they're always 32-bits anyway (there's no
>> short absolute addressing mode).
>>
>> If we manually generate %gs-relative references to percpu data, then
>> it's no different to what we do with 32-bit, whether it be a specific
>> symbol address or using the TLS relocations.
>>
>
> If we think the problem is the zero-basing triggering linker bugs, we
> should probably just use a small offset, like 64 (put a small dummy
> section before the .percpu.data section to occupy this section.)
>
> I'm going to play with this a bit and see if I come up with something
> sanish.
>
> -hpa

One interesting thing I've discovered is the gcc --version may make a
difference.

The kernel panic that occurred from Ingo's config, I was able to replicate
with GCC 4.2.0 (which is on our devel server). But this one complained
about not being able to handle the STACK-PROTECTOR option so I moved
everything to another machine that has 4.2.4, and now it seems that it
works fine. I'm still re-verifying that the source bits and config options
are identical (it was a later git-remote update), and that in fact it is
the gcc --version, but that may be the conclusion. (My code also has some
patches submitted but not yet included in the tip/master tree. Curiously
just enabling some debug options changed the footprint of the panic.)

Are we allowed to insist on a specific level of GCC for compiling the
kernel?

Thanks,
Mike

2008-07-02 01:41:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> H. Peter Anvin wrote:
>> Jeremy Fitzhardinge wrote:
>>>
>>> Yes, and there's no reason we couldn't do the same on 64-bit, aside
>>> from the stack-protector's use of %gs:40. There's no code-size cost
>>> in large offsets, since they're always 32-bits anyway (there's no
>>> short absolute addressing mode).
>>>
>>> If we manually generate %gs-relative references to percpu data, then
>>> it's no different to what we do with 32-bit, whether it be a specific
>>> symbol address or using the TLS relocations.
>>>
>>
>> If we think the problem is the zero-basing triggering linker bugs, we
>> should probably just use a small offset, like 64 (put a small dummy
>> section before the .percpu.data section to occupy this section.)
>>
>> I'm going to play with this a bit and see if I come up with something
>> sanish.
>>
>> -hpa
>
> One interesting thing I've discovered is the gcc --version may make a
> difference.
>
> The kernel panic that occurred from Ingo's config, I was able to replicate
> with GCC 4.2.0 (which is on our devel server). But this one complained
> about not being able to handle the STACK-PROTECTOR option so I moved
> everything to another machine that has 4.2.4, and now it seems that it
> works fine. I'm still re-verifying that the source bits and config options
> are identical (it was a later git-remote update), and that in fact it is
> the gcc --version, but that may be the conclusion. (My code also has some
> patches submitted but not yet included in the tip/master tree. Curiously
> just enabling some debug options changed the footprint of the panic.)
>
> Are we allowed to insist on a specific level of GCC for compiling the
> kernel?

Depends on the root cause. If it turns out to be something that is buggy
in gcc and we can't work around. We might do something. I don't recall
that kind of thing happening often. I think our minimum gcc is currently
gcc-3.4.

Eric

2008-07-02 01:44:17

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
>
> One interesting thing I've discovered is the gcc --version may make a
> difference.
>
> The kernel panic that occurred from Ingo's config, I was able to replicate
> with GCC 4.2.0 (which is on our devel server). But this one complained
> about not being able to handle the STACK-PROTECTOR option so I moved
> everything to another machine that has 4.2.4, and now it seems that it
> works fine. I'm still re-verifying that the source bits and config options
> are identical (it was a later git-remote update), and that in fact it is
> the gcc --version, but that may be the conclusion. (My code also has some
> patches submitted but not yet included in the tip/master tree. Curiously
> just enabling some debug options changed the footprint of the panic.)
>
> Are we allowed to insist on a specific level of GCC for compiling the
> kernel?
>

Yes, but certainly not anything even close to that recent -- I think
right now we're supposed to support back to 3.2-something overall;
specific architectures might have more stringent requirements. There
are a couple of gcc versions known to miscompile the kernel that we
don't support; I don't know if 4.2.0 is one of them.

-hpa

2008-07-02 01:44:32

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
... I'm still re-verifying that the source bits and config options
> are identical (it was a later git-remote update), and that in fact it is
> the gcc --version, but that may be the conclusion.
...

Yup, it's the gcc --version that makes the difference. GCC 4.2.0 couldn't
boot past the grub screen, GCC 4.2.4 made it to the login prompt.

The only config changes were imposed by the make script:


stapp 125> diff ../configs/config-Tue_Jul__1_16_48_45_CEST_2008.bad ../build/ingo-test-0701/.config
4c4
< # Tue Jul 1 16:53:49 2008
---
> # Tue Jul 1 16:09:33 2008
64c64
< CONFIG_LOCALVERSION=""
---
> CONFIG_LOCALVERSION="-ingo-test-0701"
120d119
< CONFIG_USE_GENERIC_SMP_HELPERS=y
124d122
< # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
1294d1291
< CONFIG_THERMAL_HWMON=y

Posting the complete patchset RSN... (or tomorrow am so I can test some
more configs and functionality.)

Thanks,
Mike

2008-07-02 01:49:17

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Mike Travis wrote:
> ... I'm still re-verifying that the source bits and config options
>> are identical (it was a later git-remote update), and that in fact it is
>> the gcc --version, but that may be the conclusion.
> ...
>
> Yup, it's the gcc --version that makes the difference. GCC 4.2.0 couldn't
> boot past the grub screen, GCC 4.2.4 made it to the login prompt.
>

IIRC, 4.2.0, 4.2.1 and 4.3.0 are known to miscompile the kernel in one
way or another, however, that is from memory so don't quote me on it.

-hpa

2008-07-02 01:51:19

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>> H. Peter Anvin wrote:
>>> Jeremy Fitzhardinge wrote:
>>>> Yes, and there's no reason we couldn't do the same on 64-bit, aside
>>>> from the stack-protector's use of %gs:40. There's no code-size cost
>>>> in large offsets, since they're always 32-bits anyway (there's no
>>>> short absolute addressing mode).
>>>>
>>>> If we manually generate %gs-relative references to percpu data, then
>>>> it's no different to what we do with 32-bit, whether it be a specific
>>>> symbol address or using the TLS relocations.
>>>>
>>> If we think the problem is the zero-basing triggering linker bugs, we
>>> should probably just use a small offset, like 64 (put a small dummy
>>> section before the .percpu.data section to occupy this section.)
>>>
>>> I'm going to play with this a bit and see if I come up with something
>>> sanish.
>>>
>>> -hpa
>> One interesting thing I've discovered is the gcc --version may make a
>> difference.
>>
>> The kernel panic that occurred from Ingo's config, I was able to replicate
>> with GCC 4.2.0 (which is on our devel server). But this one complained
>> about not being able to handle the STACK-PROTECTOR option so I moved
>> everything to another machine that has 4.2.4, and now it seems that it
>> works fine. I'm still re-verifying that the source bits and config options
>> are identical (it was a later git-remote update), and that in fact it is
>> the gcc --version, but that may be the conclusion. (My code also has some
>> patches submitted but not yet included in the tip/master tree. Curiously
>> just enabling some debug options changed the footprint of the panic.)
>>
>> Are we allowed to insist on a specific level of GCC for compiling the
>> kernel?
>
> Depends on the root cause. If it turns out to be something that is buggy
> in gcc and we can't work around. We might do something. I don't recall
> that kind of thing happening often. I think our minimum gcc is currently
> gcc-3.4.
>
> Eric

Ouch. How far into it do we need to investigate? I can surely compare the
vmlinux object files, but I'm not cognizant enough about the linker internals
to examine much more than that.

But hey, maybe gcc-3.4 will work ok...? ;-)

[Or it may be the stack-protector thing is introducing better code? I'll try
some more config options tomorrow to see if that affects anything.]

Cheers,
Mike

2008-07-02 01:55:44

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

H. Peter Anvin wrote:
> Mike Travis wrote:
>> Mike Travis wrote:
>> ... I'm still re-verifying that the source bits and config options
>>> are identical (it was a later git-remote update), and that in fact it is
>>> the gcc --version, but that may be the conclusion.
>> ...
>>
>> Yup, it's the gcc --version that makes the difference. GCC 4.2.0
>> couldn't
>> boot past the grub screen, GCC 4.2.4 made it to the login prompt.
>>
>
> IIRC, 4.2.0, 4.2.1 and 4.3.0 are known to miscompile the kernel in one
> way or another, however, that is from memory so don't quote me on it.
>
> -hpa

Great. That's what's been on my devel server for the past 3 or 4 months now...

[And it's a big shared server that I'm but a small ant wandering around on it. ;-)]

2008-07-02 02:05:05

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
>
> For i386 since virtual address space is precious and because there were
> concerns about putting code in __pa we actually relocate the kernel symbols
> during load right after decompression. When we do relocations absolute
> symbols are a killer.
>

Well, it means making it clear to the relocator if it should relocate
those symbols or not. Since IIRC we're doing "all or nothing"
relocation, the relative offsets are always the same, even between sections.

-hpa

2008-07-02 02:51:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> Ouch. How far into it do we need to investigate? I can surely compare the
> vmlinux object files, but I'm not cognizant enough about the linker internals
> to examine much more than that.

As a first step we just need to know what we tell gcc or the linker
to do, and what is incorrectly output. Once the problem is
understood we can think about how to deal with the problem.

What we really need is a recipe for success.
A recipe for failure.

At that point it should be much easier for someone else to reproduce
the problem, and/or look into the specific details and see what is
wrong or to suggest patches.

The kernel is a significant enough program and a different enough one it
is hard to predict how things go.

Eric

2008-07-02 03:12:04

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

"H. Peter Anvin" <[email protected]> writes:

> Eric W. Biederman wrote:
>>
>> For i386 since virtual address space is precious and because there were
>> concerns about putting code in __pa we actually relocate the kernel symbols
>> during load right after decompression. When we do relocations absolute
>> symbols are a killer.
>>
>
> Well, it means making it clear to the relocator if it should relocate those
> symbols or not. Since IIRC we're doing "all or nothing" relocation, the
> relative offsets are always the same, even between sections.

Yes the relative offsets stay the same. I don't remember if ld
generates useable relocations for what it figures are absolute
symbols. I remember the solution was that anything that we wanted to
relocate we would make section relative, and anything else we would
leave absolute, and that those were easy things to do.

The nasty case is that occasionally ld has a bug where it turns section
relative symbols into global symbols if there is a section without data.
I don't recall us caring in those instances.

While interesting. All of this is irrelevant until we start talking
unification between x86_32 and x86_64, as only x86_32 has this
restriction.

Eric

2008-07-02 22:50:40

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

H. Peter Anvin wrote:
> Mike Travis wrote:
>> Mike Travis wrote:
>> ... I'm still re-verifying that the source bits and config options
>>> are identical (it was a later git-remote update), and that in fact it is
>>> the gcc --version, but that may be the conclusion.
>> ...
>>
>> Yup, it's the gcc --version that makes the difference. GCC 4.2.0
>> couldn't
>> boot past the grub screen, GCC 4.2.4 made it to the login prompt.
>>
>
> IIRC, 4.2.0, 4.2.1 and 4.3.0 are known to miscompile the kernel in one
> way or another, however, that is from memory so don't quote me on it.
>
> -hpa

This is definitely getting strange...

Ingo's randconfig at:

http://redhat.com/~mingo/misc/config-Tue_Jul__1_16_48_45_CEST_2008.bad

will only boot and run with gcc-4.2.4, with gcc-4.2.0 it fails at the grub screen.

Other configs like:

defconfig w/NR_CPUS=4096
nonuma

will only boot and run with gcc-4.2.0, and does the "failure at grub" screen with
gcc-4.2.4 ...!

The nosmp config works with either.

I'm looking at the generated vmlinux files now (well at least the assembler, I'm
not really familiar enough with the linker objects to look at those.) I'm also
trying to track which config options changes the behavior so radically.

Any other suggestions?

Thanks!
Mike

2008-07-03 07:09:40

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> will only boot and run with gcc-4.2.0, and does the "failure at grub" screen
> with
> gcc-4.2.4 ...!

Do these tests have an early serial console enabled? I want to confirm
that the failure is very early in boot before or just as we reach C code.

If you don't have an early console we could be failing late in the process just
not have visibility.

> Any other suggestions?

If you have a serial console to add a little bit of instrumentation super duper early.
Although looking at the generated objects might be just as interesting.

Eric

2008-07-07 17:17:50

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>> will only boot and run with gcc-4.2.0, and does the "failure at grub" screen
>> with
>> gcc-4.2.4 ...!
>
> Do these tests have an early serial console enabled? I want to confirm
> that the failure is very early in boot before or just as we reach C code.
>
> If you don't have an early console we could be failing late in the process just
> not have visibility.
>
>> Any other suggestions?
>
> If you have a serial console, add a little bit of instrumentation super duper early.
> Although looking at the generated objects might be just as interesting.
>
> Eric

Hi,

Sorry for the delay, been off the past 4 days.

I'll see how closely I can narrow down where the fault is. I have been inserting
very early printk's to track how far into the startup it gets before bailing.

Thanks,
Mike

2008-07-07 19:52:09

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:
> Hi,
>
> Sorry for the delay, been off the past 4 days.
>
> I'll see how closely I can narrow down where the fault is. I have been
> inserting
> very early printk's to track how far into the startup it gets before bailing.

Thanks. That should help narrow down what is going on.

Eric

2008-07-08 18:21:20

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>> Hi,
>>
>> Sorry for the delay, been off the past 4 days.
>>
>> I'll see how closely I can narrow down where the fault is. I have been
>> inserting
>> very early printk's to track how far into the startup it gets before bailing.
>
> Thanks. That should help narrow down what is going on.
>
> Eric

Unfortunately it's back to the problem of faulting before x86_64_start_kernel()
and grub just immediately reboots. So I'm back at analyzing assembler and
config differences.

Mike

2008-07-08 23:49:40

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>
>> Unfortunately it's back to the problem of faulting before x86_64_start_kernel()
>> and grub just immediately reboots. So I'm back at analyzing assembler and
>> config differences.
>>
>
> Ok. That is a narrow window of code. So it shouldn't be too bad, nasty though.
>
> If you would like to trace through it I have attached my serial port
> debugging routines that I use in that part of the code.

Last time it was doing this, it was a result of a triple-fault caused by
loading %ds with an all-zero gdt. I modified Xen to dump the CPU state
on triple-faults, so it was easy to pinpoint. I can do that again if it
helps.

J

2008-07-09 14:37:32

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
>> Unfortunately it's back to the problem of faulting before x86_64_start_kernel()
>> and grub just immediately reboots. So I'm back at analyzing assembler and
>> config differences.
>
> Ok. That is a narrow window of code. So it shouldn't be too bad, nasty though.
>
> If you would like to trace through it I have attached my serial port
> debugging routines that I use in that part of the code.
>
> Eric
>
>

Very cool, thanks!!! I will start using this. (I have been using the trick
to replace printk with early_printk so messages come out immediately instead
of from the log buf.)

I've been able to make some more progress. I've gotten to a point where it
panics from stack overflow. I've verified this by bumping THREAD_ORDER and
it boots fine. Now tracking down stack usages. (I have found a couple of new
functions using set_cpus_allowed(..., CPU_MASK_ALL) instead of
set_cpus_allowed_ptr(... , CPU_MASK_ALL_PTR). But these are not in the calling
sequence so subsequently are not the cause.)

One weird thing is early_idt_handler seems to have been called and that's one
thing our simulator does not mimic for standard Intel FSB systems - early pending
interrupts. (It's designed after all to mimic our h/w, and of course it's been
booting fine under that environment.)

Two patches are in the queue to reduce this stack usage:

Subject: [PATCH 1/1] sched: Reduce stack size in isolated_cpu_setup()
Subject: [PATCH 1/1] kthread: Reduce stack pressure in create_kthread and kthreadd

The other stack pigs are:

1640 sched_domain_node_span
1576 tick_notify
1576 setup_IO_APIC_irq
1576 move_task_off_dead_cpu
1560 arch_setup_ht_irq
1560 __assign_irq_vector
1544 tick_handle_oneshot_broadcast
1352 zc0301_ioctl_v4l2
1336 i2o_cfg_compat_ioctl
1192 sn9c102_ioctl_v4l2
1176 __build_sched_domains
1152 e1000_check_options
1144 __build_all_zonelists
1128 setup_IO_APIC
1096 sched_balance_self
1096 _cpu_down
1080 do_ida_request
1064 sched_rt_period_timer
1064 native_smp_call_function_mask
1048 setup_timer_IRQ0_pin
1048 setup_ioapic_dest
1048 set_ioapic_affinity_irq
1048 set_ht_irq_affinity
1048 pci_device_probe
1048 native_machine_crash_shutdown
1032 tick_do_periodic_broadcast
1032 sched_setaffinity
1032 native_flush_tlb_others
1032 local_cpus_show
1032 local_cpulist_show
1032 irq_select_affinity
1032 irq_complete_move
1032 irq_affinity_write_proc
1032 ioapic_retrigger_irq
1032 flush_tlb_mm
1032 flush_tlb_current_task
1032 fixup_irqs
1032 do_cciss_request
1032 create_irq
1024 uv_vector_allocation_domain
1024 uv_send_IPI_allbutself
1024 smp_call_function_single
1024 smp_call_function
1024 physflat_send_IPI_allbutself
1024 pci_bus_show_cpuaffinity
1024 move_masked_irq
1024 flush_tlb_page
1024 flat_send_IPI_allbutself
1000 security_load_policy

Only a few of these though I would think might get called early in
the boot, that might also be contributing to the stack overflow.

Oh yeah, I looked very closely at the differences in the assembler
for vmlinux when compiled with 4.2.0 (fails) and 4.2.4 (which boots
with the above mentioned THREAD_ORDER change) and except for some
weirdness around ident_complete it seems to be the same code. But
the per_cpu variables are in a completely different address order.
I wouldn't think that the -j10 for make could cause this but I can
verify that with -j1. But in any case, I'm sticking with 4.2.4 for
now.

Thanks,
Mike

2008-07-09 14:40:11

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Eric W. Biederman wrote:
>> Mike Travis <[email protected]> writes:
>>
>>
>>> Unfortunately it's back to the problem of faulting before
>>> x86_64_start_kernel()
>>> and grub just immediately reboots. So I'm back at analyzing
>>> assembler and
>>> config differences.
>>>
>>
>> Ok. That is a narrow window of code. So it shouldn't be too bad,
>> nasty though.
>>
>> If you would like to trace through it I have attached my serial port
>> debugging routines that I use in that part of the code.
>
> Last time it was doing this, it was a result of a triple-fault caused by
> loading %ds with an all-zero gdt. I modified Xen to dump the CPU state
> on triple-faults, so it was easy to pinpoint. I can do that again if it
> helps.
>
> J

Absolutely! I'll repost the latest version of the patchset. Still haven't
gotten around to enabling a XEN boot but it sure does sound handy.

Thanks,
Mike

2008-07-09 22:42:12

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> Very cool, thanks!!! I will start using this. (I have been using the trick
> to replace printk with early_printk so messages come out immediately instead
> of from the log buf.)

Just passing early_printk=xxx on the command line should have that effect.
Although I do admit you have to be a little bit into the boot before early_printk
is setup.

> I've been able to make some more progress. I've gotten to a point where it
> panics from stack overflow. I've verified this by bumping THREAD_ORDER and
> it boots fine. Now tracking down stack usages. (I have found a couple of new
> functions using set_cpus_allowed(..., CPU_MASK_ALL) instead of
> set_cpus_allowed_ptr(... , CPU_MASK_ALL_PTR). But these are not in the calling
> sequence so subsequently are not the cause.)

Is stack overflow the only problem you are seeing or are there still other mysteries?

> One weird thing is early_idt_handler seems to have been called and that's one
> thing our simulator does not mimic for standard Intel FSB systems - early
> pending
> interrupts. (It's designed after all to mimic our h/w, and of course it's been
> booting fine under that environment.)

That usually indicates you are taking an exception during boot not that you
have received an external interrupt. Something like a page fault or a
division by 0 error.

> Only a few of these though I would think might get called early in
> the boot, that might also be contributing to the stack overflow.

Still the call chain depth shouldn't really be changing. So why should it
matter? Ah. The high cpu count is growing cpumask_t so when you put
it on the stack. That makes sense. So what stars out as a 4 byte
variable on the stack in a normal setup winds up being a 1k variable
with 4k cpus.

> Oh yeah, I looked very closely at the differences in the assembler
> for vmlinux when compiled with 4.2.0 (fails) and 4.2.4 (which boots
> with the above mentioned THREAD_ORDER change) and except for some
> weirdness around ident_complete it seems to be the same code. But
> the per_cpu variables are in a completely different address order.
> I wouldn't think that the -j10 for make could cause this but I can
> verify that with -j1. But in any case, I'm sticking with 4.2.4 for
> now.

Reasonable. The practical problem is you are mixing a lot of changes
simultaneously and it confuses things. Compiling with NR_CPUS=4096
and working out the bugs from a growing cpumask_t, putting the per cpu
area in a zero based segment, and putting the pda into the
per cpu area all at the same time.

Who knows maybe the only difference between 4.2.0 and 4.2.4 is that
4.2.4 optimizes its stack usage a little better and you don't see
a stack overflow.

It would be very very good if we could separate out these issues
especially the segment for the per cpu variables. We need something
like that.

Eric

2008-07-09 23:30:28

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Eric W. Biederman wrote:
> Mike Travis <[email protected]> writes:
>
... (I have been using the trick
>> to replace printk with early_printk so messages come out immediately instead
>> of from the log buf.)
>
> Just passing early_printk=xxx on the command line should have that effect.
> Although I do admit you have to be a little bit into the boot before early_printk
> is setup.

What I meant was using early_printk in place of printk, which seems to stuff the
messages into the log buf until the serial console is setup fairly late in start_kernel.
I did this by removing printk() and renaming early_printk() to be printk (and a couple
other things like #define early_printk printk ...

>
>> I've been able to make some more progress. I've gotten to a point where it
>> panics from stack overflow. I've verified this by bumping THREAD_ORDER and
>> it boots fine. Now tracking down stack usages. (I have found a couple of new
>> functions using set_cpus_allowed(..., CPU_MASK_ALL) instead of
>> set_cpus_allowed_ptr(... , CPU_MASK_ALL_PTR). But these are not in the calling
>> sequence so subsequently are not the cause.)
>
> Is stack overflow the only problem you are seeing or are there still other mysteries?

I'm not entirely sure it's a stack overflow, the fault has a NULL dereference and
then the stack overflow message.

>
>> One weird thing is early_idt_handler seems to have been called and that's one
>> thing our simulator does not mimic for standard Intel FSB systems - early
>> pending
>> interrupts. (It's designed after all to mimic our h/w, and of course it's been
>> booting fine under that environment.)
>
> That usually indicates you are taking an exception during boot not that you
> have received an external interrupt. Something like a page fault or a
> division by 0 error.

I was thinking maybe an RTC interrupt? But a fault does sound more likely.

>
>> Only a few of these though I would think might get called early in
>> the boot, that might also be contributing to the stack overflow.
>
> Still the call chain depth shouldn't really be changing. So why should it
> matter? Ah. The high cpu count is growing cpumask_t so when you put
> it on the stack. That makes sense. So what stars out as a 4 byte
> variable on the stack in a normal setup winds up being a 1k variable
> with 4k cpus.

Yes, it's definitely these three factors combined:

NR_CPUS Patch_Applied THREAD_ORDER Results
256 NO 1 works (obviously ;-)
256 YES 1 works
4096 NO 1 works
4096 YES 1 panics
4096 YES 3 works (just happened to pick 3,
2 probably will work as well.)

> Reasonable. The practical problem is you are mixing a lot of changes
> simultaneously and it confuses things. Compiling with NR_CPUS=4096
> and working out the bugs from a growing cpumask_t, putting the per cpu
> area in a zero based segment, and putting the pda into the
> per cpu area all at the same time.

I've been testing NR_CPUS=4096 for quite a while and it's been very
reliable. It's just weird that this config fails with this new patch
applied. (default configs and some fairly normal distro configs also
work fine.) And with the zillion config straws we now have, spotting
the arbitrary needle is proving difficult. ;-)

> Who knows maybe the only difference between 4.2.0 and 4.2.4 is that
> 4.2.4 optimizes its stack usage a little better and you don't see
> a stack overflow.

I haven't tried the THREAD_ORDER=3 (or 2) under 4.2.0, but that would
seem to indicate this may be true.

> It would be very very good if we could separate out these issues
> especially the segment for the per cpu variables. We need something
> like that.

One reason I've been sticking with 4.2.4.

Thanks again for your help.

Mike

2008-07-10 00:12:10

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis <[email protected]> writes:

> What I meant was using early_printk in place of printk, which seems to stuff the
> messages into the log buf until the serial console is setup fairly late in
> start_kernel.
> I did this by removing printk() and renaming early_printk() to be printk (and a
> couple
> other things like #define early_printk printk ...

Last I looked after the magic early_printk setup. printk calls early_printk
and stuff messages in the log buffer.

It matters little though. As long as you get the print messages. Weird
cases where you don't get into C code worry me much more.

Once you get into C things are much easier to track.

>> Is stack overflow the only problem you are seeing or are there still other
> mysteries?
>
> I'm not entirely sure it's a stack overflow, the fault has a NULL dereference
> and
> then the stack overflow message.

Ok. Interesting.

>>> Only a few of these though I would think might get called early in
>>> the boot, that might also be contributing to the stack overflow.
>>
>> Still the call chain depth shouldn't really be changing. So why should it
>> matter? Ah. The high cpu count is growing cpumask_t so when you put
>> it on the stack. That makes sense. So what starts out as a 4 byte
>> variable on the stack in a normal setup winds up being a 1k variable
>> with 4k cpus.
>
> Yes, it's definitely these three factors combined:
>
> NR_CPUS Patch_Applied THREAD_ORDER Results
> 256 NO 1 works (obviously ;-)
> 256 YES 1 works
> 4096 NO 1 works
> 4096 YES 1 panics
> 4096 YES 3 works (just happened to pick 3,
> 2 probably will work as well.)

> I've been testing NR_CPUS=4096 for quite a while and it's been very
> reliable. It's just weird that this config fails with this new patch
> applied. (default configs and some fairly normal distro configs also
> work fine.) And with the zillion config straws we now have, spotting
> the arbitrary needle is proving difficult. ;-)

Right. Just please split your patch up. It would be good to see
if simply changing the per cpu segment address to 0 is related
to your problem. Or if it the other logic changes necessary to
put the use the pda as a per cpu variable?

I just noticed that we always allocate the pda in the per cpu section.

> One reason I've been sticking with 4.2.4.

Eric

2008-07-25 20:06:25

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
...
> Last time it was doing this, it was a result of a triple-fault caused by
> loading %ds with an all-zero gdt. I modified Xen to dump the CPU state
> on triple-faults, so it was easy to pinpoint. I can do that again if it
> helps.
>
> J

Hi Jeremy,

There are two question marks for my patchset. The first is in

arch/x86/xen/smp.c:xen_cpu_up()

287 #ifdef CONFIG_X86_64
288 /* Allocate node local memory for AP pdas */
289 WARN_ON(cpu == 0);
290 if (cpu > 0) {
291 rc = get_local_pda(cpu);
292 if (rc)
293 return rc;
294 }
295 #endif

and the second is at:

arch/x86/xen/enlighten.c:xen_start_kernel()

1748 #ifdef CONFIG_X86_64
1749 /* Disable until direct per-cpu data access. */
1750 have_vcpu_info_placement = 0;
1751 x86_64_init_pda();
1752 #endif

I believe with the pda folded into the percpu area, get_local_pda()
and x86_64_init_pda() have been removed, so these are no longer
required, yes?

Also, arch/x86/kernel/acpi/sleep.c:acpi_save_state_mem() sets up
the startup code address with:

102 initial_code = (unsigned long)wakeup_long64;
103 saved_magic = 0x123456789abcdef0;

Should the pda and gdt_page address also be setup as is done in
smpboot.c:do_boot_cpu():

(CONFIG_X86_64)
801 initial_pda = (unsigned long)get_cpu_pda(cpu);
802 #endif
803 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
804 initial_code = (unsigned long)start_secondary;


Thanks!
Mike

2008-07-25 20:13:08

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Jeremy Fitzhardinge wrote:
> ...
>
>> Last time it was doing this, it was a result of a triple-fault caused by
>> loading %ds with an all-zero gdt. I modified Xen to dump the CPU state
>> on triple-faults, so it was easy to pinpoint. I can do that again if it
>> helps.
>>
>> J
>>
>
> Hi Jeremy,
>
> There are two question marks for my patchset. The first is in
>
> arch/x86/xen/smp.c:xen_cpu_up()
>
> 287 #ifdef CONFIG_X86_64
> 288 /* Allocate node local memory for AP pdas */
> 289 WARN_ON(cpu == 0);
> 290 if (cpu > 0) {
> 291 rc = get_local_pda(cpu);
> 292 if (rc)
> 293 return rc;
> 294 }
> 295 #endif
>
> and the second is at:
>
> arch/x86/xen/enlighten.c:xen_start_kernel()
>
> 1748 #ifdef CONFIG_X86_64
> 1749 /* Disable until direct per-cpu data access. */
> 1750 have_vcpu_info_placement = 0;
> 1751 x86_64_init_pda();
> 1752 #endif
>
> I believe with the pda folded into the percpu area, get_local_pda()
> and x86_64_init_pda() have been removed, so these are no longer
> required, yes?
>

Well, presumably they need to be replaced with whatever setup you need
to do now.

xen_start_kernel() is the first function called after a Xen kernel boot,
and so it must make sure the early percpu setup is done before it can
start using percpu variables.

xen_cpu_up() needs to do whatever initialization needed for a new cpu's
percpu area (presumably whatever do_boot_cpu() does).

> Also, arch/x86/kernel/acpi/sleep.c:acpi_save_state_mem() sets up
> the startup code address with:
>
> 102 initial_code = (unsigned long)wakeup_long64;
> 103 saved_magic = 0x123456789abcdef0;
>
> Should the pda and gdt_page address also be setup as is done in
> smpboot.c:do_boot_cpu():
>
> (CONFIG_X86_64)
> 801 initial_pda = (unsigned long)get_cpu_pda(cpu);
> 802 #endif
> 803 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
> 804 initial_code = (unsigned long)start_secondary;
>
I don't think so. It looks like it's doing its own gdt save/restore.

J

2008-07-25 20:34:55

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>>... The first is in
>>
>> arch/x86/xen/smp.c:xen_cpu_up()
>>
>> 287 #ifdef CONFIG_X86_64
>> 288 /* Allocate node local memory for AP pdas */
>> 289 WARN_ON(cpu == 0);
>> 290 if (cpu > 0) {
>> 291 rc = get_local_pda(cpu);
>> 292 if (rc)
>> 293 return rc;
>> 294 }
>> 295 #endif
>>
>> and the second is at:
>>
>> arch/x86/xen/enlighten.c:xen_start_kernel()
>>
>> 1748 #ifdef CONFIG_X86_64
>> 1749 /* Disable until direct per-cpu data access. */
>> 1750 have_vcpu_info_placement = 0;
>> 1751 x86_64_init_pda();
>> 1752 #endif
>>
>> I believe with the pda folded into the percpu area, get_local_pda()
>> and x86_64_init_pda() have been removed, so these are no longer
>> required, yes?
>>
>
> Well, presumably they need to be replaced with whatever setup you need
> to do now.
>
> xen_start_kernel() is the first function called after a Xen kernel boot,
> and so it must make sure the early percpu setup is done before it can
> start using percpu variables.

Is this for the boot cpu (0), or for all cpus? For the boot cpu, I have
this now in arch/x86/kernel/setup_percpu.c:

+#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
+
+/* Initialize percpu offset for boot cpu (0) */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+ [0] = (unsigned long)__per_cpu_load
+};
+#else
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+#endif

So this should apply as well to the xen startup?

>
> xen_cpu_up() needs to do whatever initialization needed for a new cpu's
> percpu area (presumably whatever do_boot_cpu() does).
>

Does the startup include executing arch/x86/kernel/head_64.S:startup_64() ?
I see arch/x86/xen/xen-head.S:startup_xen() so I'm guessing not?

For the real startup, I do the following two things. But I'm not comfortable
enough with xen to think I'll get it right putting this in xen-head.S.

- lgdt early_gdt_descr(%rip)
+
+#ifdef CONFIG_SMP
+ /*
+ * For zero-based percpu variables, the base (__per_cpu_load) must
+ * be added to the offset of per_cpu__gdt_page. This is only needed
+ * for the boot cpu but we can't do this prior to secondary_startup_64.
+ * So we use a NULL gdt adrs to indicate that we are starting up the
+ * boot cpu and not the secondary cpus. do_boot_cpu() will fixup
+ * the gdt adrs for those cpus.
+ */
+#define PER_CPU_GDT_PAGE 0
+ movq early_gdt_descr_base(%rip), %rax
+ testq %rax, %rax
+ jnz 1f
+ movq $__per_cpu_load, %rax
+ addq $per_cpu__gdt_page, %rax
+ movq %rax, early_gdt_descr_base(%rip)
+#else
+#define PER_CPU_GDT_PAGE per_cpu__gdt_page
+#endif
+1: lgdt early_gdt_descr(%rip)

and:

+ * Setup up the real PDA.
+ *
+ * For SMP, the boot cpu (0) uses the static pda which is the first
+ * element in the percpu area (@__per_cpu_load). This pda is moved
+ * to the real percpu area once that is allocated. Secondary cpus
+ * will use the initial_pda value setup in do_boot_cpu().
*/
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_pda(%rip), %rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
+#ifdef CONFIG_SMP
+ movq %rax, %gs:pda_data_offset
+#endif

+ ENTRY(initial_pda)
+#ifdef CONFIG_SMP
+ .quad __per_cpu_load # Overwritten for secondary CPUs
+#else
+ .quad per_cpu__pda
+#endif


Thanks!
Mike

2008-07-25 20:43:39

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area

Mike Travis wrote:
> Is this for the boot cpu (0), or for all cpus? For the boot cpu, I have
> this now in arch/x86/kernel/setup_percpu.c:
>
> +#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
> +
> +/* Initialize percpu offset for boot cpu (0) */
> +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
> + [0] = (unsigned long)__per_cpu_load
> +};
> +#else
> unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> +#endif
>
> So this should apply as well to the xen startup?
>

If it's just a static initialization, then it should be fine. But some
equivalent of your head_64.S changes are needed to actually set things up?


>> xen_cpu_up() needs to do whatever initialization needed for a new cpu's
>> percpu area (presumably whatever do_boot_cpu() does).
>>
>>
>
> Does the startup include executing arch/x86/kernel/head_64.S:startup_64() ?
> I see arch/x86/xen/xen-head.S:startup_xen() so I'm guessing not?
>

No, it doesn't. It bypasses all that startup code. Aside from the few
instructions in xen-head.S, xen_start_kernel() is the first thing to get
run.

But when bringing up a secondary cpu, where does the new percpu memory
actually get allocated?

> For the real startup, I do the following two things. But I'm not comfortable
> enough with xen to think I'll get it right putting this in xen-head.S.
>

Yes, it needn't be in the asm code. I'll work out what to do. Looks
like I just need to do an appropriate wrmsr(MSR_GS_BASE, ).

J

2008-07-25 21:05:44

by Mike Travis

[permalink] [raw]
Subject: Re: [crash, bisected] Re: [PATCH 3/4] x86_64: Fold pda into per cpu area


Ok, I'll just post what I have now (compiles and boots cleanly)... and then
we can discuss these more extensively.

Thanks,
Mike

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> Is this for the boot cpu (0), or for all cpus? For the boot cpu, I have
>> this now in arch/x86/kernel/setup_percpu.c:
>>
>> +#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
>> +
>> +/* Initialize percpu offset for boot cpu (0) */
>> +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
>> + [0] = (unsigned long)__per_cpu_load
>> +};
>> +#else
>> unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>> +#endif
>>
>> So this should apply as well to the xen startup?
>>
>
> If it's just a static initialization, then it should be fine. But some
> equivalent of your head_64.S changes are needed to actually set things up?
>
>
>>> xen_cpu_up() needs to do whatever initialization needed for a new cpu's
>>> percpu area (presumably whatever do_boot_cpu() does).
>>>
>>>
>>
>> Does the startup include executing
>> arch/x86/kernel/head_64.S:startup_64() ?
>> I see arch/x86/xen/xen-head.S:startup_xen() so I'm guessing not?
>>
>
> No, it doesn't. It bypasses all that startup code. Aside from the few
> instructions in xen-head.S, xen_start_kernel() is the first thing to get
> run.
>
> But when bringing up a secondary cpu, where does the new percpu memory
> actually get allocated?
>
>> For the real startup, I do the following two things. But I'm not
>> comfortable
>> enough with xen to think I'll get it right putting this in xen-head.S.
>>
>
> Yes, it needn't be in the asm code. I'll work out what to do. Looks
> like I just need to do an appropriate wrmsr(MSR_GS_BASE, ).
>
> J