2005-12-15 02:33:55

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node

Here is take 2 on x86_64 node local pda allocation.

This patchset does away with the extra memory reference for non CONFIG_NUMA
case. The early cpu_to_node helps AMD and EM64T systems which work well
with CONFIG_ACPI_NUMA. cpu_to_node is not inited early for AMD systems
which work only with old style K8_NUMA. (Tested on EM64T NUMA and Tyan K8
dual core 4 cpu boxes)

Andi, I could not eliminate the need for an initial static pda array, since
sched_init needs the static per-cpu offset array for NR_CPUS early. Hope
this is OK.

Thanks,
Kiran

---

Patch enables early initialization of cpu_to_node. apicid_to_node is built by reading
the SRAT table, from acpi_numa_init, and x86_cpu_to_apicid is built by parsing the ACPI
MADT table, from acpi_boot_init. We combine these two tables and setup cpu_to_node.

Early initialization helps the static per_cpu_areas in getting pages from the correct node.

Tested on EM64T NUMA and Tyan K8 dual core board (with CONFIG_ACPI_NUMA + K8)

Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>

Index: linux-2.6.15-rc4/arch/x86_64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/setup.c 2005-12-02 16:25:19.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/setup.c 2005-12-12 01:49:00.000000000 -0800
@@ -669,6 +669,8 @@
acpi_boot_init();
#endif

+ init_cpu_to_node();
+
#ifdef CONFIG_X86_LOCAL_APIC
/*
* get boot-time SMP configuration:
Index: linux-2.6.15-rc4/arch/x86_64/mm/srat.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/mm/srat.c 2005-12-01 17:09:51.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/mm/srat.c 2005-12-12 01:19:00.000000000 -0800
@@ -226,4 +226,15 @@
return acpi_slit->entry[index + node_to_pxm(b)];
}

+/*
+ * Setup cpu_to_node using the SRAT lapics & ACPI MADT table
+ * info.
+ */
+void __init init_cpu_to_node(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
+}
+
EXPORT_SYMBOL(__node_distance);
Index: linux-2.6.15-rc4/include/linux/acpi.h
===================================================================
--- linux-2.6.15-rc4.orig/include/linux/acpi.h 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc4/include/linux/acpi.h 2005-12-12 01:52:28.000000000 -0800
@@ -519,11 +519,16 @@

#ifdef CONFIG_ACPI_NUMA
int acpi_get_pxm(acpi_handle handle);
+void __init init_cpu_to_node();
#else
static inline int acpi_get_pxm(acpi_handle handle)
{
return 0;
}
+
+static inline void init_cpu_to_node(void)
+{
+}
#endif

extern int pnpacpi_disabled;


2005-12-15 02:35:37

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep

Helper patch to change cpu_pda users to use macros to access cpu_pda
instead of the cpu_pda[] array.

Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>

Index: linux-2.6.15-rc1git/arch/x86_64/kernel/irq.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/irq.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/irq.c 2005-11-16 14:08:14.000000000 -0800
@@ -69,13 +69,13 @@
seq_printf(p, "NMI: ");
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
- seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+ seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
- seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+ seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
seq_putc(p, '\n');
#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/nmi.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/nmi.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/nmi.c 2005-11-16 14:08:14.000000000 -0800
@@ -155,19 +155,19 @@
smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

for (cpu = 0; cpu < NR_CPUS; cpu++)
- counts[cpu] = cpu_pda[cpu].__nmi_count;
+ counts[cpu] = cpu_pda(cpu)->__nmi_count;
local_irq_enable();
mdelay((10*1000)/nmi_hz); // wait 10 ticks

for (cpu = 0; cpu < NR_CPUS; cpu++) {
if (!cpu_online(cpu))
continue;
- if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
+ if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
counts[cpu],
- cpu_pda[cpu].__nmi_count);
+ cpu_pda(cpu)->__nmi_count);
nmi_active = 0;
lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
nmi_perfctr_msr = 0;
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/setup64.c 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/setup64.c 2005-11-16 14:08:14.000000000 -0800
@@ -30,7 +30,7 @@

cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

-struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
+struct x8664_pda _cpu_pda[NR_CPUS] __cacheline_aligned;

struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };

@@ -110,18 +110,18 @@
}
if (!ptr)
panic("Cannot allocate cpu data for CPU %d\n", i);
- cpu_pda[i].data_offset = ptr - __per_cpu_start;
+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
}
}

void pda_init(int cpu)
{
- struct x8664_pda *pda = &cpu_pda[cpu];
+ struct x8664_pda *pda = cpu_pda(cpu);

/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
- wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
+ wrmsrl(MSR_GS_BASE, pda);

pda->cpunumber = cpu;
pda->irqcount = -1;
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/smpboot.c 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/smpboot.c 2005-11-16 14:08:14.000000000 -0800
@@ -778,7 +778,7 @@

do_rest:

- cpu_pda[cpu].pcurrent = c_idle.idle;
+ cpu_pda(cpu)->pcurrent = c_idle.idle;

start_rip = setup_trampoline();

Index: linux-2.6.15-rc1git/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/traps.c 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/traps.c 2005-11-16 14:08:14.000000000 -0800
@@ -158,7 +158,7 @@
{
unsigned long addr;
const unsigned cpu = safe_smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
int i;
unsigned used = 0;

@@ -226,8 +226,8 @@
unsigned long *stack;
int i;
const int cpu = safe_smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);

// debugging aid: "show_stack(NULL, NULL);" prints the
// back trace for this cpu.
@@ -275,7 +275,7 @@
int in_kernel = !user_mode(regs);
unsigned long rsp;
const int cpu = safe_smp_processor_id();
- struct task_struct *cur = cpu_pda[cpu].pcurrent;
+ struct task_struct *cur = cpu_pda(cpu)->pcurrent;

rsp = regs->rsp;

Index: linux-2.6.15-rc1git/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/x8664_ksyms.c 2005-11-16 14:08:14.000000000 -0800
@@ -109,7 +109,7 @@
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);

-EXPORT_SYMBOL(cpu_pda);
+EXPORT_SYMBOL(_cpu_pda);
#ifdef CONFIG_SMP
EXPORT_SYMBOL(cpu_data);
EXPORT_SYMBOL(cpu_online_map);
Index: linux-2.6.15-rc1git/arch/x86_64/mm/numa.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/mm/numa.c 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/mm/numa.c 2005-11-16 14:11:41.000000000 -0800
@@ -270,7 +270,7 @@

void __cpuinit numa_set_node(int cpu, int node)
{
- cpu_pda[cpu].nodenumber = node;
+ cpu_pda(cpu)->nodenumber = node;
cpu_to_node[cpu] = node;
}

Index: linux-2.6.15-rc1git/include/asm-x86_64/pda.h
===================================================================
--- linux-2.6.15-rc1git.orig/include/asm-x86_64/pda.h 2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/include/asm-x86_64/pda.h 2005-11-16 14:08:14.000000000 -0800
@@ -27,7 +27,9 @@
#define IRQSTACK_ORDER 2
#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)

-extern struct x8664_pda cpu_pda[];
+extern struct x8664_pda _cpu_pda[];
+
+#define cpu_pda(i) (&_cpu_pda[i])

/*
* There is no fast way to get the base address of the PDA, all the accesses
Index: linux-2.6.15-rc1git/include/asm-x86_64/percpu.h
===================================================================
--- linux-2.6.15-rc1git.orig/include/asm-x86_64/percpu.h 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/include/asm-x86_64/percpu.h 2005-11-16 14:08:14.000000000 -0800
@@ -11,7 +11,7 @@

#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset)
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
#define __my_cpu_offset() read_pda(data_offset)

/* Separate out the type, so (int[3], foo) works. */

2005-12-15 02:38:04

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

Patch uses a static PDA array early at boot and reallocates processor PDA
with node local memory when kmalloc is ready, just before pda_init.
The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
that cpu is called.
(pda_init is called when APs are brought on at rest_init(). But
setup_per_cpu_areas is called early in start_kernel and
sched_init uses the per-cpu offset table early)

Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>

Index: linux-2.6.15-rc4/arch/x86_64/kernel/head64.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/head64.c 2005-12-12 01:11:01.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/head64.c 2005-12-12 02:24:02.000000000 -0800
@@ -92,6 +92,11 @@
memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));

+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_pda(i) = &boot_cpu_pda[i];
+#endif
+
pda_init(0);
copy_bootdata(real_mode_data);
#ifdef CONFIG_SMP
Index: linux-2.6.15-rc4/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/setup64.c 2005-12-12 02:24:00.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/setup64.c 2005-12-12 02:24:02.000000000 -0800
@@ -30,7 +30,12 @@

cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

-struct x8664_pda _cpu_pda[NR_CPUS] __cacheline_aligned;
+#ifdef CONFIG_NUMA
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+#else
+struct x8664_pda _cpu_pda[NR_CPUS] __read_mostly;
+#endif

struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };

@@ -119,6 +124,25 @@
{
struct x8664_pda *pda = cpu_pda(cpu);

+#ifdef CONFIG_NUMA
+ /* Allocate node local memory for AP pdas */
+ if (cpu) {
+ struct x8664_pda *newpda;
+ newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
+ cpu_to_node(cpu));
+ if (newpda) {
+ printk("Allocating node local PDA for cpu %d at 0x%lx\n",
+ cpu, (unsigned long) newpda);
+ memcpy(newpda, pda, sizeof (struct x8664_pda));
+ pda = newpda;
+ cpu_pda(cpu) = pda;
+ }
+ else
+ printk("Could not allocate node local PDA for cpu %d\n",
+ cpu);
+ }
+#endif
+
/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
wrmsrl(MSR_GS_BASE, pda);
Index: linux-2.6.15-rc4/include/asm-x86_64/pda.h
===================================================================
--- linux-2.6.15-rc4.orig/include/asm-x86_64/pda.h 2005-12-12 02:24:00.000000000 -0800
+++ linux-2.6.15-rc4/include/asm-x86_64/pda.h 2005-12-12 02:24:02.000000000 -0800
@@ -27,9 +27,14 @@
#define IRQSTACK_ORDER 2
#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)

+#ifdef CONFIG_NUMA
+extern struct x8664_pda *_cpu_pda[];
+extern struct x8664_pda boot_cpu_pda[];
+#define cpu_pda(i) (_cpu_pda[i])
+#else
extern struct x8664_pda _cpu_pda[];
-
#define cpu_pda(i) (&_cpu_pda[i])
+#endif

/*
* There is no fast way to get the base address of the PDA, all the accesses

2005-12-15 08:22:59

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

Ravikiran G Thirumalai a écrit :
> Patch uses a static PDA array early at boot and reallocates processor PDA
> with node local memory when kmalloc is ready, just before pda_init.
> The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> that cpu is called.
> (pda_init is called when APs are brought on at rest_init(). But
> setup_per_cpu_areas is called early in start_kernel and
> sched_init uses the per-cpu offset table early)

That seems good, thank you !

Do you have an idea of the performance gain we could expect from this node
local pda allocation ?

Say a CPU is on Node 1, was a change in pda (allocated on Node 0) immediately
mirrored on remote node or not ?

Eric

2005-12-15 09:36:58

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

> Do you have an idea of the performance gain we could expect from this node
> local pda allocation ?

I wouldn't expect very much.

> Say a CPU is on Node 1, was a change in pda (allocated on Node 0)
> immediatly mirrored on remote node or not ?

The Opteron caches are write back afaik - this means data only leaves
the L2 cache when other data pushes it out.
But the additional traffic on the interconnect was likely negligible.

If anything I would expect the reduced latency when a user space program eats up all the
cache and the PDA is needed on the next kernel entry to be helpful.

But it's not very much at least on an Opteron because the NUMA factor
isn't that bad. On Kiran's machines which likely have a higher NUMA
factor I guess it helps more.

-Andi

2005-12-15 09:42:34

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

On Wed, Dec 14, 2005 at 06:37:48PM -0800, Ravikiran G Thirumalai wrote:
> Patch uses a static PDA array early at boot and reallocates processor PDA
> with node local memory when kmalloc is ready, just before pda_init.
> The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> that cpu is called.
> (pda_init is called when APs are brought on at rest_init(). But
> setup_per_cpu_areas is called early in start_kernel and
> sched_init uses the per-cpu offset table early)
>

That is why I suggested to allocate it in smpboot.c in advance before
starting the AP. Can you please do that change?

-Andi

2005-12-15 09:44:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node

On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> Here is take 2 on x86_64 node local pda allocation.
>
> This patchset does away with the extra memory reference for non CONFIG_NUMA
> case. The early cpu_to_node helps AMD and EM64T systems which work well
> with CONFIG_ACPI_NUMA. cpu_to_node is not inited early for AMD systems
> which work only with old style K8_NUMA. (Tested on EM64 NUMA and Tyan K8
> dual core 4 cpu boxes)

Thanks for now testing on AMD too - that makes me more confident in your
patches.

> Andi, I could not eliminate the need for a initial static pda array, since
> sched_init needs the static per-cpu offset array for NR_CPUS early. Hope
> this is OK.

See my comment.

> + * Setup cpu_to_node using the SRAT lapcis & ACPI MADT table
> + * info.
> + */
> +void __init init_cpu_to_node(void)
> +{
> + int i;
> + for (i = 0; i < NR_CPUS; i++)
> + cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> +}

I would prefer it if you moved that to numa.c and run always
(even for the k8topology case). Otherwise k8topology will behave
differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
that.

-Andi

2005-12-15 18:47:25

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

On Thu, Dec 15, 2005 at 10:42:32AM +0100, Andi Kleen wrote:
> On Wed, Dec 14, 2005 at 06:37:48PM -0800, Ravikiran G Thirumalai wrote:
> > Patch uses a static PDA array early at boot and reallocates processor PDA
> > with node local memory when kmalloc is ready, just before pda_init.
> > The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> > that cpu is called.
> > (pda_init is called when APs are brought on at rest_init(). But
> > setup_per_cpu_areas is called early in start_kernel and
> > sched_init uses the per-cpu offset table early)
> >
>
> That is why I suggested to allocate it in smpboot.c in advance before
> starting the AP. Can you please do that change?

Maybe I am missing something, or not getting what you are suggesting;
As I see it,

asmlinkage void __init start_kernel(void)
{
...
...
...
setup_arch(&command_line); --> (1)
setup_per_cpu_areas(); --> (2)
...
sched_init(); --> (3)
...
vfs_caches_init_early();
mem_init();
kmem_cache_init(); --> (4)
...
rest_init() --> (5)
}


I could allocate memory for pda somewhere in setup_arch after cpu_to_node is
initialized, but I would have to use alloc_bootmem_node and allocate for
NR_CPUS, which could be wasteful. I cannot use kmalloc_node until after (4)
above, and sched_init refers to the per-cpu offset table before that.

So are you suggesting I use alloc_bootmem_node and allocate PDA for
NR_CPUS?

2005-12-15 19:01:53

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node

On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> > + * info.
> > + */
> > +void __init init_cpu_to_node(void)
> > +{
> > + int i;
> > + for (i = 0; i < NR_CPUS; i++)
> > + cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> > +}
>
> I would prefer it if you moved that to numa.c and run always
> (even for the k8topology case). Otherwise k8topology will behave
> differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
> that.

Sure! I moved it to srat.c based on your suggestion to my earlier post.
I will move this to numa.c.

Thanks,
Kiran

2005-12-16 00:20:04

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node

On Thu, Dec 15, 2005 at 11:01:42AM -0800, Ravikiran G Thirumalai wrote:
> On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> > On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> > > + * info.
> > > + */
> > > +void __init init_cpu_to_node(void)
> > > +{
> > > + int i;
> > > + for (i = 0; i < NR_CPUS; i++)
> > > + cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> > > +}
> >
> > I would prefer it if you moved that to numa.c and run always
> > (even for the k8topology case). Otherwise k8topology will behave
> > differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
> > that.
>
> Sure! I moved it to srat.c based on your suggestion to my earlier post.
> I will move this to numa.c.

Sorry for changing my mind on this. I hope you can bear with me.

-Andi

2005-12-16 00:19:40

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

> So are you suggesting I use alloc_bootmem_node and allocate PDA for
> NR_CPUS?

Continue to allocate the boot PDA of the BP statically - this should
be ok because BP should be always on node 0 (or if you're paranoid
about it you could also reallocate, but it's probably not needed)

And for the APs you allocate the PDA in smpboot.c before actually sending
the startup IPI to the AP.

-Andi

2005-12-16 03:55:53

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation

On Fri, Dec 16, 2005 at 01:19:34AM +0100, Andi Kleen wrote:
>
> And for the APs you allocate the PDA in smpboot.c before actually sending
> the startup IPI to the AP.

You mean wakeup_secondary_via_INIT, called by do_boot_cpu?
That is too late. sched_init happens much earlier, and the per-cpu offset
table for all AP cpus not present is referenced, and I hit an early exception.
sched_init is executed on the BP very early and sched_init does this:

for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;

rq = cpu_rq(i);

The cpu_rq macro ends up needing per-cpu offset table stored in cpu_pda of
the AP cpus, even before we hit the code to send startup IPIs.
(#define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset))
This is way before slab is ready. So I either use alloc_bootmem before
sched_init in setup_arch, or keep the static boot_cpu_pda.

Am I missing something?

Thanks,
Kiran

2005-12-16 08:11:45

by Ravikiran G Thirumalai

[permalink] [raw]
Subject: Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node

On Fri, Dec 16, 2005 at 01:20:01AM +0100, Andi Kleen wrote:
> On Thu, Dec 15, 2005 at 11:01:42AM -0800, Ravikiran G Thirumalai wrote:
> > On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> > > On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> >
> > Sure! I moved it to srat.c based on your suggestion to my earlier post.
> > I will move this to numa.c.
>
> Sorry for changing my mind on this. I hope you can bear with me.

No problem. I hadn't done this earlier 'cause I didn't have a K8 box to
test. Here is the modified patch.

Thanks,
Kiran

---
Patch enables early initialization of cpu_to_node.
apicid_to_node is built by reading the SRAT table, from acpi_numa_init with
ACPI_NUMA and k8_scan_nodes with K8_NUMA.
x86_cpu_to_apicid is built by parsing the ACPI MADT table, from acpi_boot_init. We combine these two tables and setup cpu_to_node.

Early initialization helps the static per_cpu_areas in getting pages from
the correct node.

Patch tested on TYAN dual core 4P board with K8 only and then ACPI_NUMA.
Tested on EM64T NUMA too.

Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>

Index: linux-2.6.15-rc5/arch/x86_64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc5.orig/arch/x86_64/kernel/setup.c 2005-12-14 17:02:14.000000000 -0800
+++ linux-2.6.15-rc5/arch/x86_64/kernel/setup.c 2005-12-14 17:16:07.000000000 -0800
@@ -669,6 +669,8 @@
acpi_boot_init();
#endif

+ init_cpu_to_node();
+
#ifdef CONFIG_X86_LOCAL_APIC
/*
* get boot-time SMP configuration:
Index: linux-2.6.15-rc5/arch/x86_64/mm/numa.c
===================================================================
--- linux-2.6.15-rc5.orig/arch/x86_64/mm/numa.c 2005-12-15 12:44:39.000000000 -0800
+++ linux-2.6.15-rc5/arch/x86_64/mm/numa.c 2005-12-15 23:03:07.000000000 -0800
@@ -330,6 +330,16 @@
return 1;
}

+/*
+ * Setup early cpu_to_node.
+ */
+void __init init_cpu_to_node(void)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++)
+ cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
+}
+
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
Index: linux-2.6.15-rc5/include/asm-x86_64/numa.h
===================================================================
--- linux-2.6.15-rc5.orig/include/asm-x86_64/numa.h 2005-12-14 15:33:35.000000000 -0800
+++ linux-2.6.15-rc5/include/asm-x86_64/numa.h 2005-12-15 23:11:35.000000000 -0800
@@ -21,6 +21,11 @@

extern unsigned char apicid_to_node[256];

+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+#else
+#define init_cpu_to_node() do {} while (0)
+#endif
#define NUMA_NO_NODE 0xff

#endif