Add x86-64 specific memory hot-add functions, Kconfig options,
and runtime kernel page table update functions to make
hot-add usable on x86-64 machines. Also, fixup the nefarious
conditional locking and exports pointed out by Andi.
Tested on Intel and IBM x86-64 memory hot-add capable systems.
Signed-off-by: Matt Tolentino <[email protected]>
---
diff -urNp linux-2.6.15/arch/x86_64/Kconfig linux-2.6.15-matt/arch/x86_64/Kconfig
--- linux-2.6.15/arch/x86_64/Kconfig 2006-01-06 14:42:45.000000000 -0500
+++ linux-2.6.15-matt/arch/x86_64/Kconfig 2006-01-06 11:32:12.000000000 -0500
@@ -283,7 +283,11 @@ config ARCH_DISCONTIGMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on NUMA
+ depends on (NUMA || EXPERIMENTAL)
+
+config ARCH_MEMORY_PROBE
+ def_bool y
+ depends on MEMORY_HOTPLUG
config ARCH_FLATMEM_ENABLE
def_bool y
@@ -293,6 +297,7 @@ source "mm/Kconfig"
config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool y
+ depends on NUMA
config NR_CPUS
int "Maximum number of CPUs (2-256)"
diff -urNp linux-2.6.15/arch/x86_64/mm/init.c linux-2.6.15-matt/arch/x86_64/mm/init.c
--- linux-2.6.15/arch/x86_64/mm/init.c 2006-01-06 14:42:45.000000000 -0500
+++ linux-2.6.15-matt/arch/x86_64/mm/init.c 2006-01-06 12:56:35.000000000 -0500
@@ -23,6 +23,8 @@
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -174,13 +176,19 @@ static struct temp_map {
{}
};
-static __init void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
{
struct temp_map *ti;
int i;
unsigned long pfn = table_end++, paddr;
void *adr;
+ if (after_bootmem) {
+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ *phys = __pa(adr);
+ return adr;
+ }
+
if (pfn >= end_pfn)
panic("alloc_low_page: ran out of memory");
for (i = 0; temp_mappings[i].allocated; i++) {
@@ -193,55 +201,86 @@ static __init void *alloc_low_page(int *
ti->allocated = 1;
__flush_tlb();
adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
+ memset(adr, 0, PAGE_SIZE);
*index = i;
*phys = pfn * PAGE_SIZE;
return adr;
}
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
{
- struct temp_map *ti = &temp_mappings[i];
+ struct temp_map *ti;
+
+ if (after_bootmem)
+ return;
+
+ ti = &temp_mappings[i];
set_pmd(ti->pmd, __pmd(0));
ti->allocated = 0;
}
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+ unsigned long entry;
+
+ if (address > end) {
+ for (; i < PTRS_PER_PMD; i++, pmd++)
+ set_pmd(pmd, __pmd(0));
+ break;
+ }
+ entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+ entry &= __supported_pte_mask;
+ set_pmd(pmd, __pmd(entry));
+ }
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+ if (pmd_none(*pmd)) {
+ spin_lock(&init_mm.page_table_lock);
+ phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
+ }
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
- long i, j;
+ long i = pud_index(address);
- i = pud_index(address);
pud = pud + i;
+
+ if (after_bootmem && pud_val(*pud)) {
+ phys_pmd_update(pud, address, end);
+ return;
+ }
+
for (; i < PTRS_PER_PUD; pud++, i++) {
int map;
unsigned long paddr, pmd_phys;
pmd_t *pmd;
- paddr = address + i*PUD_SIZE;
- if (paddr >= end) {
- for (; i < PTRS_PER_PUD; i++, pud++)
- set_pud(pud, __pud(0));
+ paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+ if (paddr >= end)
break;
- }
- if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
}
pmd = alloc_low_page(&map, &pmd_phys);
+ spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
- unsigned long pe;
-
- if (paddr >= end) {
- for (; j < PTRS_PER_PMD; j++, pmd++)
- set_pmd(pmd, __pmd(0));
- break;
- }
- pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
- pe &= __supported_pte_mask;
- set_pmd(pmd, __pmd(pe));
- }
+ phys_pmd_init(pmd, paddr, end);
+ spin_unlock(&init_mm.page_table_lock);
unmap_low_page(map);
}
__flush_tlb();
@@ -262,12 +301,15 @@ static void __init find_early_table_spac
table_start >>= PAGE_SHIFT;
table_end = table_start;
+
+ early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
This runs before bootmem is initialized and gets pages directly from the
physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
unsigned long next;
@@ -279,7 +321,8 @@ void __init init_memory_mapping(unsigned
* mapped. Unfortunately this is done currently before the nodes are
* discovered.
*/
- find_early_table_space(end);
+ if (!after_bootmem)
+ find_early_table_space(end);
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
@@ -287,20 +330,26 @@ void __init init_memory_mapping(unsigned
for (; start < end; start = next) {
int map;
unsigned long pud_phys;
- pud_t *pud = alloc_low_page(&map, &pud_phys);
+ pgd_t *pgd = pgd_offset_k(start);
+ pud_t *pud;
+
+ if (after_bootmem)
+ pud = pud_offset_k(pgd, __PAGE_OFFSET);
+ else
+ pud = alloc_low_page(&map, &pud_phys);
+
next = start + PGDIR_SIZE;
if (next > end)
next = end;
phys_pud_init(pud, __pa(start), __pa(next));
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+ if (!after_bootmem)
+ set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
unmap_low_page(map);
}
- asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+ if (!after_bootmem)
+ asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
__flush_tlb_all();
- early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
- table_start<<PAGE_SHIFT,
- table_end<<PAGE_SHIFT);
}
void __cpuinit zap_low_mappings(int cpu)
@@ -375,6 +424,9 @@ size_zones(unsigned long *z, unsigned lo
void __init paging_init(void)
{
unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+ memory_present(0, 0, end_pfn);
+ sparse_init();
size_zones(zones, holes, 0, end_pfn);
free_area_init_node(0, NODE_DATA(0), zones,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -415,6 +467,50 @@ void __init clear_kernel_mapping(unsigne
__flush_tlb_all();
}
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+ ClearPageReserved(page);
+ set_page_count(page, 1);
+ __free_page(page);
+ totalram_pages++;
+ num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+ struct pglist_data *pgdat = NODE_DATA(0);
+ struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+ ret = __add_pages(zone, start_pfn, nr_pages);
+ if (ret)
+ goto error;
+
+ init_memory_mapping(start, (start + size -1));
+
+ return ret;
+error:
+ printk("%s: Problem encountered in __add_pages!\n", __func__);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#endif
+
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
kcore_vsyscall;
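For context on how this entry point gets exercised: the new ARCH_MEMORY_PROBE option enables the sysfs probe file through which a physical address can be handed to the kernel, and that path lands in add_memory(); onlining the resulting sections later pushes each page through online_page(). Below is a minimal sketch of a hypothetical in-kernel caller; the function name and the firmware-supplied range are assumptions for illustration, not part of the patch.

#include <linux/memory_hotplug.h>

/*
 * Minimal sketch, not part of the patch: a hypothetical consumer of
 * the new hot-add entry point.
 */
static int example_hot_add(u64 start, u64 size)
{
	int ret;

	/*
	 * add_memory() (above) creates the sparsemem sections and
	 * extends the kernel direct mapping via init_memory_mapping().
	 */
	ret = add_memory(start, size);
	if (ret)
		return ret;

	/*
	 * The new pages are still offline here; onlining a section
	 * (normally from user space via sysfs) ends up calling
	 * online_page() on each page in it.
	 */
	return 0;
}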
On Monday 09 January 2006 16:21, Matt Tolentino wrote:
> Add x86-64 specific memory hot-add functions, Kconfig options,
> and runtime kernel page table update functions to make
> hot-add usable on x86-64 machines. Also, fixup the nefarious
> conditional locking and exports pointed out by Andi.
I'm trying to stabilize my tree for the 2.6.16 submission right now
and this one comes a bit too late and is a bit too involved
to slip through - sorry. I will consider it after Linus
has merged the whole batch of changes for 2.6.16 - so hopefully
in 2.6.17.
> +/*
> + * Memory hotplug specific functions
> + * These are only for non-NUMA machines right now.
How much work would it be to allow it for NUMA kernels too?
-Andi
For Opteron NUMA, we would need to:
0. stop the DCT on the node that will take the new DIMM
1. read the SPD ROM for the added DIMM
2. init the RAM size and update the memory routing table
3. init the timings
4. update the related info in TOM, TOM2, the MTRRs, and e820
It looks like we would need to pull some RAM-init code over from
LinuxBIOS (a purely illustrative sketch of this sequence follows below).
YH
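(Purely as an illustration of the sequence above, here it is as a C skeleton. Every helper is a hypothetical stub; no such code exists in the kernel, and the rest of the thread argues this work should stay in firmware.)

/*
 * Illustration only: the steps above as a skeleton.  All helpers are
 * hypothetical stubs; the kernel has no such code.
 */
static void dct_stop(int node) { /* 0. stop the DRAM controller (DCT) */ }
static void spd_read(int node, int dimm) { /* 1. read the DIMM's SPD ROM */ }
static void dram_init_size_routing(int node) { /* 2. RAM size + routing table */ }
static void dram_init_timing(int node, int dimm) { /* 3. DRAM timings */ }
static void update_tom_mtrr_e820(int node) { /* 4. TOM/TOM2, MTRRs, e820 */ }

static int opteron_dimm_hot_add(int node, int dimm)
{
	dct_stop(node);
	spd_read(node, dimm);
	dram_init_size_routing(node);
	dram_init_timing(node, dimm);
	update_tom_mtrr_e820(node);
	return 0;
}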
On Mon, 2006-01-09 at 10:51 -0800, Yinghai Lu wrote:
> For Opteron NUMA, we would need to:
> 0. stop the DCT on the node that will take the new DIMM
> 1. read the SPD ROM for the added DIMM
> 2. init the RAM size and update the memory routing table
> 3. init the timings
> 4. update the related info in TOM, TOM2, the MTRRs, and e820
>
> It looks like we would need to pull some RAM-init code over from LinuxBIOS.
Is the AMD box not going to use the ACPI add-memory mechanism?
--
keith <[email protected]>
I don't know. But even if it does, according to the BKDG you still
need to update the related routing tables in the northbridge.
YH
Do you mean using ACPI run-time ASL code to update these registers?
YH
On Monday 09 January 2006 20:28, Lu, Yinghai wrote:
> I don't know. But even if it does, according to the BKDG you still
> need to update the related routing tables in the northbridge.
I really don't want any code that touches the memory controller or
HT routing in the kernel. It would be far too fragile. Pushing that
work over to firmware via ACPI seems like the right thing to do.
-Andi
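(For reference, the ACPI route works by firmware raising a Notify on the memory device object; the acpi_memhotplug driver then evaluates the device's _CRS for the address range and calls add_memory(). A rough sketch of the shape of such a handler follows; it is not the actual driver code.)

/*
 * Rough sketch of the ACPI notification path; not the actual
 * acpi_memhotplug driver code.
 */
static void example_memory_notify(acpi_handle handle, u32 event, void *data)
{
	switch (event) {
	case ACPI_NOTIFY_BUS_CHECK:
	case ACPI_NOTIFY_DEVICE_CHECK:
		/*
		 * Firmware says memory appeared: evaluate _CRS for the
		 * physical range, then hand it to add_memory().
		 */
		break;
	case ACPI_NOTIFY_EJECT_REQUEST:
		/*
		 * Removal request: this would need remove_memory(),
		 * which the patch above still stubs out as -EINVAL.
		 */
		break;
	}
}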
Andi Kleen <[email protected]> wrote:
> On Monday 09 January 2006 16:21, Matt Tolentino wrote:
>> Add x86-64 specific memory hot-add functions, Kconfig options,
>> and runtime kernel page table update functions to make
>> hot-add usable on x86-64 machines. Also, fixup the nefarious
>> conditional locking and exports pointed out by Andi.
>
> I'm trying to stabilize my tree for the 2.6.16 submission right now
> and this one comes a bit too late and is a bit too involved
> to slip through - sorry. I will consider it after Linus
> has merged the whole batch of changes for 2.6.16 - so hopefully
> in 2.6.17.
>
>> +/*
>> + * Memory hotplug specific functions
>> + * These are only for non-NUMA machines right now.
>
> How much work would it be to allow it for NUMA kernels too?
I've looked at it in the past, but haven't pursued it to
date, nor have I quantified the work involved.
The reason for this approach thus far has been to
enable machines that support hot-add today...which are
non-NUMA.
matt
Andi Kleen <[email protected]> wrote:
> On Monday 09 January 2006 20:28, Lu, Yinghai wrote:
>> I don't know. But even if it does, according to the BKDG you still
>> need to update the related routing tables in the northbridge.
>
> I really don't want any code that touches the memory controller or
> HT routing in the kernel. It would be far too fragile. Pushing that
> work over to firmware via ACPI seems like the right thing to do.
The proposals I've seen for NUMA systems have all done this
through ACPI.
matt
On Mon, 2006-01-09 at 11:55 -0800, Tolentino, Matthew E wrote:
> Andi Kleen <[email protected]> wrote:
> > On Monday 09 January 2006 16:21, Matt Tolentino wrote:
> >> Add x86-64 specific memory hot-add functions, Kconfig options,
> >> and runtime kernel page table update functions to make
> >> hot-add usable on x86-64 machines. Also, fixup the nefarious
> >> conditional locking and exports pointed out by Andi.
> >
> > I'm trying to stabilize my tree for the 2.6.16 submission right now
> > and this one comes a bit too late and is a bit too involved
> > to slip through - sorry. I will consider it after Linus
> > has merged the whole batch of changes for 2.6.16 - so hopefully
> > in 2.6.17.
> >
> >> +/*
> >> + * Memory hotplug specific functions
> >> + * These are only for non-NUMA machines right now.
> >
> > How much work would it be to allow it for NUMA kernels too?
Not too much. I have a start on this code: just saving off the SRAT
locality information and using it during the add event to decide which
node the memory goes to. But when I went to test this weekend on a
multi-node system, the underlying __add_pages refused to add the pages.
The underlying sparsemem can do this (it works for PPC). I am still
collecting the debug info needed.
When I get the NUMA system sorted out I will post patches.
--
keith <[email protected]>
On Monday 09 January 2006 20:55, Tolentino, Matthew E wrote:
> I've looked at it in the past, but haven't pursued it to
> date, nor have I quantified the work involved.
> The reason for this approach thus far has been to
> enable machines that support hot-add today...which are
> non-NUMA.
Yes, but they'll likely be running NUMA kernels.
-Andi
> On Mon, 2006-01-09 at 11:55 -0800, Tolentino, Matthew E wrote:
> > Andi Kleen <[email protected]> wrote:
> > > On Monday 09 January 2006 16:21, Matt Tolentino wrote:
> > >> Add x86-64 specific memory hot-add functions, Kconfig options,
> > >> and runtime kernel page table update functions to make
> > >> hot-add usable on x86-64 machines. Also, fixup the nefarious
> > >> conditional locking and exports pointed out by Andi.
> > >
> > > I'm trying to stabilize my tree for the 2.6.16 submission right now
> > > and this one comes a bit too late and is a bit too involved
> > > to slip through - sorry. I will consider it after Linus
> > > has merged the whole batch of changes for 2.6.16 - so hopefully
> > > in 2.6.17.
> > >
> > >> +/*
> > >> + * Memory hotplug specific functions
> > >> + * These are only for non-NUMA machines right now.
> > >
> > > How much work would it be to allow it for NUMA kernels too?
>
> Not too much.
Hmm. If there is already a pgdat for the new memory, it is not too much
work; you just have to search for which node the new memory belongs to.
But if not (meaning it is a new node), then you must not only allocate
and initialize a new pgdat; every node's zonelists must also be updated,
because the other zonelists don't know the new node's zones exist.
As a further step, kmalloc is not a good way to allocate the new pgdat:
for performance the new pgdat should live on the new node itself,
but kmalloc can't allocate there, because that node is not yet
initialized.
I've worked on this for a long time, and I have a patch set against
2.6.14-rc12-git8-mhp1. But it needs many patches which have since been
included in the stock kernel, so I would like to move to 2.6.15-mm2.
(Fortunately, the basis of memory hot-add is included in the stock
kernel.) So, I would like to post the patches step by step, as follows:
1st) a pgdat already exists for the new memory.
2nd) allocate the pgdat by kmalloc.
3rd) allocate the pgdat on the new node itself.
In addition, mempolicy and cpusets must also be updated;
I've not worked on that yet.
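(A sketch of the zonelist problem just described: once a brand-new node is initialized, every node's zonelists must be rebuilt so the new zones become visible to the allocator. Roughly:)

/*
 * Sketch only: rebuild every node's zonelists after bringing up a new
 * node.  build_zonelists() is static to mm/page_alloc.c in 2.6.15, so
 * a real patch would have to export an interface for this.
 */
static void example_rebuild_all_zonelists(void)
{
	int nid;

	for_each_online_node(nid)
		build_zonelists(NODE_DATA(nid));
}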
> I have a start on this code: just saving off the SRAT
> locality information and using it during the add event to decide which
> node the memory goes to. But when I went to test this weekend on a
> multi-node system, the underlying __add_pages refused to add the pages.
> The underlying sparsemem can do this (it works for PPC). I am still
> collecting the debug info needed.
>
> When I get the NUMA system sorted out I will post patches.
IIRC, the SRAT is only for boot time, so it is not reliable once a
hotplug event occurs. The DSDT should be used instead of the SRAT,
as in the following 2 patches.
The first is to get the pxm from a physical address.
I'll post the second patch after this one.
Thanks.
Signed-off-by: Yasunori Goto <[email protected]>
Index: current_source/drivers/acpi/acpi_memhotplug.c
===================================================================
--- current_source.orig/drivers/acpi/acpi_memhotplug.c 2005-12-27 16:21:40.000000000 +0900
+++ current_source/drivers/acpi/acpi_memhotplug.c 2005-12-27 20:39:45.000000000 +0900
@@ -81,6 +81,22 @@ struct acpi_memory_device {
};
static int
+acpi_get_node_id(acpi_handle *handle, int *nid)
+{
+ int pxm;
+
+ ACPI_FUNCTION_TRACE("acpi_get_node_id");
+
+ pxm = acpi_get_pxm(handle);
+ if (pxm < 0)
+ return_VALUE(-ENODEV);
+
+ *nid = acpi_map_pxm_to_nid(pxm);
+
+ return_VALUE(0);
+}
+
+static int
acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
{
acpi_status status;
@@ -532,6 +548,78 @@ static acpi_status __init acpi_memory_se
return acpi_memory_set_name(mem_device);
}
+struct find_memdevice_arg{
+ u64 start_addr;
+ u64 size;
+};
+
+acpi_status
+acpi_memory_match_paddr_to_memdevice(acpi_handle handle, u32 lvl, void *context, void **retv)
+{
+ acpi_status status;
+ struct acpi_device *device = NULL;
+ struct find_memdevice_arg *arg = context;
+ struct acpi_memory_device *mem_device = NULL;
+
+ ACPI_FUNCTION_TRACE("acpi_memory_match_paddr_to_memdevice\n");
+
+ status = is_memory_device(handle);
+ if (ACPI_FAILURE(status))
+ return_ACPI_STATUS(AE_OK); /* Not memory device. continue */
+
+ if (acpi_bus_get_device(handle, &device) || !device)
+ return_ACPI_STATUS(AE_OK); /* Device is not attached. continue */
+
+ mem_device = acpi_driver_data(device);
+ if ((status = acpi_memory_check_device(mem_device)) < 0)
+ return_ACPI_STATUS(AE_OK); /* Not online. continue */
+
+ if (mem_device->start_addr > arg->start_addr ||
+ mem_device->end_addr + 1 < arg->start_addr + arg->size)
+ return_ACPI_STATUS(AE_OK); /* Not match. continue */
+
+ *retv = (void *)mem_device;
+
+ return_ACPI_STATUS(AE_CTRL_TERMINATE);
+}
+
+static int
+acpi_memory_find_memdevice(u64 start_addr, u64 size,
+ struct acpi_memory_device **mem_device)
+{
+ acpi_status status;
+ struct find_memdevice_arg arg;
+
+ ACPI_FUNCTION_TRACE("acpi_memory_find_memdevice\n");
+
+ arg.start_addr = start_addr;
+ arg.size = size;
+
+ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ acpi_memory_match_paddr_to_memdevice,
+ (void *)&arg, (void **)mem_device);
+
+ if (ACPI_FAILURE(status)) {
+ ACPI_DEBUG_PRINT ((ACPI_DB_ERROR, "walk_namespace_failed at acpi_memory_fine_memdevice\n"));
+ return_VALUE(-ENODEV);
+ }
+ return_VALUE(0);
+}
+
+int
+acpi_search_node_id(u64 start_addr, u64 size)
+{
+ struct acpi_memory_device *mem_device;
+ int nid = -1;
+ ACPI_FUNCTION_TRACE("acpi_search_node_id\n");
+
+ if (!acpi_memory_find_memdevice(start_addr, size, &mem_device))
+ acpi_get_node_id(mem_device->handle, &nid);
+
+ return_VALUE(nid);
+}
+
static int __init acpi_memory_device_init(void)
{
int result;
--
Yasunori Goto
> IIRC, the SRAT is only for boot time, so it is not reliable once a
> hotplug event occurs. The DSDT should be used instead of the SRAT,
> as in the following 2 patches.
> The first is to get the pxm from a physical address.
> I'll post the second patch after this one.
The second one is here.
This is the map/unmap between pxm and nid. It is just for ia64,
but I guess x86-64 is not so different.
Signed-off-by: Keiichiro Tokunaga <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>
Index: current_source/arch/ia64/kernel/acpi.c
===================================================================
--- current_source.orig/arch/ia64/kernel/acpi.c 2005-12-27 17:05:19.000000000 +0900
+++ current_source/arch/ia64/kernel/acpi.c 2005-12-27 17:08:18.000000000 +0900
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(pm_power_off);
unsigned char acpi_kbd_controller_present = 1;
unsigned char acpi_legacy_devices;
+static nodemask_t node_present_map = NODE_MASK_NONE;
static unsigned int __initdata acpi_madt_rev;
@@ -408,10 +409,11 @@ static int __init acpi_parse_madt(unsign
static int __initdata srat_num_cpus; /* number of cpus */
static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
+#define pxm_bit_clear(bit) (clear_bit(bit,(void *)pxm_flag))
#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
/* maps to convert between proximity domain and logical node ID */
int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-int __initdata nid_to_pxm_map[MAX_NUMNODES];
+int __devinitdata nid_to_pxm_map[MAX_NUMNODES];
static struct acpi_table_slit __initdata *slit_table;
/*
@@ -447,6 +449,36 @@ acpi_numa_processor_affinity_init(struct
srat_num_cpus++;
}
+int __devinit
+acpi_map_pxm_to_nid(int pxm)
+{
+ int nid;
+ nodemask_t tmp_map;
+
+ if (pxm_to_nid_map[pxm] != -1)
+ nid = pxm_to_nid_map[pxm];
+ else {
+ nodes_complement(tmp_map, node_present_map);
+ nid = first_node(tmp_map);
+ pxm_to_nid_map[pxm] = nid;
+ nid_to_pxm_map[nid] = pxm;
+ pxm_bit_set(pxm);
+ }
+
+ set_bit(nid, node_present_map.bits);
+
+ return nid;
+}
+
+void
+acpi_unmap_pxm_to_nid(int nid)
+{
+
+ if ((node_items[nid].num_cpus == 0) &&
+ (node_items[nid].num_memblks == 0))
+ clear_bit(nid, node_present_map.bits);
+}
+
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
@@ -504,18 +536,19 @@ void __init acpi_numa_arch_fixup(void)
memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
nodes_clear(node_online_map);
+ nodes_clear(node_present_map);
for (i = 0; i < MAX_PXM_DOMAINS; i++) {
if (pxm_bit_test(i)) {
- int nid = num_online_nodes();
- pxm_to_nid_map[i] = nid;
- nid_to_pxm_map[nid] = i;
+ int nid = acpi_map_pxm_to_nid(i);
node_set_online(nid);
}
}
/* set logical node id in memory chunk structure */
- for (i = 0; i < num_node_memblks; i++)
+ for (i = 0; i < num_node_memblks; i++) {
node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+ node_items[node_memblk[i].nid].num_memblks++;
+ }
/* assign memory bank numbers for each chunk on each node */
for_each_online_node(i) {
@@ -528,8 +561,10 @@ void __init acpi_numa_arch_fixup(void)
}
/* set logical node id in cpu structure */
- for (i = 0; i < srat_num_cpus; i++)
+ for (i = 0; i < srat_num_cpus; i++) {
node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+ node_items[node_cpuid[i].nid].num_cpus++;
+ }
printk(KERN_INFO "Number of logical nodes in system = %d\n",
num_online_nodes());
@@ -751,16 +786,50 @@ int acpi_map_cpu2node(acpi_handle handle
pxm_id = acpi_get_pxm(handle);
/*
- * Assuming that the container driver would have set the proximity
- * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
+ * Assuming that if at least one processor's PXM < 0, the system does
+ * not have multiple PXMs. In this case, there is one PXM and all the
+ * devices belong to it.
*/
- node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id];
+ if (pxm_id < 0)
+ pxm_id = 0;
+
+ /*
+ * Container driver might call cpu hotplug driver before memory hot-add.
+ * So, pxm_to_nid must be mapped here.
+ */
+ if ((pxm_id >= 0) && (pxm_id < MAX_PXM_DOMAINS)){
+ acpi_map_pxm_to_nid(pxm_id);
+ arch_register_node(pxm_to_nid_map[pxm_id]);
+ }
+
+ node_cpuid[cpu].nid = pxm_to_nid_map[pxm_id];
node_cpuid[cpu].phys_id = physid;
+ node_items[node_cpuid[cpu].nid].num_cpus++;
#endif
return (0);
}
+static
+void acpi_unmap_cpu2node(int cpu)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
+ int pxm_id;
+
+ nid = node_cpuid[cpu].nid;
+ pxm_id = nid_to_pxm_map[nid];
+
+ if (node_items[nid].num_cpus > 0)
+ node_items[nid].num_cpus--;
+
+ acpi_unmap_pxm_to_nid(pxm_id);
+
+ node_cpuid[cpu].phys_id = 0;
+ node_cpuid[cpu].nid = 0;
+#endif
+}
+
int acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -828,7 +897,7 @@ int acpi_unmap_lsapic(int cpu)
cpu_clear(cpu, cpu_present_map);
#ifdef CONFIG_ACPI_NUMA
- /* NUMA specific cleanup's */
+ acpi_unmap_cpu2node(cpu);
#endif
return (0);
Index: current_source/arch/ia64/mm/numa.c
===================================================================
--- current_source.orig/arch/ia64/mm/numa.c 2005-12-27 17:05:19.000000000 +0900
+++ current_source/arch/ia64/mm/numa.c 2005-12-27 17:08:18.000000000 +0900
@@ -28,6 +28,7 @@
int num_node_memblks;
struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
struct node_cpuid_s node_cpuid[NR_CPUS];
+struct node_items_s node_items[MAX_NUMNODES];
/*
* This is a matrix with "distances" between nodes, they should be
* proportional to the memory access latency ratios.
Index: current_source/include/asm-ia64/acpi.h
===================================================================
--- current_source.orig/include/asm-ia64/acpi.h 2005-12-27 17:05:19.000000000 +0900
+++ current_source/include/asm-ia64/acpi.h 2005-12-27 17:08:18.000000000 +0900
@@ -111,7 +111,7 @@ extern unsigned int get_cpei_target_cpu(
/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/
#define MAX_PXM_DOMAINS (256)
extern int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
+extern int __devinitdata nid_to_pxm_map[MAX_NUMNODES];
#endif
extern u16 ia64_acpiid_to_sapicid[];
Index: current_source/include/asm-ia64/numa.h
===================================================================
--- current_source.orig/include/asm-ia64/numa.h 2005-12-27 17:05:19.000000000 +0900
+++ current_source/include/asm-ia64/numa.h 2005-12-27 18:44:16.000000000 +0900
@@ -47,8 +47,14 @@ struct node_cpuid_s {
int nid; /* logical node containing this CPU */
};
+struct node_items_s {
+ int num_cpus; /* total num of cpus in a node */
+ int num_memblks; /* total num of memblks in a node */
+};
+
extern struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
extern struct node_cpuid_s node_cpuid[NR_CPUS];
+extern struct node_items_s node_items[MAX_NUMNODES];
/*
* ACPI 2.0 SLIT (System Locality Information Table)
@@ -68,11 +74,17 @@ extern int paddr_to_nid(unsigned long pa
extern int acpi_search_node_id(u64, u64);
#define firmware_phys_to_nid(start_addr, size) acpi_search_node_id(start_addr, size)
+extern int acpi_map_pxm_to_nid(int);
+extern void acpi_unmap_pxm_to_nid(int);
+#define arch_release_node_id(nid) acpi_unmap_pxm_to_nid(nid)
+
#else /* !CONFIG_NUMA */
#define paddr_to_nid(addr) 0
#define firmware_phys_to_nid(start_addr, size) 0
+#define acpi_map_pxm_to_nid(pxm) 0
+#define acpi_unmap_pxm_to_nid(pxm) {}
#endif /* CONFIG_NUMA */
#endif /* _ASM_IA64_NUMA_H */
--
Yasunori Goto
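(Taken together, a NUMA-aware add path could resolve the target node roughly as below. The caller is hypothetical; firmware_phys_to_nid() is the macro added by the first patch.)

/*
 * Hypothetical caller tying the two patches together:
 * firmware_phys_to_nid() resolves the node via the DSDT lookup above.
 */
static int example_numa_add_memory(u64 start, u64 size)
{
	int nid = firmware_phys_to_nid(start, size);

	if (nid < 0)
		nid = 0;	/* node unknown: fall back to node 0 */

	/*
	 * ...then add the pages to NODE_DATA(nid)'s zones with
	 * __add_pages(), as the x86-64 add_memory() does for node 0.
	 */
	return 0;
}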
On Tuesday 10 January 2006 13:43, Yasunori Goto wrote:
> > IIRC, the SRAT is only for boot time, so it is not reliable once a
> > hotplug event occurs. The DSDT should be used instead of the SRAT,
> > as in the following 2 patches.
> > The first is to get the pxm from a physical address.
> > I'll post the second patch after this one.
>
> The second one is here.
> This is the map/unmap between pxm and nid. It is just for ia64,
> but I guess x86-64 is not so different.
It probably is. The x86-64 NUMA setup is quite different from IA64.
-Andi
> On Tuesday 10 January 2006 13:43, Yasunori Goto wrote:
> > > IIRC, the SRAT is only for boot time, so it is not reliable once a
> > > hotplug event occurs. The DSDT should be used instead of the SRAT,
> > > as in the following 2 patches.
> > > The first is to get the pxm from a physical address.
> > > I'll post the second patch after this one.
> >
> > The second one is here.
> > This is the map/unmap between pxm and nid. It is just for ia64,
> > but I guess x86-64 is not so different.
>
> It probably is. The x86-64 NUMA setup is quite different from IA64.
Ah... OK.
I hope my patch can still be a good example for x86-64.
But, hmm, I find it a bit strange. Why is there a difference between
x86-64 and ia64 (and i386, for that matter) in how a pxm is mapped to
a nid? PXM is defined by ACPI, ACPI is used on all of them, and node
ids are used generically in Linux. Yet ia64 and i386 have
pxm_to_nid_map[], and x86-64 has its own pxm2node[].
Why are these arrays and this code defined per arch?
Does anyone know?
This code could probably be made common in drivers/acpi/numa.c,
as sketched below...
--
Yasunori Goto
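(The common helper suggested above might look roughly like the sketch below in drivers/acpi/numa.c. This is only the shape such a generic mapping could take, not code from any existing tree.)

#include <linux/nodemask.h>

/* Proximity bitmap length; _PXM is at most 255, as in the ia64 header above. */
#define MAX_PXM_DOMAINS 256

static int pxm_to_node_map[MAX_PXM_DOMAINS]
		= { [0 ... MAX_PXM_DOMAINS - 1] = -1 };
static nodemask_t nodes_found_map = NODE_MASK_NONE;

int acpi_map_pxm_to_node(int pxm)
{
	int node = pxm_to_node_map[pxm];

	if (node == -1) {
		/*
		 * First sighting of this proximity domain: hand out
		 * the next unused logical node id.
		 */
		node = first_unset_node(nodes_found_map);
		node_set(node, nodes_found_map);
		pxm_to_node_map[pxm] = node;
	}
	return node;
}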