2004-03-16 17:22:05

by Robert Picco

[permalink] [raw]
Subject: boot time node and memory limit options

This patch supports three boot line options. mem_limit limits the amount of physical memory.
node_mem_limit limits the amount of physical memory per node on a NUMA machine. nodes_limit
reduces the number of NUMA nodes to the value specified. On a NUMA machine an eliminated node's
CPU(s) are removed from the cpu_possible_map.

The patch has been tested on an IA64 NUMA machine and uniprocessor X86 machine.

thanks,

Bob


--- linux-2.6.4-orig/mm/page_alloc.c 2004-03-10 21:55:22.000000000 -0500
+++ linux-2.6.4/mm/page_alloc.c 2004-03-15 12:11:35.000000000 -0500
@@ -55,6 +55,43 @@
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

+static unsigned long mem_limit __initdata = ~0UL;
+static unsigned long total_mem __initdata;
+
+static int __init mem_setup(char *str)
+{
+ char *end;
+
+ mem_limit = memparse(str + 1, &end) - 1;
+ return 1;
+}
+
+__setup("mem_limit", mem_setup);
+
+#ifdef CONFIG_NUMA
+static unsigned long node_mem_limit __initdata = ~0UL;
+static long node_limit __initdata = MAX_NUMNODES;
+
+static int __init node_mem_setup(char *str)
+{
+ char *end;
+
+ node_mem_limit = memparse(str + 1, &end) - 1;
+ return 1;
+}
+
+static int __init nodes_setup(char *str)
+{
+ node_limit = simple_strtol(str+1, NULL, 10);
+ if (!node_limit)
+ node_limit = 1;
+ return 1;
+}
+
+__setup("node_mem_limit", node_mem_setup);
+__setup("nodes_limit", nodes_setup);
+#endif
+
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -1371,6 +1408,106 @@
}
}

+#ifdef CONFIG_NUMA
+static void __init do_trim_cpu(int node)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++)
+ if (cpu_to_node(i) == node)
+ cpu_clear(i, cpu_possible_map);
+ return;
+}
+#endif
+
+static unsigned long __init dma_zone_top(struct pglist_data *pgdat, int *dmazones)
+{
+ unsigned long top;
+#define DMA_SPAN_MIN ((64*1024*1024) >> PAGE_SHIFT)
+ top = 0UL;
+
+ if (pgdat->node_zones[ZONE_DMA].spanned_pages) {
+ if (*dmazones > 1)
+ --*dmazones;
+ else {
+ if (pgdat->node_zones[ZONE_DMA].spanned_pages > DMA_SPAN_MIN)
+ top = DMA_SPAN_MIN + pgdat->node_zones[ZONE_DMA].zone_start_pfn;
+ else
+ top = pgdat->node_zones[ZONE_DMA].zone_start_pfn +
+ pgdat->node_zones[ZONE_DMA].spanned_pages;
+ }
+ }
+
+ return top;
+}
+
+void __init do_mem_limits(void)
+{
+ unsigned long total, alloc, free, top;
+ struct pglist_data *pgdat;
+ int dmazones;
+
+#ifdef CONFIG_NUMA
+ if (node_limit == MAX_NUMNODES && node_mem_limit == ~0UL && mem_limit == ~0UL)
+#else
+ if (mem_limit == ~0UL)
+#endif
+ return;
+
+ dmazones = 0;
+ for_each_pgdat(pgdat)
+ if (pgdat->node_zones[ZONE_DMA].spanned_pages)
+ dmazones++;
+
+ for_each_pgdat(pgdat) {
+#ifdef CONFIG_NUMA
+ if (node_limit != MAX_NUMNODES && pgdat->node_id >= node_limit) {
+ top = dma_zone_top(pgdat, &dmazones);
+ bootmem_memory_size(pgdat, &alloc, &total);
+ bootmem_memory_trim(pgdat, total - alloc, top);
+ do_trim_cpu(pgdat->node_id);
+ continue;
+ }
+#endif
+ if (mem_limit != ~0UL) {
+ unsigned long mem;
+
+ bootmem_memory_size(pgdat, &alloc, &total);
+ mem = total << PAGE_SHIFT;
+ if ((mem + total_mem) <= mem_limit)
+ total_mem += mem;
+ else {
+ free = (mem + total_mem) - mem_limit;
+ total_mem = mem_limit;
+ top = dma_zone_top(pgdat, &dmazones);
+#ifdef CONFIG_NUMA
+ if (free == mem)
+ do_trim_cpu(pgdat->node_id);
+#endif
+ free >>= PAGE_SHIFT;
+ bootmem_memory_trim(pgdat, free, top);
+ }
+ }
+#ifdef CONFIG_NUMA
+ else if (node_mem_limit != ~0UL) {
+ unsigned long mem;
+
+ bootmem_memory_size(pgdat, &alloc, &total);
+ mem = total << PAGE_SHIFT;
+
+ if (mem <= node_mem_limit)
+ continue;
+
+ top = dma_zone_top(pgdat, &dmazones);
+ free = (mem - node_mem_limit) >> PAGE_SHIFT;
+ bootmem_memory_trim(pgdat, free, top);
+ }
+#endif
+ }
+
+ return;
+}
+
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
struct page *node_mem_map, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -1397,6 +1534,7 @@

void __init free_area_init(unsigned long *zones_size)
{
+ pgdat_list = &contig_page_data;
free_area_init_node(0, &contig_page_data, NULL, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
mem_map = contig_page_data.node_mem_map;
--- linux-2.6.4-orig/mm/bootmem.c 2004-03-10 21:55:24.000000000 -0500
+++ linux-2.6.4/mm/bootmem.c 2004-03-15 13:07:50.000000000 -0500
@@ -384,3 +384,51 @@
return NULL;
}

+void __init bootmem_memory_size(pg_data_t *pgdat, unsigned long *alloc, unsigned long *total)
+{
+ unsigned long ralloc, i, idx, v, m, *map;
+ bootmem_data_t *bdata;
+
+ bdata = pgdat->bdata;
+ idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+ *total = idx;
+ map = bdata->node_bootmem_map;
+ for (ralloc = 0, i = 0; i < idx; ) {
+ v = map[i / BITS_PER_LONG];
+ if (v) {
+ for (m = 1; m && i < idx; m <<= 1, i++)
+ if (v & m)
+ ralloc++;
+ } else
+ i += BITS_PER_LONG;
+ }
+
+ *alloc = ralloc;
+ return;
+}
+
+void __init bootmem_memory_trim(pg_data_t *pgdat, unsigned long trim, unsigned long top)
+{
+ unsigned long i, t, idx, v, m, *map;
+ bootmem_data_t *bdata;
+
+ bdata = pgdat->bdata;
+ idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+ if (top != 0UL)
+ top -= (bdata->node_boot_start >> PAGE_SHIFT);
+ map = bdata->node_bootmem_map;
+ for (i = idx - 1, t = 0; t < trim && i != 0 && i >= top; ) {
+ v = ~map[i / BITS_PER_LONG];
+ if (v) {
+ for (m = 1UL << (i & (BITS_PER_LONG - 1));
+ m && i >= top && i != 0 && t < trim; m >>= 1, i--)
+ if (v & m) {
+ t++;
+ map[i / BITS_PER_LONG] |= m;
+ }
+ } else
+ i -= min((unsigned long) BITS_PER_LONG, i);
+ }
+
+ return;
+}
--- linux-2.6.4-orig/init/main.c 2004-03-10 21:55:23.000000000 -0500
+++ linux-2.6.4/init/main.c 2004-03-12 14:45:37.000000000 -0500
@@ -450,6 +450,7 @@
}
#endif
page_address_init();
+ do_mem_limits();
mem_init();
kmem_cache_init();
if (late_time_init)
--- linux-2.6.4-orig/include/linux/mm.h 2004-03-10 21:55:21.000000000 -0500
+++ linux-2.6.4/include/linux/mm.h 2004-03-12 14:45:38.000000000 -0500
@@ -517,6 +517,7 @@
return pmd_offset(pgd, address);
}

+extern void do_mem_limits(void);
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long * zones_size, unsigned long zone_start_pfn,
--- linux-2.6.4-orig/include/linux/bootmem.h 2004-03-10 21:55:44.000000000 -0500
+++ linux-2.6.4/include/linux/bootmem.h 2004-03-12 14:45:38.000000000 -0500
@@ -58,6 +58,9 @@
extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
+extern void __init bootmem_memory_size(pg_data_t *pgdat, unsigned long *alloc, unsigned long *total);
+extern void __init bootmem_memory_trim(pg_data_t *pgdat, unsigned long trim, unsigned long top);
+
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
#define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))


2004-03-16 17:45:19

by Jesse Barnes

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
> This patch supports three boot line options. mem_limit limits the
> amount of physical memory. node_mem_limit limits the amount of
> physical memory per node on a NUMA machine. nodes_limit reduces the
> number of NUMA nodes to the value specified. On a NUMA machine an
> eliminated node's CPU(s) are removed from the cpu_possible_map.
>
> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
> machine.

I think this patch will be really useful. Matt and Martin, does it look
ok to you? Given that discontiguous support is pretty platform specific
right now, I thought it might be less code if it was done in arch/, but
a platform independent version is awfully nice...

Thanks,
Jesse

2004-03-16 17:41:52

by Randy.Dunlap

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Tue, 16 Mar 2004 12:07:44 -0500 Robert Picco wrote:

| This patch supports three boot line options. mem_limit limits the amount of physical memory.
| node_mem_limit limits the amount of physical memory per node on a NUMA machine. nodes_limit
| reduces the number of NUMA nodes to the value specified. On a NUMA machine an eliminated node's
| CPU(s) are removed from the cpu_possible_map.
|
| The patch has been tested on an IA64 NUMA machine and uniprocessor X86 machine.

These kernel boot ("command line") parameters need to be documented
in Documentation/kernel-parameters.txt, please:

| +__setup("mem_limit", mem_setup);
| +__setup("node_mem_limit", node_mem_setup);
| +__setup("nodes_limit", nodes_setup);

--
~Randy

2004-03-16 19:45:16

by Martin J. Bligh

[permalink] [raw]
Subject: Re: boot time node and memory limit options

--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:

> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>> This patch supports three boot line options. mem_limit limits the
>> amount of physical memory. node_mem_limit limits the amount of
>> physical memory per node on a NUMA machine. nodes_limit reduces the
>> number of NUMA nodes to the value specified. On a NUMA machine an
>> eliminated node's CPU(s) are removed from the cpu_possible_map.
>>
>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>> machine.
>
> I think this patch will be really useful. Matt and Martin, does it look
> ok to you? Given that discontiguous support is pretty platform specific
> right now, I thought it might be less code if it was done in arch/, but
> a platform independent version is awfully nice...

I haven't looked at your code yet, but I've had a similar patch in my tree
from Dave Hansen for a while you might want to look at:

diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
--- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
* function also increments numnodes with the number of nodes (quads)
* present.
*/
+extern unsigned long max_pages_per_node;
+extern int limit_mem_per_node;
+
+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
static void __init smp_dump_qct(void)
{
int node;
@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
eq->hi_shrd_mem_start - eq->priv_mem_size);
node_end_pfn[node] = MB_TO_PAGES(
eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+ if (node_size_pages(node) > max_pages_per_node)
+ node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
}
}
}
diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
--- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
+++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
@@ -142,7 +142,7 @@ static void __init probe_roms(void)
probe_extension_roms(roms);
}

-static void __init limit_regions(unsigned long long size)
+void __init limit_regions(unsigned long long size)
{
unsigned long long current_addr = 0;
int i;
@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
print_memory_map(who);
} /* setup_memory_region */

+unsigned long max_pages_per_node = 0xFFFFFFFF;

static void __init parse_cmdline_early (char ** cmdline_p)
{
@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
userdef=1;
}
}
+
+ if (c == ' ' && !memcmp(from, "memnode=", 8)) {
+ unsigned long long node_size_bytes;
+ if (to != command_line)
+ to--;
+ node_size_bytes = memparse(from+8, &from);
+ max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
+ }

if (c == ' ' && !memcmp(from, "memmap=", 7)) {
if (to != command_line)
diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
--- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
+++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
};
static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];

+#define chunk_start(i) (node_memory_chunk[i].start_pfn)
+#define chunk_end(i) (node_memory_chunk[i].end_pfn)
+#define chunk_size(i) (chunk_end(i)-chunk_start(i))
+
static int num_memory_chunks; /* total number of memory chunks */
static int zholes_size_init;
static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
}
}

+extern unsigned long max_pages_per_node;
+extern int limit_mem_per_node;
+
/* Parse the ACPI Static Resource Affinity Table */
static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
{
@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
node_memory_chunk[j].start_pfn,
node_memory_chunk[j].end_pfn);
}
-
+
/*calculate node_start_pfn/node_end_pfn arrays*/
for (nid = 0; nid < numnodes; nid++) {
- int been_here_before = 0;
+ unsigned long node_present_pages = 0;

+ node_start_pfn[nid] = -1;
for (j = 0; j < num_memory_chunks; j++){
- if (node_memory_chunk[j].nid == nid) {
- if (been_here_before == 0) {
- node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
- been_here_before = 1;
- } else { /* We've found another chunk of memory for the node */
- if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
- }
- }
- }
+ unsigned long proposed_size;
+
+ if (node_memory_chunk[j].nid != nid)
+ continue;
+
+ proposed_size = node_present_pages + chunk_size(j);
+ if (proposed_size > max_pages_per_node)
+ chunk_end(j) = chunk_start(j) +
+ max_pages_per_node - node_present_pages;
+ node_present_pages += chunk_size(j);
+
+ if (node_start_pfn[nid] == -1)
+ node_start_pfn[nid] = chunk_start(j);
+ node_end_pfn[nid] = chunk_end(j);
}
}
return 1;

2004-03-17 16:22:56

by Robert Picco

[permalink] [raw]
Subject: Re: boot time node and memory limit options

Hi Martin:

I did something like this before my posted patch in the IA64 ACPI NUMA
memory initialization code. It wasn't posted or even reviewed by
peers. Your patch below basically
trims the NUMA node memory information before the X86 discontig code
calls the bootmem initialization routines. The problem with coming up
with a solution at this level is each (at least ones I've looked at)
architecture handles low level memory initialization differently and
there needs to be a common way to parse early boot arguments.

The patch I posted was arrived at after some people suggested an
architecture independent patch. My patch
basically allocates memory from the bootmem allocator before mem_init
calls free_all_bootmem_core. It's architecture independent. If the
real goal is to limit physical memory before the bootmem allocator is
initialized, then my current patch doesn't accomplish this.

thanks,

Bob
Martin J. Bligh wrote:

>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:
>
>
>
>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>
>>
>>>This patch supports three boot line options. mem_limit limits the
>>>amount of physical memory. node_mem_limit limits the amount of
>>>physical memory per node on a NUMA machine. nodes_limit reduces the
>>>number of NUMA nodes to the value specified. On a NUMA machine an
>>>eliminated node's CPU(s) are removed from the cpu_possible_map.
>>>
>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>machine.
>>>
>>>
>>I think this patch will be really useful. Matt and Martin, does it look
>>ok to you? Given that discontiguous support is pretty platform specific
>>right now, I thought it might be less code if it was done in arch/, but
>>a platform independent version is awfully nice...
>>
>>
>
>I haven't looked at your code yet, but I've had a similar patch in my tree
>from Dave Hansen for a while you might want to look at:
>
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>--- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
> * function also increments numnodes with the number of nodes (quads)
> * present.
> */
>+extern unsigned long max_pages_per_node;
>+extern int limit_mem_per_node;
>+
>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
> static void __init smp_dump_qct(void)
> {
> int node;
>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
> eq->hi_shrd_mem_start - eq->priv_mem_size);
> node_end_pfn[node] = MB_TO_PAGES(
> eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>+ if (node_size_pages(node) > max_pages_per_node)
>+ node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
> }
> }
> }
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>--- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
> probe_extension_roms(roms);
> }
>
>-static void __init limit_regions(unsigned long long size)
>+void __init limit_regions(unsigned long long size)
> {
> unsigned long long current_addr = 0;
> int i;
>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
> print_memory_map(who);
> } /* setup_memory_region */
>
>+unsigned long max_pages_per_node = 0xFFFFFFFF;
>
> static void __init parse_cmdline_early (char ** cmdline_p)
> {
>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
> userdef=1;
> }
> }
>+
>+ if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>+ unsigned long long node_size_bytes;
>+ if (to != command_line)
>+ to--;
>+ node_size_bytes = memparse(from+8, &from);
>+ max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>+ }
>
> if (c == ' ' && !memcmp(from, "memmap=", 7)) {
> if (to != command_line)
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>--- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
> };
> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>
>+#define chunk_start(i) (node_memory_chunk[i].start_pfn)
>+#define chunk_end(i) (node_memory_chunk[i].end_pfn)
>+#define chunk_size(i) (chunk_end(i)-chunk_start(i))
>+
> static int num_memory_chunks; /* total number of memory chunks */
> static int zholes_size_init;
> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
> }
> }
>
>+extern unsigned long max_pages_per_node;
>+extern int limit_mem_per_node;
>+
> /* Parse the ACPI Static Resource Affinity Table */
> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
> {
>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
> node_memory_chunk[j].start_pfn,
> node_memory_chunk[j].end_pfn);
> }
>-
>+
> /*calculate node_start_pfn/node_end_pfn arrays*/
> for (nid = 0; nid < numnodes; nid++) {
>- int been_here_before = 0;
>+ unsigned long node_present_pages = 0;
>
>+ node_start_pfn[nid] = -1;
> for (j = 0; j < num_memory_chunks; j++){
>- if (node_memory_chunk[j].nid == nid) {
>- if (been_here_before == 0) {
>- node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>- been_here_before = 1;
>- } else { /* We've found another chunk of memory for the node */
>- if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>- }
>- }
>- }
>+ unsigned long proposed_size;
>+
>+ if (node_memory_chunk[j].nid != nid)
>+ continue;
>+
>+ proposed_size = node_present_pages + chunk_size(j);
>+ if (proposed_size > max_pages_per_node)
>+ chunk_end(j) = chunk_start(j) +
>+ max_pages_per_node - node_present_pages;
>+ node_present_pages += chunk_size(j);
>+
>+ if (node_start_pfn[nid] == -1)
>+ node_start_pfn[nid] = chunk_start(j);
>+ node_end_pfn[nid] = chunk_end(j);
> }
> }
> return 1;
>
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>
>

2004-03-17 16:37:18

by Martin J. Bligh

[permalink] [raw]
Subject: Re: boot time node and memory limit options

> I did something like this before my posted patch in the IA64
> ACPI NUMA memory initialization code. It wasn't posted or even
> reviewed by peers. Your patch below basically trims the NUMA node
> memory information before the X86 discontig code calls the bootmem
> initialization routines. The problem with coming up with a solution
> at this level is each (at least ones I've looked at) architecture
> handles low level memory initialization differently and there needs
> to be a common way to parse early boot arguments.
>
> The patch I posted was arrived at after some people suggested an
> architecture independent patch. My patch basically allocates memory
> from the bootmem allocator before mem_init calls free_all_bootmem_core.
> It's architecture independent. If the real goal is to limit physical
> memory before the bootmem allocator is initialized, then my current
> patch doesn't accomplish this.

Mmmm. That does worry me somewhat, as it's possible to allocate large
amounts of bootmem for hash tables, etc, IIRC. I think that's too late
to restrict things accurately. The fact that we only have bootmem on
node 0 on ia32 isn't going to help matters either ;-)

Don't we have the same arch dependent issue with the current mem= anyway?
Can we come up with something where the arch code calls back into a generic
function to derive limitations, and thereby at least get the parsing done
in a common routine for consistency? There aren't *that* many NUMA arches
to change anyway ...

M.

> Bob
> Martin J. Bligh wrote:
>
>> --On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:
>>
>>
>>
>>> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>
>>>
>>>> This patch supports three boot line options. mem_limit limits the
>>>> amount of physical memory. node_mem_limit limits the amount of
>>>> physical memory per node on a NUMA machine. nodes_limit reduces the
>>>> number of NUMA nodes to the value specified. On a NUMA machine an
>>>> eliminated node's CPU(s) are removed from the cpu_possible_map.
>>>>
>>>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>> machine.
>>>>
>>>>
>>> I think this patch will be really useful. Matt and Martin, does it look
>>> ok to you? Given that discontiguous support is pretty platform specific
>>> right now, I thought it might be less code if it was done in arch/, but
>>> a platform independent version is awfully nice...
>>>
>>>
>>
>> I haven't looked at your code yet, but I've had a similar patch in my tree
>> from Dave Hansen for a while you might want to look at:
>>
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>> --- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
>> +++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
>> @@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>> * function also increments numnodes with the number of nodes (quads)
>> * present.
>> */
>> +extern unsigned long max_pages_per_node;
>> +extern int limit_mem_per_node;
>> +
>> +#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>> static void __init smp_dump_qct(void)
>> {
>> int node;
>> @@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>> eq->hi_shrd_mem_start - eq->priv_mem_size);
>> node_end_pfn[node] = MB_TO_PAGES(
>> eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>> + if (node_size_pages(node) > max_pages_per_node)
>> + node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>> }
>> }
>> }
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>> --- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
>> +++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
>> @@ -142,7 +142,7 @@ static void __init probe_roms(void)
>> probe_extension_roms(roms);
>> }
>>
>> -static void __init limit_regions(unsigned long long size)
>> +void __init limit_regions(unsigned long long size)
>> {
>> unsigned long long current_addr = 0;
>> int i;
>> @@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>> print_memory_map(who);
>> } /* setup_memory_region */
>>
>> +unsigned long max_pages_per_node = 0xFFFFFFFF;
>>
>> static void __init parse_cmdline_early (char ** cmdline_p)
>> {
>> @@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>> userdef=1;
>> }
>> }
>> +
>> + if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>> + unsigned long long node_size_bytes;
>> + if (to != command_line)
>> + to--;
>> + node_size_bytes = memparse(from+8, &from);
>> + max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>> + }
>>
>> if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>> if (to != command_line)
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>> --- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
>> +++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
>> @@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>> };
>> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>
>> +#define chunk_start(i) (node_memory_chunk[i].start_pfn)
>> +#define chunk_end(i) (node_memory_chunk[i].end_pfn)
>> +#define chunk_size(i) (chunk_end(i)-chunk_start(i))
>> +
>> static int num_memory_chunks; /* total number of memory chunks */
>> static int zholes_size_init;
>> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>> @@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>> }
>> }
>>
>> +extern unsigned long max_pages_per_node;
>> +extern int limit_mem_per_node;
>> +
>> /* Parse the ACPI Static Resource Affinity Table */
>> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>> {
>> @@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>> node_memory_chunk[j].start_pfn,
>> node_memory_chunk[j].end_pfn);
>> }
>> -
>> +
>> /*calculate node_start_pfn/node_end_pfn arrays*/
>> for (nid = 0; nid < numnodes; nid++) {
>> - int been_here_before = 0;
>> + unsigned long node_present_pages = 0;
>>
>> + node_start_pfn[nid] = -1;
>> for (j = 0; j < num_memory_chunks; j++){
>> - if (node_memory_chunk[j].nid == nid) {
>> - if (been_here_before == 0) {
>> - node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>> - node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>> - been_here_before = 1;
>> - } else { /* We've found another chunk of memory for the node */
>> - if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>> - node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>> - }
>> - }
>> - }
>> + unsigned long proposed_size;
>> +
>> + if (node_memory_chunk[j].nid != nid)
>> + continue;
>> +
>> + proposed_size = node_present_pages + chunk_size(j);
>> + if (proposed_size > max_pages_per_node)
>> + chunk_end(j) = chunk_start(j) +
>> + max_pages_per_node - node_present_pages;
>> + node_present_pages += chunk_size(j);
>> +
>> + if (node_start_pfn[nid] == -1)
>> + node_start_pfn[nid] = chunk_start(j);
>> + node_end_pfn[nid] = chunk_end(j);
>> }
>> }
>> return 1;
>>
>> -
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>
>>
>>
>
>


2004-03-17 17:11:48

by Dave Hansen

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Wed, 2004-03-17 at 08:36, Martin J. Bligh wrote:
> > The patch I posted was arrived at after some people suggested an
> > architecture independent patch. My patch basically allocates memory
> > from the bootmem allocator before mem_init calls free_all_bootmem_core.
> > It's architecture independent. If the real goal is to limit physical
> > memory before the bootmem allocator is initialized, then my current
> > patch doesn't accomplish this.
>
> Don't we have the same arch dependent issue with the current mem= anyway?
> Can we come up with something where the arch code calls back into a generic
> function to derive limitations, and thereby at least get the parsing done
> in a common routine for consistency? There aren't *that* many NUMA arches
> to change anyway ...

The problem with doing it in generic code is that it has to happen
_after_ the memory layout is discovered. It's a mess to reconstruct all
of the necessary information about where holes stop and start, at least
from the current information that we store. Then, you have to go track
down any information that might have "leaked" into the arch code before
you parsed the mem=, which includes all of the {min,max}_{high,low}_pfn
variables. I prefer to just take care of it at its source where NUMA
information is read out of the hardware.

Every arch has its own way of describing its layout. Some use "chunks"
and others like ppc64 use LMB (logical memory blocks). If each arch was
willing to store their memory layout information in a generic way, then
we might have a shot at doing a generic mem= or a NUMA version.

I coded this up a few days ago to see if I could replace the x440 SRAT
chunks with it. I never got around to actually doing that part, but
something like this is what we need to do *layout* manipulation in an
architecture-agnostic way.

I started coding this before I thought *too* much about it. What I want
is a way to get rid of all of the crap that each architecture (and
subarch) have to store their physical memory layout. On normal x86 we
have the e820 and the EFI tables and on Summit/x440, we have yet another
way to do it.

What I'd like to do is present a standard way for all of these
architectures to store the information that they need to record at boot
time, plus make something flexible enough that we can use it for stuff
at runtime when hotplug memory is involved.

The code I'd like to see go away from boot-time is anything that deals
with arch-specific structures like the e820, functions like
lmb_end_of_DRAM(), or any code that deals with zholes. I'd like to get
it to a point where we can do a mostly arch-independent mem=.

So, here's a little bit of (now userspace) code that implements a very
simple way to track physical memory areas.

stuff that sucks:
- long type names/indiscriminate use of u64
- "section" is on my brain from CONFIG_NONLINEAR, probably don't want
to use that name again
- Doesn't coalesce adjacent sections with identical attributes, only
extends existing ones.
- could sort arrays instead of using lists for speed/space
- can leave "UNDEF" holes
- can't add new sections spanning 2 old ones


-- dave


Attachments:
layout.c (5.38 kB)
list.h (17.64 kB)
Download all attachments

2004-03-17 17:53:32

by Jesse Barnes

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Wed, Mar 17, 2004 at 09:09:45AM -0800, Dave Hansen wrote:
> Every arch has its own way of describing its layout. Some use "chunks"
> and others like ppc64 use LMB (logical memory blocks). If each arch was
> willing to store their memory layout information in a generic way, then
> we might have a shot at doing a generic mem= or a NUMA version.

>
> I coded this up a few days ago to see if I could replace the x440 SRAT
> chunks with it. I never got around to actually doing that part, but
> something like this is what we need to do *layout* manipulation in an
> architecture-agnostic way.
>
> I started coding this before I thought *too* much about it. What I want
> is a way to get rid of all of the crap that each architecture (and
> subarch) have to store their physical memory layout. On normal x86 we
> have the e820 and the EFI tables and on Summit/x440, we have yet another
> way to do it.

In some cases (ia64 for example) there are additional restrictions on
each memory chunk. For example, the EFI memory map may describe a
contiguous chunk of memory 28MB in size, but if your kernel page size
was set to 64MB, you'd have to throw it away as unusable. Should that
be dealt with in the arch independent code (i.e. is similar stuff done
on other platforms?) or is it best to only add sections that are usable?

> What I'd like to do is present a standard way for all of these
> architectures to store the information that they need to record at boot
> time, plus make something flexible enough that we can use it for stuff
> at runtime when hotplug memory is involved.

That would be great, what you have below seems sensible.

> The code I'd like to see go away from boot-time is anything that deals
> with arch-specific structures like the e820, functions like
> lmb_end_of_DRAM(), or any code that deals with zholes. I'd like to get
> it to a point where we can do a mostly arch-independent mem=.

So what you have here would be only for boot time setup, while
CONFIG_NONLINEAR would be used in lieu of multiple pgdats per node or a
virtual memmap in the case of intranode discontiguous memory?

Thanks,
Jesse

2004-03-17 18:13:12

by Dave Hansen

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Wed, 2004-03-17 at 09:51, Jesse Barnes wrote:
> In some cases (ia64 for example) there are additional restrictions on
> each memory chunk. For example, the EFI memory map may describe a
> contiguous chunk of memory 28MB in size, but if your kernel page size
> was set to 64MB, you'd have to throw it away as unusable. Should that
> be dealt with in the arch independent code (i.e. is similar stuff done
> on other platforms?) or is it best to only add sections that are usable?

I was really hoping that this mechanism can be as stupid about what it
contains as possible. It's _just_ there to store the memory layout, and
wouldn't decide or implement policy for the architecture.

The "runt" section of memory should be added to the structures and
tracked. If, for some random reason, another 36MB of contiguous memory
got added to it later, you could start to think about coalescing it with
the runt from before.

The place to ignore the runt is in your architecture code that sets up
the page tables. Your arch code would, of course, be reading from this
layout code.

> > What I'd like to do is present a standard way for all of these
> > architectures to store the information that they need to record at boot
> > time, plus make something flexible enough that we can use it for stuff
> > at runtime when hotplug memory is involved.
>
> That would be great, what you have below seems sensible.

Mostly sensible. I definitely need to make sure that it can cover all
the cases. The "section" terminology should probably be removed so that
we can use it for CONFIG_NONLINEAR, and we need to think about what
happens when conflicting sections are added. For instance, it might be
valid to add RAM from 0-4GB, then reserve 3.75-4GB later on for PCI
space. Also, the code currently leaves "undefined" sections instead of
creating holes. That can be dealt with later.

Anyway, I'm not too attached to that code, it just realizes an idea that
I have.

> > The code I'd like to see go away from boot-time is anything that deals
> > with arch-specific structures like the e820, functions like
> > lmb_end_of_DRAM(), or any code that deals with zholes. I'd like to get
> > it to a point where we can do a mostly arch-independent mem=.
>
> So what you have here would be only for boot time setup, while
> CONFIG_NONLINEAR would be used in lieu of multiple pgdats per node or a
> virtual memmap in the case of intranode discontiguous memory?

Well, I was hoping that whatever we use at boot-time could stick around
for runtime. I'd like to get to the point where the interface for
bringing up boot-time memory is the same for hotplugging memory. (for
2.7, of course)

Just as with the CPU hotplug code, having separate code paths for
hotplug memory is asking for trouble, because the coverage will never be
as high as the generic boot case.

-- dave

2004-03-17 18:14:42

by Robert Picco

[permalink] [raw]
Subject: Re: boot time node and memory limit options

Hi David:

Well our IA64 "mem=" is used in efi_memmap_walk. We could change the
name to "max_address=". The X86 "mem=" takes effect before the bootmem
allocator is initialized. My patch eliminates memory before
mem_init frees all bootmap memory. My proposed patch doesn't have the
same functionality as X86 "mem=".

thanks,

Bob

David Mosberger wrote:

>Hi Bob,
>
>
>
>>>>>>On Tue, 16 Mar 2004 12:07:44 -0500, Robert Picco <[email protected]> said:
>>>>>>
>>>>>>
>
> Bob> This patch supports three boot line options. mem_limit limits
> Bob> the amount of physical memory. node_mem_limit limits the
> Bob> amount of physical memory per node on a NUMA machine.
> Bob> nodes_limit reduces the number of NUMA nodes to the value
> Bob> specified. On a NUMA machine an eliminated node's CPU(s) are
> Bob> removed from the cpu_possible_map.
>
> Bob> The patch has been tested on an IA64 NUMA machine and
> Bob> uniprocessor X86 machine.
>
>Would it make sense to improve on the consistency of the "mem" option
>at the same time. IIRC, "mem=N" on x86 means "limit amount of memory
>to N", whereas on ia64 it means "ignore memory above N". In my
>opinion, it would make sense to change the ia64 "mem" option to
>match the behavior on x86 and then to use "mem_limit=N" for the
>"ignore memory above N" case (which is very useful for testing
>addressing issues, such as I/O MMU issues).
>
>Thanks,
>
> --david
>
>
>

2004-03-17 19:37:30

by Robert Picco

[permalink] [raw]
Subject: Re: boot time node and memory limit options

Martin J. Bligh wrote:

>>I did something like this before my posted patch in the IA64
>>ACPI NUMA memory initialization code. It wasn't posted or even
>>reviewed by peers. Your patch below basically trims the NUMA node
>>memory information before the X86 discontig code calls the bootmem
>>initialization routines. The problem with coming up with a solution
>>at this level is each (at least ones I've looked at) architecture
>>handles low level memory initialization differently and there needs
>>to be a common way to parse early boot arguments.
>>
>>The patch I posted was arrived at after some people suggested an
>>architecture independent patch. My patch basically allocates memory
>>from the bootmem allocator before mem_init calls free_all_bootmem_core.
>>It's architecture independent. If the real goal is to limit physical
>>memory before the bootmem allocator is initialized, then my current
>>patch doesn't accomplish this.
>>
>>
>
>Mmmm. That does worry me somewhat, as its possible to allocate large
>amounts of bootmem for hash tables, etc, IIRC. I think that's too late
>to restrict things accurately. The fact that we only have bootmem on
>node 0 on ia32 isn't going to help matters either ;-)
>
>
I agree with sizing issues at boot of hash tables. I've seen them all
recover when failing to allocate based
on num_physpages and then iterating at smaller allocations until
successful. All the primary initialization allocations recover but
probably not all drivers. You could have similar failure scenarios
for any boot line parameter(s) implementation which reduces memory.

>Don't we have the same arch dependant issue with the current mem= anyway?
>Can we come up with something where the arch code calls back into a generic
>function to derive limitations, and thereby at least get the parsing done
>in a common routine for consistency? There aren't *that* many NUMA arches
>to change anyway ...
>
>
Well this is heading in the direction Dave has proposed and probably 2.7
material. This would really solve the problem differently than my
proposed patch.

thanks,

Bob

>M.
>
>
>
>>Bob
>>Martin J. Bligh wrote:
>>
>>
>>
>>>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:
>>>
>>>
>>>
>>>
>>>
>>>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>
>>>>
>>>>
>>>>
>>>>>This patch supports three boot line options. mem_limit limits the
>>>>>amount of physical memory. node_mem_limit limits the amount of
>>>>>physical memory per node on a NUMA machine. nodes_limit reduces the
>>>>>number of NUMA nodes to the value specified. On a NUMA machine an
>>>>>eliminated node's CPU(s) are removed from the cpu_possible_map.
>>>>>
>>>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>machine.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>I think this patch will be really useful. Matt and Martin, does it look
>>>>ok to you? Given that discontiguous support is pretty platform specific
>>>>right now, I thought it might be less code if it was done in arch/, but
>>>>a platform independent version is awfully nice...
>>>>
>>>>
>>>>
>>>>
>>>I haven't looked at your code yet, but I've had a similar patch in my tree
>>>from Dave Hansen for a while you might want to look at:
>>>
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>--- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
>>>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
>>>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>> * function also increments numnodes with the number of nodes (quads)
>>> * present.
>>> */
>>>+extern unsigned long max_pages_per_node;
>>>+extern int limit_mem_per_node;
>>>+
>>>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>static void __init smp_dump_qct(void)
>>>{
>>> int node;
>>>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>> eq->hi_shrd_mem_start - eq->priv_mem_size);
>>> node_end_pfn[node] = MB_TO_PAGES(
>>> eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>+ if (node_size_pages(node) > max_pages_per_node)
>>>+ node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>> }
>>> }
>>>}
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>--- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
>>>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
>>>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>> probe_extension_roms(roms);
>>>}
>>>
>>>-static void __init limit_regions(unsigned long long size)
>>>+void __init limit_regions(unsigned long long size)
>>>{
>>> unsigned long long current_addr = 0;
>>> int i;
>>>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>> print_memory_map(who);
>>>} /* setup_memory_region */
>>>
>>>+unsigned long max_pages_per_node = 0xFFFFFFFF;
>>>
>>>static void __init parse_cmdline_early (char ** cmdline_p)
>>>{
>>>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>> userdef=1;
>>> }
>>> }
>>>+
>>>+ if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>+ unsigned long long node_size_bytes;
>>>+ if (to != command_line)
>>>+ to--;
>>>+ node_size_bytes = memparse(from+8, &from);
>>>+ max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>+ }
>>>
>>> if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>> if (to != command_line)
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>--- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
>>>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
>>>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>};
>>>static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>
>>>+#define chunk_start(i) (node_memory_chunk[i].start_pfn)
>>>+#define chunk_end(i) (node_memory_chunk[i].end_pfn)
>>>+#define chunk_size(i) (chunk_end(i)-chunk_start(i))
>>>+
>>>static int num_memory_chunks; /* total number of memory chunks */
>>>static int zholes_size_init;
>>>static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>> }
>>>}
>>>
>>>+extern unsigned long max_pages_per_node;
>>>+extern int limit_mem_per_node;
>>>+
>>>/* Parse the ACPI Static Resource Affinity Table */
>>>static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>{
>>>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>> node_memory_chunk[j].start_pfn,
>>> node_memory_chunk[j].end_pfn);
>>> }
>>>-
>>>+
>>> /*calculate node_start_pfn/node_end_pfn arrays*/
>>> for (nid = 0; nid < numnodes; nid++) {
>>>- int been_here_before = 0;
>>>+ unsigned long node_present_pages = 0;
>>>
>>>+ node_start_pfn[nid] = -1;
>>> for (j = 0; j < num_memory_chunks; j++){
>>>- if (node_memory_chunk[j].nid == nid) {
>>>- if (been_here_before == 0) {
>>>- node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>- been_here_before = 1;
>>>- } else { /* We've found another chunk of memory for the node */
>>>- if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>- }
>>>- }
>>>- }
>>>+ unsigned long proposed_size;
>>>+
>>>+ if (node_memory_chunk[j].nid != nid)
>>>+ continue;
>>>+
>>>+ proposed_size = node_present_pages + chunk_size(j);
>>>+ if (proposed_size > max_pages_per_node)
>>>+ chunk_end(j) = chunk_start(j) +
>>>+ max_pages_per_node - node_present_pages;
>>>+ node_present_pages += chunk_size(j);
>>>+
>>>+ if (node_start_pfn[nid] == -1)
>>>+ node_start_pfn[nid] = chunk_start(j);
>>>+ node_end_pfn[nid] = chunk_end(j);
>>> }
>>> }
>>> return 1;
>>>
>>>-
>>>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>the body of a message to [email protected]
>>>More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>Please read the FAQ at http://www.tux.org/lkml/
>>>
>>>
>>>
>>>
>>>
>>
>>
>
>
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>
>

2004-03-17 19:45:57

by Martin J. Bligh

[permalink] [raw]
Subject: Re: boot time node and memory limit options

> I agree with sizing issues at boot of hash tables. I've seen them all recover when failing to allocate based
> on num_physpages and then iterating at smaller allocations until successful. All the primary initialization allocations recover but probably not all drivers. You could have similar failure scenarios for any boot line parameter(s) implementation which reduces memory.
>> Don't we have the same arch dependant issue with the current mem= anyway?
>> Can we come up with something where the arch code calls back into a generic
>> function to derive limitations, and thereby at least get the parsing done
>> in a common routine for consistency? There aren't *that* many NUMA arches
>> to change anyway ...
>>
>>
> Well this is heading in the direction Dave has proposed and probably 2.7 material. This would really solve the problem differently than my proposed patch.

Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
for now, I still think we need to restrict memory very early on, before
anything else can allocate bootmem. Are you the absolute first thing that
ever runs in the boot allocator?

M.

> thanks,
>
> Bob
>
>> M.
>>
>>
>>
>>> Bob
>>> Martin J. Bligh wrote:
>>>
>>>
>>>
>>>> --On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>> This patch supports three boot line options. mem_limit limits the
>>>>>> amount of physical memory. node_mem_limit limits the amount of
>>>>>> physical memory per node on a NUMA machine. nodes_limit reduces the
>>>>>> number of NUMA nodes to the value specified. On a NUMA machine an
>>>>>> eliminated node's CPU(s) are removed from the cpu_possible_map.
>>>>>>
>>>>>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>> machine.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>> I think this patch will be really useful. Matt and Martin, does it look
>>>>> ok to you? Given that discontiguous support is pretty platform specific
>>>>> right now, I thought it might be less code if it was done in arch/, but
>>>>> a platform independent version is awfully nice...
>>>>>
>>>>>
>>>>>
>>>>>
>>>> I haven't looked at your code yet, but I've had a similar patch in my tree
>>>> from Dave Hansen for a while you might want to look at:
>>>>
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>> --- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
>>>> @@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>>> * function also increments numnodes with the number of nodes (quads)
>>>> * present.
>>>> */
>>>> +extern unsigned long max_pages_per_node;
>>>> +extern int limit_mem_per_node;
>>>> +
>>>> +#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>> static void __init smp_dump_qct(void)
>>>> {
>>>> int node;
>>>> @@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>>> eq->hi_shrd_mem_start - eq->priv_mem_size);
>>>> node_end_pfn[node] = MB_TO_PAGES(
>>>> eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>> + if (node_size_pages(node) > max_pages_per_node)
>>>> + node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>>> }
>>>> }
>>>> }
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>> --- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
>>>> @@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>>> probe_extension_roms(roms);
>>>> }
>>>>
>>>> -static void __init limit_regions(unsigned long long size)
>>>> +void __init limit_regions(unsigned long long size)
>>>> {
>>>> unsigned long long current_addr = 0;
>>>> int i;
>>>> @@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>>> print_memory_map(who);
>>>> } /* setup_memory_region */
>>>>
>>>> +unsigned long max_pages_per_node = 0xFFFFFFFF;
>>>>
>>>> static void __init parse_cmdline_early (char ** cmdline_p)
>>>> {
>>>> @@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>>> userdef=1;
>>>> }
>>>> }
>>>> +
>>>> + if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>> + unsigned long long node_size_bytes;
>>>> + if (to != command_line)
>>>> + to--;
>>>> + node_size_bytes = memparse(from+8, &from);
>>>> + max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>> + }
>>>>
>>>> if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>>> if (to != command_line)
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>> --- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
>>>> @@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>> };
>>>> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>>
>>>> +#define chunk_start(i) (node_memory_chunk[i].start_pfn)
>>>> +#define chunk_end(i) (node_memory_chunk[i].end_pfn)
>>>> +#define chunk_size(i) (chunk_end(i)-chunk_start(i))
>>>> +
>>>> static int num_memory_chunks; /* total number of memory chunks */
>>>> static int zholes_size_init;
>>>> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>> @@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>>> }
>>>> }
>>>>
>>>> +extern unsigned long max_pages_per_node;
>>>> +extern int limit_mem_per_node;
>>>> +
>>>> /* Parse the ACPI Static Resource Affinity Table */
>>>> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>> {
>>>> @@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>>> node_memory_chunk[j].start_pfn,
>>>> node_memory_chunk[j].end_pfn);
>>>> }
>>>> -
>>>> +
>>>> /*calculate node_start_pfn/node_end_pfn arrays*/
>>>> for (nid = 0; nid < numnodes; nid++) {
>>>> - int been_here_before = 0;
>>>> + unsigned long node_present_pages = 0;
>>>>
>>>> + node_start_pfn[nid] = -1;
>>>> for (j = 0; j < num_memory_chunks; j++){
>>>> - if (node_memory_chunk[j].nid == nid) {
>>>> - if (been_here_before == 0) {
>>>> - node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>> - node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>> - been_here_before = 1;
>>>> - } else { /* We've found another chunk of memory for the node */
>>>> - if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>> - node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>> - }
>>>> - }
>>>> - }
>>>> + unsigned long proposed_size;
>>>> +
>>>> + if (node_memory_chunk[j].nid != nid)
>>>> + continue;
>>>> +
>>>> + proposed_size = node_present_pages + chunk_size(j);
>>>> + if (proposed_size > max_pages_per_node)
>>>> + chunk_end(j) = chunk_start(j) +
>>>> + max_pages_per_node - node_present_pages;
>>>> + node_present_pages += chunk_size(j);
>>>> +
>>>> + if (node_start_pfn[nid] == -1)
>>>> + node_start_pfn[nid] = chunk_start(j);
>>>> + node_end_pfn[nid] = chunk_end(j);
>>>> }
>>>> }
>>>> return 1;
>>>>
>>>> -
>>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>> the body of a message to [email protected]
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at http://www.tux.org/lkml/
>>>>
>>>>
>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>> -
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>
>>
>>
>
>


2004-03-17 20:08:42

by Robert Picco

[permalink] [raw]
Subject: Re: boot time node and memory limit options

Martin J. Bligh wrote:

>>I agree with sizing issues at boot of hash tables. I've seen them all recover when failing to allocate based
>>on num_physpages and then iterating at smaller allocations until successful. All the primary initialization allocations recover but probably not all drivers. You could have similar failure scenarios for any boot line parameter(s) implementation which reduces memory.
>>
>>
>>>Don't we have the same arch dependant issue with the current mem= anyway?
>>>Can we come up with something where the arch code calls back into a generic
>>>function to derive limitations, and thereby at least get the parsing done
>>>in a common routine for consistency? There aren't *that* many NUMA arches
>>>to change anyway ...
>>>
>>>
>>>
>>>
>>Well this is heading in the direction Dave has proposed and probably 2.7 material. This would really solve the problem differently than my proposed patch.
>>
>>
>
>Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
>for now, I still think we need to restrict memory very early on, before
>anything else can allocate bootmem. Are you the absolute first thing that
>ever runs in the boot allocator?
>
>M.
>
>
All the machine dependent initialization code could have allocated
and/or reserved bootmem before the patch would claim additional memory
based on boot line parameters. The patch is called just before
mem_init. There aren't any pages on freelist yet because mem_init
hasn't been called. So I'm not the first thing that ever runs in the
boot allocator. I'm not sure that my answer is addressing your question?

Bob

>
>
>>thanks,
>>
>>Bob
>>
>>
>>
>>>M.
>>>
>>>
>>>
>>>
>>>
>>>>Bob
>>>>Martin J. Bligh wrote:
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <[email protected]> wrote:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>>This patch supports three boot line options. mem_limit limits the
>>>>>>>amount of physical memory. node_mem_limit limits the amount of
>>>>>>>physical memory per node on a NUMA machine. nodes_limit reduces the
>>>>>>>number of NUMA nodes to the value specified. On a NUMA machine an
>>>>>>>eliminated node's CPU(s) are removed from the cpu_possible_map.
>>>>>>>
>>>>>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>>>machine.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>I think this patch will be really useful. Matt and Martin, does it look
>>>>>>ok to you? Given that discontiguous support is pretty platform specific
>>>>>>right now, I thought it might be less code if it was done in arch/, but
>>>>>>a platform independent version is awfully nice...
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>I haven't looked at your code yet, but I've had a similar patch in my tree
>>>>>from Dave Hansen for a while you might want to look at:
>>>>>
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>>>--- 320-kcg/arch/i386/kernel/numaq.c 2003-10-01 11:47:33.000000000 -0700
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c 2004-03-14 09:54:00.000000000 -0800
>>>>>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>>>>* function also increments numnodes with the number of nodes (quads)
>>>>>* present.
>>>>>*/
>>>>>+extern unsigned long max_pages_per_node;
>>>>>+extern int limit_mem_per_node;
>>>>>+
>>>>>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>>>static void __init smp_dump_qct(void)
>>>>>{
>>>>> int node;
>>>>>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>>>> eq->hi_shrd_mem_start - eq->priv_mem_size);
>>>>> node_end_pfn[node] = MB_TO_PAGES(
>>>>> eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>>>+ if (node_size_pages(node) > max_pages_per_node)
>>>>>+ node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>>>> }
>>>>> }
>>>>>}
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>>>--- 320-kcg/arch/i386/kernel/setup.c 2004-03-11 14:33:36.000000000 -0800
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c 2004-03-14 09:54:00.000000000 -0800
>>>>>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>>>> probe_extension_roms(roms);
>>>>>}
>>>>>
>>>>>-static void __init limit_regions(unsigned long long size)
>>>>>+void __init limit_regions(unsigned long long size)
>>>>>{
>>>>> unsigned long long current_addr = 0;
>>>>> int i;
>>>>>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>>>> print_memory_map(who);
>>>>>} /* setup_memory_region */
>>>>>
>>>>>+unsigned long max_pages_per_node = 0xFFFFFFFF;
>>>>>
>>>>>static void __init parse_cmdline_early (char ** cmdline_p)
>>>>>{
>>>>>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>>>> userdef=1;
>>>>> }
>>>>> }
>>>>>+
>>>>>+ if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>>>+ unsigned long long node_size_bytes;
>>>>>+ if (to != command_line)
>>>>>+ to--;
>>>>>+ node_size_bytes = memparse(from+8, &from);
>>>>>+ max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>>>+ }
>>>>>
>>>>> if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>>>> if (to != command_line)
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>>>--- 320-kcg/arch/i386/kernel/srat.c 2003-10-01 11:47:33.000000000 -0700
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c 2004-03-14 09:54:01.000000000 -0800
>>>>>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>>>};
>>>>>static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>>>
>>>>>+#define chunk_start(i) (node_memory_chunk[i].start_pfn)
>>>>>+#define chunk_end(i) (node_memory_chunk[i].end_pfn)
>>>>>+#define chunk_size(i) (chunk_end(i)-chunk_start(i))
>>>>>+
>>>>>static int num_memory_chunks; /* total number of memory chunks */
>>>>>static int zholes_size_init;
>>>>>static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>>>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>>>> }
>>>>>}
>>>>>
>>>>>+extern unsigned long max_pages_per_node;
>>>>>+extern int limit_mem_per_node;
>>>>>+
>>>>>/* Parse the ACPI Static Resource Affinity Table */
>>>>>static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>>>{
>>>>>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>>>> node_memory_chunk[j].start_pfn,
>>>>> node_memory_chunk[j].end_pfn);
>>>>> }
>>>>>-
>>>>>+
>>>>> /*calculate node_start_pfn/node_end_pfn arrays*/
>>>>> for (nid = 0; nid < numnodes; nid++) {
>>>>>- int been_here_before = 0;
>>>>>+ unsigned long node_present_pages = 0;
>>>>>
>>>>>+ node_start_pfn[nid] = -1;
>>>>> for (j = 0; j < num_memory_chunks; j++){
>>>>>- if (node_memory_chunk[j].nid == nid) {
>>>>>- if (been_here_before == 0) {
>>>>>- node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>>>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>>>- been_here_before = 1;
>>>>>- } else { /* We've found another chunk of memory for the node */
>>>>>- if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>>>- node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>>>- }
>>>>>- }
>>>>>- }
>>>>>+ unsigned long proposed_size;
>>>>>+
>>>>>+ if (node_memory_chunk[j].nid != nid)
>>>>>+ continue;
>>>>>+
>>>>>+ proposed_size = node_present_pages + chunk_size(j);
>>>>>+ if (proposed_size > max_pages_per_node)
>>>>>+ chunk_end(j) = chunk_start(j) +
>>>>>+ max_pages_per_node - node_present_pages;
>>>>>+ node_present_pages += chunk_size(j);
>>>>>+
>>>>>+ if (node_start_pfn[nid] == -1)
>>>>>+ node_start_pfn[nid] = chunk_start(j);
>>>>>+ node_end_pfn[nid] = chunk_end(j);
>>>>> }
>>>>> }
>>>>> return 1;
>>>>>
>>>>>-
>>>>>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>>>the body of a message to [email protected]
>>>>>More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>>>Please read the FAQ at http://www.tux.org/lkml/
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>>
>>>>
>>>-
>>>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>>the body of a message to [email protected]
>>>More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>Please read the FAQ at http://www.tux.org/lkml/
>>>
>>>
>>>
>>>
>>>
>>
>>
>
>
>
>

2004-03-17 20:53:09

by Dave Hansen

[permalink] [raw]
Subject: Re: boot time node and memory limit options

On Wed, 2004-03-17 at 11:44, Martin J. Bligh wrote:
> Yes ... that's looking very 2.7-ish to reorganise all that stuff.
> However, for now, I still think we need to restrict memory very early
> on, before anything else can allocate bootmem. Are you the absolute
> first thing that ever runs in the boot allocator?

I definitely agree with the 2.7 target for what I posted. We can do it
cleanly in 2.7, but for now, I think the best solution is to do it
in each architecture. Partly because it's the way that we already do
mem=, plus I'm not sure the boot allocator code will work with all
architectures, at least ppc64.

It's probably an oversight in the implementation (of the early ppc64
boot code), but there is some correlation required with things
like lmb_end_of_DRAM() and how much memory is being used by the mm
structures. I've played with it a bit, and I _think_ that you would be
required to modify the lmb structures, even with Robert's bootmem patch.

I could be wrong, so can somebody test it on a NUMA ppc64 machine?

Also, it may have been discussed before, but does the bootmem patch have
any applicability to the 32-bit NUMA platforms? It looks like it just
deals with ZONE_DMA.

-- dave

2004-03-17 20:58:28

by Martin J. Bligh

[permalink] [raw]
Subject: Re: boot time node and memory limit options

>> Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
>> for now, I still think we need to restrict memory very early on, before
>> anything else can allocate bootmem. Are you the absolute first thing that
>> ever runs in the boot allocator?
>>
>> M.
>>
>>
> All the machine dependent initialization code could have allocated and/or reserved bootmem before the patch would claim additional memory based on boot line parameters. The patch is called just before mem_init. There aren't any pages on the freelist yet because mem_init hasn't been called. So I'm not the first thing that ever runs in the boot allocator. Does that answer your question?

You are, but it's not the answer I want ;-) If you can allocate stuff out
of bootmem that should have been barred by the limiter, I think that's
a bad idea ... you should be restricting earlier, IMHO.

M.