From: Haicheng Li <[email protected]>
The NUMA hotplug emulator needs to hide memory regions at the very
beginning of kernel boot. The emulator will then use these
memory regions to fake offlined NUMA nodes.
CC: Yinghai Lu <[email protected]>
Signed-off-by: Haicheng Li <[email protected]>
Signed-off-by: Shaohui Zheng <[email protected]>
---
arch/x86/include/asm/e820.h | 1 +
arch/x86/kernel/e820.c | 19 ++++++++++++++++++-
2 files changed, 19 insertions(+), 1 deletions(-)
Index: linux-hpe4/arch/x86/include/asm/e820.h
===================================================================
--- linux-hpe4.orig/arch/x86/include/asm/e820.h 2010-11-15 17:13:02.483461667 +0800
+++ linux-hpe4/arch/x86/include/asm/e820.h 2010-11-15 17:13:07.083461581 +0800
@@ -129,6 +129,7 @@
extern void e820_register_active_regions(int nid, unsigned long start_pfn,
unsigned long end_pfn);
extern u64 e820_hole_size(u64 start, u64 end);
+extern u64 e820_hide_mem(u64 mem_size);
extern void finish_e820_parsing(void);
extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
Index: linux-hpe4/arch/x86/kernel/e820.c
===================================================================
--- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
+++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
@@ -971,6 +971,7 @@
}
static int userdef __initdata;
+static u64 max_mem_size __initdata = ULLONG_MAX;
/* "mem=nopentium" disables the 4MB page tables. */
static int __init parse_memopt(char *p)
@@ -989,12 +990,28 @@
userdef = 1;
mem_size = memparse(p, &p);
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+ e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
+ max_mem_size = mem_size;
return 0;
}
early_param("mem", parse_memopt);
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+u64 __init e820_hide_mem(u64 mem_size)
+{
+ u64 start, end_pfn;
+
+ userdef = 1;
+ end_pfn = e820_end_of_ram_pfn();
+ start = (end_pfn << PAGE_SHIFT) - mem_size;
+ e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
+ max_mem_size = start;
+
+ return start;
+}
+#endif
+
static int __init parse_memmap_opt(char *p)
{
char *oldp;
--
Thanks & Regards,
Shaohui
On Wed, 17 Nov 2010, [email protected] wrote:
> Index: linux-hpe4/arch/x86/kernel/e820.c
> ===================================================================
> --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> @@ -971,6 +971,7 @@
> }
>
> static int userdef __initdata;
> +static u64 max_mem_size __initdata = ULLONG_MAX;
>
> /* "mem=nopentium" disables the 4MB page tables. */
> static int __init parse_memopt(char *p)
> @@ -989,12 +990,28 @@
>
> userdef = 1;
> mem_size = memparse(p, &p);
> - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> + max_mem_size = mem_size;
>
> return 0;
> }
This needs memmap= support as well, right?
> early_param("mem", parse_memopt);
>
> +#ifdef CONFIG_NODE_HOTPLUG_EMU
> +u64 __init e820_hide_mem(u64 mem_size)
> +{
> + u64 start, end_pfn;
> +
> + userdef = 1;
> + end_pfn = e820_end_of_ram_pfn();
> + start = (end_pfn << PAGE_SHIFT) - mem_size;
> + e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
> + max_mem_size = start;
> +
> + return start;
> +}
> +#endif
This doesn't have any sanity checking for whether e820_remove_range() will
leave any significant amount of memory behind so the kernel will even boot
(probably should have a guaranteed FAKE_NODE_MIN_SIZE left behind?).
> +
> static int __init parse_memmap_opt(char *p)
> {
> char *oldp;
On Wed, Nov 17, 2010 at 12:16:34AM -0800, David Rientjes wrote:
> On Wed, 17 Nov 2010, [email protected] wrote:
>
> > Index: linux-hpe4/arch/x86/kernel/e820.c
> > ===================================================================
> > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> > @@ -971,6 +971,7 @@
> > }
> >
> > static int userdef __initdata;
> > +static u64 max_mem_size __initdata = ULLONG_MAX;
> >
> > /* "mem=nopentium" disables the 4MB page tables. */
> > static int __init parse_memopt(char *p)
> > @@ -989,12 +990,28 @@
> >
> > userdef = 1;
> > mem_size = memparse(p, &p);
> > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> > + max_mem_size = mem_size;
> >
> > return 0;
> > }
>
> This needs memmap= support as well, right?
We did not test combining the memmap and numa=hide parameters.
I think the result should be similar to mem=XX, since they both remove a memory
region from the e820 table.
>
> > early_param("mem", parse_memopt);
> >
> > +#ifdef CONFIG_NODE_HOTPLUG_EMU
> > +u64 __init e820_hide_mem(u64 mem_size)
> > +{
> > + u64 start, end_pfn;
> > +
> > + userdef = 1;
> > + end_pfn = e820_end_of_ram_pfn();
> > + start = (end_pfn << PAGE_SHIFT) - mem_size;
> > + e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
> > + max_mem_size = start;
> > +
> > + return start;
> > +}
> > +#endif
>
> This doesn't have any sanity checking for whether e820_remove_range() will
> leave any significant amount of memory behind so the kernel will even boot
> (probably should have a guaranteed FAKE_NODE_MIN_SIZE left behind?).
It should not be checked here; it should be checked by the function that calls
e820_hide_mem, and that caller should truncate mem_size with FAKE_NODE_MIN_SIZE.
>
> > +
> > static int __init parse_memmap_opt(char *p)
> > {
> > char *oldp;
--
Thanks & Regards,
Shaohui
On Thu, 18 Nov 2010, Shaohui Zheng wrote:
> > > Index: linux-hpe4/arch/x86/kernel/e820.c
> > > ===================================================================
> > > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> > > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> > > @@ -971,6 +971,7 @@
> > > }
> > >
> > > static int userdef __initdata;
> > > +static u64 max_mem_size __initdata = ULLONG_MAX;
> > >
> > > /* "mem=nopentium" disables the 4MB page tables. */
> > > static int __init parse_memopt(char *p)
> > > @@ -989,12 +990,28 @@
> > >
> > > userdef = 1;
> > > mem_size = memparse(p, &p);
> > > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> > > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> > > + max_mem_size = mem_size;
> > >
> > > return 0;
> > > }
> >
> > This needs memmap= support as well, right?
> we did not do the testing after combine both memmap and numa=hide paramter,
> I think that the result should similar with mem=XX, they both remove a memory
> region from the e820 table.
>
You've modified the parser for mem= but not memmap= so the change needs
additional support for the latter.
> > > early_param("mem", parse_memopt);
> > >
> > > +#ifdef CONFIG_NODE_HOTPLUG_EMU
> > > +u64 __init e820_hide_mem(u64 mem_size)
> > > +{
> > > + u64 start, end_pfn;
> > > +
> > > + userdef = 1;
> > > + end_pfn = e820_end_of_ram_pfn();
> > > + start = (end_pfn << PAGE_SHIFT) - mem_size;
> > > + e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
> > > + max_mem_size = start;
> > > +
> > > + return start;
> > > +}
> > > +#endif
> >
> > This doesn't have any sanity checking for whether e820_remove_range() will
> > leave any significant amount of memory behind so the kernel will even boot
> > (probably should have a guaranteed FAKE_NODE_MIN_SIZE left behind?).
>
> it should not be checked here, it should be checked by the function who call
> e820_hide_mem, and truncate the mem_size with FAKE_NODE_MIN_SIZE.
>
Your patchset doesn't do that, I'm talking specifically about the amount
of memory left behind so that the kernel at least still boots. That seems
to be a function of e820_hide_mem() to do some sanity checking so we
actually still get a kernel rather than the responsibility of the
command-line parser.
On Thu, Nov 18, 2010 at 01:16:07PM -0800, David Rientjes wrote:
> On Thu, 18 Nov 2010, Shaohui Zheng wrote:
>
> > > > Index: linux-hpe4/arch/x86/kernel/e820.c
> > > > ===================================================================
> > > > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> > > > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> > > > @@ -971,6 +971,7 @@
> > > > }
> > > >
> > > > static int userdef __initdata;
> > > > +static u64 max_mem_size __initdata = ULLONG_MAX;
> > > >
> > > > /* "mem=nopentium" disables the 4MB page tables. */
> > > > static int __init parse_memopt(char *p)
> > > > @@ -989,12 +990,28 @@
> > > >
> > > > userdef = 1;
> > > > mem_size = memparse(p, &p);
> > > > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> > > > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> > > > + max_mem_size = mem_size;
> > > >
> > > > return 0;
> > > > }
> > >
> > > This needs memmap= support as well, right?
> > we did not do the testing after combine both memmap and numa=hide paramter,
> > I think that the result should similar with mem=XX, they both remove a memory
> > region from the e820 table.
> >
>
> You've modified the parser for mem= but not memmap= so the change needs
> additional support for the latter.
>
The parser for mem= is not modified; the changed parser is numa=, to which I added an additional
option, numa=hide=.
From the current discussion, the numa=hide= interface should be removed, and we will use mem=
to hide memory.
> > > > early_param("mem", parse_memopt);
> > > >
> > > > +#ifdef CONFIG_NODE_HOTPLUG_EMU
> > > > +u64 __init e820_hide_mem(u64 mem_size)
> > > > +{
> > > > + u64 start, end_pfn;
> > > > +
> > > > + userdef = 1;
> > > > + end_pfn = e820_end_of_ram_pfn();
> > > > + start = (end_pfn << PAGE_SHIFT) - mem_size;
> > > > + e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
> > > > + max_mem_size = start;
> > > > +
> > > > + return start;
> > > > +}
> > > > +#endif
> > >
> > > This doesn't have any sanity checking for whether e820_remove_range() will
> > > leave any significant amount of memory behind so the kernel will even boot
> > > (probably should have a guaranteed FAKE_NODE_MIN_SIZE left behind?).
> >
> > it should not be checked here, it should be checked by the function who call
> > e820_hide_mem, and truncate the mem_size with FAKE_NODE_MIN_SIZE.
> >
>
> Your patchset doesn't do that, I'm talking specifically about the amount
> of memory left behind so that the kernel at least still boots. That seems
> to be a function of e820_hide_mem() to do some sanity checking so we
> actually still get a kernel rather than the responsibility of the
> command-line parser.
How much memory is enough to make sure the kernel can still boot? It is very
hard to measure, and it is almost impossible to get exact data. I tried leaving very
little memory to the kernel (hiding most memory with numa=hide), and it caused a panic directly.
I have no idea about it; do you have any suggestions?
Another example:
I tried adding the parameter "mem=1M", and it complained "Select item can not fit into memory".
I did not find where the error message comes from; I guess that it is
printed by grub.
On Fri, 19 Nov 2010, Shaohui Zheng wrote:
> > > > > Index: linux-hpe4/arch/x86/kernel/e820.c
> > > > > ===================================================================
> > > > > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> > > > > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> > > > > @@ -971,6 +971,7 @@
> > > > > }
> > > > >
> > > > > static int userdef __initdata;
> > > > > +static u64 max_mem_size __initdata = ULLONG_MAX;
> > > > >
> > > > > /* "mem=nopentium" disables the 4MB page tables. */
> > > > > static int __init parse_memopt(char *p)
> > > > > @@ -989,12 +990,28 @@
> > > > >
> > > > > userdef = 1;
> > > > > mem_size = memparse(p, &p);
> > > > > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> > > > > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> > > > > + max_mem_size = mem_size;
> > > > >
> > > > > return 0;
> > > > > }
> > > >
> > > > This needs memmap= support as well, right?
> > > we did not do the testing after combine both memmap and numa=hide paramter,
> > > I think that the result should similar with mem=XX, they both remove a memory
> > > region from the e820 table.
> > >
> >
> > You've modified the parser for mem= but not memmap= so the change needs
> > additional support for the latter.
> >
>
> the parser for mem= is not modified, the changed parser is numa=, I add a addtional
> option numa=hide=.
>
The above hunk is modifying the x86 parser for the mem= parameter.
> > Your patchset doesn't do that, I'm talking specifically about the amount
> > of memory left behind so that the kernel at least still boots. That seems
> > to be a function of e820_hide_mem() to do some sanity checking so we
> > actually still get a kernel rather than the responsibility of the
> > command-line parser.
>
> How much memory is enough to make sure the kernel can still boot, it is very
> hard to measure. it is almost impossible to get the exact data. I try to leave very
> few memory to kernel(hide most memory with numa=hide), it cause a panic directly.
>
> I have no idea about it, do you have any suggestions?
>
Yes, I think we should use FAKE_NODE_MIN_SIZE to represent the smallest
node that may be added and so the appropriate behavior of e820_hide_mem()
would be to leave at least this quantity behind for the kernel to be
loaded.
On Sat, Nov 20, 2010 at 04:45:06PM -0800, David Rientjes wrote:
>On Fri, 19 Nov 2010, Shaohui Zheng wrote:
>
>> > > > > Index: linux-hpe4/arch/x86/kernel/e820.c
>> > > > > ===================================================================
>> > > > > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
>> > > > > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
>> > > > > @@ -971,6 +971,7 @@
>> > > > > }
>> > > > >
>> > > > > static int userdef __initdata;
>> > > > > +static u64 max_mem_size __initdata = ULLONG_MAX;
>> > > > >
>> > > > > /* "mem=nopentium" disables the 4MB page tables. */
>> > > > > static int __init parse_memopt(char *p)
>> > > > > @@ -989,12 +990,28 @@
>> > > > >
>> > > > > userdef = 1;
>> > > > > mem_size = memparse(p, &p);
>> > > > > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
>> > > > > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
>> > > > > + max_mem_size = mem_size;
>> > > > >
>> > > > > return 0;
>> > > > > }
>> > > >
>> > > > This needs memmap= support as well, right?
>> > > we did not do the testing after combine both memmap and numa=hide paramter,
>> > > I think that the result should similar with mem=XX, they both remove a memory
>> > > region from the e820 table.
>> > >
>> >
>> > You've modified the parser for mem= but not memmap= so the change needs
>> > additional support for the latter.
>> >
>>
>> the parser for mem= is not modified, the changed parser is numa=, I add a addtional
>> option numa=hide=.
>>
>
>The above hunk is modifying the x86 parser for the mem= parameter.
>
That is fine as long as "mem=" is parsed before "numa=".
I think "mem=" should always be parsed before "numa=" no matter what
order they are specified in on the cmdline, since we first need to know how much total
memory we have.
Thanks.
On Sun, 21 Nov 2010, Américo Wang wrote:
> >> > > > > Index: linux-hpe4/arch/x86/kernel/e820.c
> >> > > > > ===================================================================
> >> > > > > --- linux-hpe4.orig/arch/x86/kernel/e820.c 2010-11-15 17:13:02.483461667 +0800
> >> > > > > +++ linux-hpe4/arch/x86/kernel/e820.c 2010-11-15 17:13:07.083461581 +0800
> >> > > > > @@ -971,6 +971,7 @@
> >> > > > > }
> >> > > > >
> >> > > > > static int userdef __initdata;
> >> > > > > +static u64 max_mem_size __initdata = ULLONG_MAX;
> >> > > > >
> >> > > > > /* "mem=nopentium" disables the 4MB page tables. */
> >> > > > > static int __init parse_memopt(char *p)
> >> > > > > @@ -989,12 +990,28 @@
> >> > > > >
> >> > > > > userdef = 1;
> >> > > > > mem_size = memparse(p, &p);
> >> > > > > - e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
> >> > > > > + e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
> >> > > > > + max_mem_size = mem_size;
> >> > > > >
> >> > > > > return 0;
> >> > > > > }
> >> > > >
> >> > > > This needs memmap= support as well, right?
> >> > > we did not do the testing after combine both memmap and numa=hide paramter,
> >> > > I think that the result should similar with mem=XX, they both remove a memory
> >> > > region from the e820 table.
> >> > >
> >> >
> >> > You've modified the parser for mem= but not memmap= so the change needs
> >> > additional support for the latter.
> >> >
> >>
> >> the parser for mem= is not modified, the changed parser is numa=, I add a addtional
> >> option numa=hide=.
> >>
> >
> >The above hunk is modifying the x86 parser for the mem= parameter.
> >
>
> That is fine as long as "mem=" is parsed before "numa=".
>
If you'll read the discussion, I had no problem with modifying the mem
parser. I merely suggested that Shaohui modify the memmap parser in the
same way to save max_mem_size so users can use it as well for the hidden
nodes, that are now obsolete. Apparently that was misunderstood by both
of you although it looks pretty clear above, I dunno.