2012-06-11 08:40:35

by Wen Congyang

[permalink] [raw]
Subject: [PATCH 1/2 v2] x86: add max_addr boot option

Currently, the boot option max_addr is only supported on ia64 platform.
We also need it on x86 platform.
For example:
There are two nodes:
NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
If we only want to use node0, we can specify the max_addr. The boot
option "mem=" can do the same thing now. But the boot option "mem="
means the total memory used by the system. If we tell the user
that the boot option "mem=" can do this, it will confuse the user.
So we need a new boot option "max_addr" on the x86 platform.

Signed-off-by: Wen Congyang <[email protected]>
---
Documentation/kernel-parameters.txt | 2 +-
arch/x86/kernel/e820.c | 36 +++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a92c5eb..034609d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1441,7 +1441,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
yeeloong laptop.
Example: machtype=lemote-yeeloong-2f-7inch

- max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
+ max_addr=nn[KMG] [KNL,BOOT,ia64,X86] All physical memory greater
than or equal to this physical address is ignored.

maxcpus= [SMP] Maximum number of processors that an SMP kernel
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4185797..cd07226 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
+static u64 max_addr = ~0ULL;

/*
* This function checks if any part of the range <start,end> is mapped
@@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
return;
}

+ if (start >= max_addr) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)(start + size - 1));
+ return;
+ }
+
+ if (max_addr - start < size) {
+ printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long)max_addr,
+ (unsigned long long)(start + size - 1));
+ size = max_addr - start;
+ }
+
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
@@ -835,6 +850,22 @@ static int __init parse_memopt(char *p)
}
early_param("mem", parse_memopt);

+static int __init parse_memmax_opt(char *p)
+{
+ char *oldp;
+
+ if (!p)
+ return -EINVAL;
+
+ oldp = p;
+ max_addr = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ return 0;
+}
+early_param("max_addr", parse_memmax_opt);
+
static int __init parse_memmap_opt(char *p)
{
char *oldp;
@@ -881,6 +912,11 @@ early_param("memmap", parse_memmap_opt);

void __init finish_e820_parsing(void)
{
+ if (max_addr != ~0ULL) {
+ userdef = 1;
+ e820_remove_range(max_addr, ULLONG_MAX - max_addr, E820_RAM, 1);
+ }
+
if (userdef) {
u32 nr = e820.nr_map;

--
1.7.1


2012-06-11 08:42:01

by Wen Congyang

[permalink] [raw]
Subject: [PATCH 2/2 v2] x86: reimplement mem boot option

The boot option "mem=" specifies the total memory that the system can
use. But we implement it as max_addr.

An x86 system can be booted by EFI. If the user specifies the boot
option "add_efi_memmap", we add all memory map entries from EFI, but we
do not handle the memory map according to the boot option "mem=".

This patch reimplements the boot option "mem=", and handles the memory
map after calling efi_init().

Signed-off-by: Wen Congyang <[email protected]>
---
arch/x86/include/asm/e820.h | 1 +
arch/x86/kernel/e820.c | 36 +++++++++++++++++++++++++++++++-----
arch/x86/kernel/setup.c | 1 +
3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3778256..d1bb772 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -127,6 +127,7 @@ extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
+extern void set_memlimit(void);

/*
* Returns true iff the specified range [s,e) is completely contained inside
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cd07226..b234b25 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -48,6 +48,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
EXPORT_SYMBOL(pci_mem_start);
#endif
static u64 max_addr = ~0ULL;
+static u64 mem_limit = ~0ULL;

/*
* This function checks if any part of the range <start,end> is mapped
@@ -824,8 +825,6 @@ static int userdef __initdata;
/* "mem=nopentium" disables the 4MB page tables. */
static int __init parse_memopt(char *p)
{
- u64 mem_size;
-
if (!p)
return -EINVAL;

@@ -840,16 +839,43 @@ static int __init parse_memopt(char *p)
}

userdef = 1;
- mem_size = memparse(p, &p);
+ mem_limit = memparse(p, &p);
/* don't remove all of memory when handling "mem={invalid}" param */
- if (mem_size == 0)
+ if (mem_limit == 0)
return -EINVAL;
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);

return 0;
}
early_param("mem", parse_memopt);

+void __init set_memlimit(void)
+{
+ u64 total_size = 0;
+ int i;
+
+ if (mem_limit == ~0ULL)
+ return;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type != E820_RAM)
+ continue;
+
+ if (total_size >= mem_limit) {
+ memset(ei, 0, sizeof(struct e820entry));
+ continue;
+ }
+
+ if (mem_limit - total_size <= ei->size)
+ ei->size = mem_limit - total_size;
+
+ total_size += ei->size;
+ }
+
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
static int __init parse_memmax_opt(char *p)
{
char *oldp;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 16be6dc..a3c4ac3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -815,6 +815,7 @@ void __init setup_arch(char **cmdline_p)

if (efi_enabled)
efi_init();
+ set_memlimit();

dmi_scan_machine();

--
1.7.1

2012-06-11 17:35:37

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On Mon, Jun 11, 2012 at 1:44 AM, Wen Congyang <[email protected]> wrote:
> Currently, the boot option max_addr is only supported on ia64 platform.
> We also need it on x86 platform.
> For example:
> There are two nodes:
> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
> If we only want to use node0, we can specify the max_addr. The boot
> option "mem=" can do the same thing now. But the boot option "mem="
> means the total memory used by the system. If we tell the user
> that the boot option "mem=" can do this, it will confuse the user.
> So we need an new boot option "max_addr" on x86 platform.

I don't object to this patch (and thanks for tweaking the mem range printk).

I don't know what your use case is, but from a user interface
perspective, the "max_addr=" option feels like a bit of a hack. If
you're trying to avoid use of other nodes, "max_addr" is an awkward
way to do it. It requires the user to know the physical address ->
node mappings, and it doesn't affect the CPUs and I/O resources on
other nodes. You could implement a "numa_node=" or similar parameter
that would allow you to ignore remote memory, CPUs, and I/O.

> Signed-off-by: Wen Congyang <[email protected]>
> ---
>  Documentation/kernel-parameters.txt |    2 +-
>  arch/x86/kernel/e820.c              |   36 +++++++++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+), 1 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index a92c5eb..034609d 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1441,7 +1441,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> ? ? ? ? ? ? ? ? ? ? ? ? yeeloong laptop.
> ? ? ? ? ? ? ? ? ? ? ? ?Example: machtype=lemote-yeeloong-2f-7inch
>
> - ? ? ? max_addr=nn[KMG] ? ? ? ?[KNL,BOOT,ia64] All physical memory greater
> + ? ? ? max_addr=nn[KMG] ? ? ? ?[KNL,BOOT,ia64,X86] All physical memory greater
> ? ? ? ? ? ? ? ? ? ? ? ?than or equal to this physical address is ignored.
>
> ? ? ? ?maxcpus= ? ? ? ?[SMP] Maximum number of processors that an SMP kernel
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 4185797..cd07226 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
> ?#ifdef CONFIG_PCI
> ?EXPORT_SYMBOL(pci_mem_start);
> ?#endif
> +static u64 max_addr = ~0ULL;
>
> ?/*
> ?* This function checks if any part of the range <start,end> is mapped
> @@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?}
>
> + ? ? ? if (start >= max_addr) {
> + ? ? ? ? ? ? ? printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)start,
> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)(start + size - 1));
> + ? ? ? ? ? ? ? return;
> + ? ? ? }
> +
> + ? ? ? if (max_addr - start < size) {
> + ? ? ? ? ? ? ? printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)max_addr,
> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)(start + size - 1));
> + ? ? ? ? ? ? ? size = max_addr - start;
> + ? ? ? }
> +
> ? ? ? ?e820x->map[x].addr = start;
> ? ? ? ?e820x->map[x].size = size;
> ? ? ? ?e820x->map[x].type = type;
> @@ -835,6 +850,22 @@ static int __init parse_memopt(char *p)
> ?}
> ?early_param("mem", parse_memopt);
>
> +static int __init parse_memmax_opt(char *p)
> +{
> + ? ? ? char *oldp;
> +
> + ? ? ? if (!p)
> + ? ? ? ? ? ? ? return -EINVAL;
> +
> + ? ? ? oldp = p;
> + ? ? ? max_addr = memparse(p, &p);
> + ? ? ? if (p == oldp)
> + ? ? ? ? ? ? ? return -EINVAL;
> +
> + ? ? ? return 0;
> +}
> +early_param("max_addr", parse_memmax_opt);
> +
> ?static int __init parse_memmap_opt(char *p)
> ?{
> ? ? ? ?char *oldp;
> @@ -881,6 +912,11 @@ early_param("memmap", parse_memmap_opt);
>
> ?void __init finish_e820_parsing(void)
> ?{
> + ? ? ? if (max_addr != ~0ULL) {
> + ? ? ? ? ? ? ? userdef = 1;
> + ? ? ? ? ? ? ? e820_remove_range(max_addr, ULLONG_MAX - max_addr, E820_RAM, 1);
> + ? ? ? }
> +
> ? ? ? ?if (userdef) {
> ? ? ? ? ? ? ? ?u32 nr = e820.nr_map;
>
> --
> 1.7.1
>

2012-06-11 21:15:20

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On 06/11/2012 01:44 AM, Wen Congyang wrote:
> Currently, the boot option max_addr is only supported on ia64 platform.
> We also need it on x86 platform.
> For example:
> There are two nodes:
> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
> If we only want to use node0, we can specify the max_addr. The boot
> option "mem=" can do the same thing now. But the boot option "mem="
> means the total memory used by the system. If we tell the user
> that the boot option "mem=" can do this, it will confuse the user.
> So we need an new boot option "max_addr" on x86 platform.
>

I fail to see what this does that cannot be done with the
since-long-existing memmap= option. Could you address why memmap=
doesn't match your needs?

-hpa

2012-06-12 06:22:10

by Wen Congyang

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

At 06/12/2012 05:15 AM, H. Peter Anvin Wrote:
> On 06/11/2012 01:44 AM, Wen Congyang wrote:
>> Currently, the boot option max_addr is only supported on ia64 platform.
>> We also need it on x86 platform.
>> For example:
>> There are two nodes:
>> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
>> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
>> If we only want to use node0, we can specify the max_addr. The boot
>> option "mem=" can do the same thing now. But the boot option "mem="
>> means the total memory used by the system. If we tell the user
>> that the boot option "mem=" can do this, it will confuse the user.
>> So we need an new boot option "max_addr" on x86 platform.
>>
>
> I fail to see what this does that cannot be done with the
> since-long-existing memmap= option. Could you address why memmap=
> doesn't match your needs?

The memmap= option is very difficult to use. The end user needs to know the memory
map of the system. The end user can get the max address of NODE#0, but he
may not know the memory map for NODE#0. If the end user gives the wrong memory
map, the kernel cannot boot. For example: I added memmap=16G@0 to the kernel
parameters, and the kernel could not boot. The max_addr option is easier to use.

Thanks
Wen Congyang

>
> -hpa
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2012-06-12 06:25:43

by Wen Congyang

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

At 06/12/2012 01:35 AM, Bjorn Helgaas Wrote:
> On Mon, Jun 11, 2012 at 1:44 AM, Wen Congyang <[email protected]> wrote:
>> Currently, the boot option max_addr is only supported on ia64 platform.
>> We also need it on x86 platform.
>> For example:
>> There are two nodes:
>> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
>> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
>> If we only want to use node0, we can specify the max_addr. The boot
>> option "mem=" can do the same thing now. But the boot option "mem="
>> means the total memory used by the system. If we tell the user
>> that the boot option "mem=" can do this, it will confuse the user.
>> So we need an new boot option "max_addr" on x86 platform.
>
> I don't object to this patch (and thanks for tweaking the mem range printk).
>
> I don't know what your use case is, but from a user interface
> perspective, the "max_addr=" option feels like a bit of a hack. If
> you're trying to avoid use of other nodes, "max_addr" is an awkward
> way to do it. It requires the user to know the physical address ->
> node mappings, and it doesn't affect the CPUs and I/O resources on
> other nodes. You could implement a "numa_node=" or similar parameter
> that would allow you to ignore remote memory, CPUs, and I/O.

Currently, I only need to ignore the memory. If we need to ignore a node,
"numa_node=" or similar parameter is a better choice.

Thanks
Wen Congyang

>
>> Signed-off-by: Wen Congyang <[email protected]>
>> ---
>> Documentation/kernel-parameters.txt | 2 +-
>> arch/x86/kernel/e820.c | 36 +++++++++++++++++++++++++++++++++++
>> 2 files changed, 37 insertions(+), 1 deletions(-)
>>
>> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
>> index a92c5eb..034609d 100644
>> --- a/Documentation/kernel-parameters.txt
>> +++ b/Documentation/kernel-parameters.txt
>> @@ -1441,7 +1441,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>> yeeloong laptop.
>> Example: machtype=lemote-yeeloong-2f-7inch
>>
>> - max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
>> + max_addr=nn[KMG] [KNL,BOOT,ia64,X86] All physical memory greater
>> than or equal to this physical address is ignored.
>>
>> maxcpus= [SMP] Maximum number of processors that an SMP kernel
>> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
>> index 4185797..cd07226 100644
>> --- a/arch/x86/kernel/e820.c
>> +++ b/arch/x86/kernel/e820.c
>> @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
>> #ifdef CONFIG_PCI
>> EXPORT_SYMBOL(pci_mem_start);
>> #endif
>> +static u64 max_addr = ~0ULL;
>>
>> /*
>> * This function checks if any part of the range <start,end> is mapped
>> @@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
>> return;
>> }
>>
>> + if (start >= max_addr) {
>> + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
>> + (unsigned long long)start,
>> + (unsigned long long)(start + size - 1));
>> + return;
>> + }
>> +
>> + if (max_addr - start < size) {
>> + printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
>> + (unsigned long long)max_addr,
>> + (unsigned long long)(start + size - 1));
>> + size = max_addr - start;
>> + }
>> +
>> e820x->map[x].addr = start;
>> e820x->map[x].size = size;
>> e820x->map[x].type = type;
>> @@ -835,6 +850,22 @@ static int __init parse_memopt(char *p)
>> }
>> early_param("mem", parse_memopt);
>>
>> +static int __init parse_memmax_opt(char *p)
>> +{
>> + char *oldp;
>> +
>> + if (!p)
>> + return -EINVAL;
>> +
>> + oldp = p;
>> + max_addr = memparse(p, &p);
>> + if (p == oldp)
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +early_param("max_addr", parse_memmax_opt);
>> +
>> static int __init parse_memmap_opt(char *p)
>> {
>> char *oldp;
>> @@ -881,6 +912,11 @@ early_param("memmap", parse_memmap_opt);
>>
>> void __init finish_e820_parsing(void)
>> {
>> + if (max_addr != ~0ULL) {
>> + userdef = 1;
>> + e820_remove_range(max_addr, ULLONG_MAX - max_addr, E820_RAM, 1);
>> + }
>> +
>> if (userdef) {
>> u32 nr = e820.nr_map;
>>
>> --
>> 1.7.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2012-06-12 11:30:57

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On Mon, Jun 11, 2012 at 11:29 PM, Wen Congyang <[email protected]> wrote:
> At 06/12/2012 01:35 AM, Bjorn Helgaas Wrote:
>> On Mon, Jun 11, 2012 at 1:44 AM, Wen Congyang <[email protected]> wrote:
>>> Currently, the boot option max_addr is only supported on ia64 platform.
>>> We also need it on x86 platform.
>>> For example:
>>> There are two nodes:
>>> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
>>> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
>>> If we only want to use node0, we can specify the max_addr. The boot
>>> option "mem=" can do the same thing now. But the boot option "mem="
>>> means the total memory used by the system. If we tell the user
>>> that the boot option "mem=" can do this, it will confuse the user.
>>> So we need an new boot option "max_addr" on x86 platform.
>>
>> I don't object to this patch (and thanks for tweaking the mem range printk).
>>
>> I don't know what your use case is, but from a user interface
>> perspective, the "max_addr=" option feels like a bit of a hack.  If
>> you're trying to avoid use of other nodes, "max_addr" is an awkward
>> way to do it.  It requires the user to know the physical address ->
>> node mappings, and it doesn't affect the CPUs and I/O resources on
>> other nodes.  You could implement a "numa_node=" or similar parameter
>> that would allow you to ignore remote memory, CPUs, and I/O.
>
> Currently, I only need to ignore the memory. If we need to ignore a node,
> "numa_node=" or similar parameter is a better choice.

Doesn't the end user have to know the memory map of the system to use
"max_addr="? How do you know what value to supply? Do you have to
attempt a boot once to discover the highest address on node 0? What
if node 0 and node 1 memory are interleaved, so there's some node 1
memory below the highest node 0 address?

>>> Signed-off-by: Wen Congyang <[email protected]>
>>> ---
>>> ?Documentation/kernel-parameters.txt | ? ?2 +-
>>> ?arch/x86/kernel/e820.c ? ? ? ? ? ? ?| ? 36 +++++++++++++++++++++++++++++++++++
>>> ?2 files changed, 37 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
>>> index a92c5eb..034609d 100644
>>> --- a/Documentation/kernel-parameters.txt
>>> +++ b/Documentation/kernel-parameters.txt
>>> @@ -1441,7 +1441,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>>> ? ? ? ? ? ? ? ? ? ? ? ? yeeloong laptop.
>>> ? ? ? ? ? ? ? ? ? ? ? ?Example: machtype=lemote-yeeloong-2f-7inch
>>>
>>> - ? ? ? max_addr=nn[KMG] ? ? ? ?[KNL,BOOT,ia64] All physical memory greater
>>> + ? ? ? max_addr=nn[KMG] ? ? ? ?[KNL,BOOT,ia64,X86] All physical memory greater
>>> ? ? ? ? ? ? ? ? ? ? ? ?than or equal to this physical address is ignored.
>>>
>>> ? ? ? ?maxcpus= ? ? ? ?[SMP] Maximum number of processors that an SMP kernel
>>> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
>>> index 4185797..cd07226 100644
>>> --- a/arch/x86/kernel/e820.c
>>> +++ b/arch/x86/kernel/e820.c
>>> @@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
>>> ?#ifdef CONFIG_PCI
>>> ?EXPORT_SYMBOL(pci_mem_start);
>>> ?#endif
>>> +static u64 max_addr = ~0ULL;
>>>
>>> ?/*
>>> ?* This function checks if any part of the range <start,end> is mapped
>>> @@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
>>> ? ? ? ? ? ? ? ?return;
>>> ? ? ? ?}
>>>
>>> + ? ? ? if (start >= max_addr) {
>>> + ? ? ? ? ? ? ? printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
>>> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)start,
>>> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)(start + size - 1));
>>> + ? ? ? ? ? ? ? return;
>>> + ? ? ? }
>>> +
>>> + ? ? ? if (max_addr - start < size) {
>>> + ? ? ? ? ? ? ? printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
>>> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)max_addr,
>>> + ? ? ? ? ? ? ? ? ? ? ?(unsigned long long)(start + size - 1));
>>> + ? ? ? ? ? ? ? size = max_addr - start;
>>> + ? ? ? }
>>> +
>>> ? ? ? ?e820x->map[x].addr = start;
>>> ? ? ? ?e820x->map[x].size = size;
>>> ? ? ? ?e820x->map[x].type = type;
>>> @@ -835,6 +850,22 @@ static int __init parse_memopt(char *p)
>>> ?}
>>> ?early_param("mem", parse_memopt);
>>>
>>> +static int __init parse_memmax_opt(char *p)
>>> +{
>>> + ? ? ? char *oldp;
>>> +
>>> + ? ? ? if (!p)
>>> + ? ? ? ? ? ? ? return -EINVAL;
>>> +
>>> + ? ? ? oldp = p;
>>> + ? ? ? max_addr = memparse(p, &p);
>>> + ? ? ? if (p == oldp)
>>> + ? ? ? ? ? ? ? return -EINVAL;
>>> +
>>> + ? ? ? return 0;
>>> +}
>>> +early_param("max_addr", parse_memmax_opt);
>>> +
>>> ?static int __init parse_memmap_opt(char *p)
>>> ?{
>>> ? ? ? ?char *oldp;
>>> @@ -881,6 +912,11 @@ early_param("memmap", parse_memmap_opt);
>>>
>>> ?void __init finish_e820_parsing(void)
>>> ?{
>>> + ? ? ? if (max_addr != ~0ULL) {
>>> + ? ? ? ? ? ? ? userdef = 1;
>>> + ? ? ? ? ? ? ? e820_remove_range(max_addr, ULLONG_MAX - max_addr, E820_RAM, 1);
>>> + ? ? ? }
>>> +
>>> ? ? ? ?if (userdef) {
>>> ? ? ? ? ? ? ? ?u32 nr = e820.nr_map;
>>>
>>> --
>>> 1.7.1
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>

2012-06-12 16:10:34

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

If what you care about is nodes, why not have an option to specify a map?

Wen Congyang <[email protected]> wrote:

>At 06/12/2012 05:15 AM, H. Peter Anvin Wrote:
>> On 06/11/2012 01:44 AM, Wen Congyang wrote:
>>> Currently, the boot option max_addr is only supported on ia64
>platform.
>>> We also need it on x86 platform.
>>> For example:
>>> There are two nodes:
>>> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
>>> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
>>> If we only want to use node0, we can specify the max_addr. The boot
>>> option "mem=" can do the same thing now. But the boot option "mem="
>>> means the total memory used by the system. If we tell the user
>>> that the boot option "mem=" can do this, it will confuse the user.
>>> So we need an new boot option "max_addr" on x86 platform.
>>>
>>
>> I fail to see what this does that cannot be done with the
>> since-long-existing memmap= option. Could you address why memmap=
>> doesn't match your needs?
>
>The memmap= option is very diffcult to use. The end user should know
>the memory
>map in the system. The end user can get the max address of NODE#0, but
>he
>may not know the memory map for NODE#0. If the end user give the wrong
>memory
>map, the kernel can not boot. For example: I add memmap=16G@0 in the
>kernel
>parameter, and the kernel cannot boot. The max_addr is more easier to
>use.
>
>Thanks
>Wen Congyang
>
>>
>> -hpa
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe
>linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2012-06-13 01:58:09

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

(2012/06/12 20:30), Bjorn Helgaas wrote:
> On Mon, Jun 11, 2012 at 11:29 PM, Wen Congyang<[email protected]> wrote:
>> At 06/12/2012 01:35 AM, Bjorn Helgaas Wrote:
>>> On Mon, Jun 11, 2012 at 1:44 AM, Wen Congyang<[email protected]> wrote:
>>>> Currently, the boot option max_addr is only supported on ia64 platform.
>>>> We also need it on x86 platform.
>>>> For example:
>>>> There are two nodes:
>>>> NODE#0 address range 0x00000000 00000000 - 0x00010000 00000000
>>>> NODE#1 address range 0x00010000 00000000 - 0x00020000 00000000
>>>> If we only want to use node0, we can specify the max_addr. The boot
>>>> option "mem=" can do the same thing now. But the boot option "mem="
>>>> means the total memory used by the system. If we tell the user
>>>> that the boot option "mem=" can do this, it will confuse the user.
>>>> So we need an new boot option "max_addr" on x86 platform.
>>>
>>> I don't object to this patch (and thanks for tweaking the mem range printk).
>>>
>>> I don't know what your use case is, but from a user interface
>>> perspective, the "max_addr=" option feels like a bit of a hack. If
>>> you're trying to avoid use of other nodes, "max_addr" is an awkward
>>> way to do it. It requires the user to know the physical address ->
>>> node mappings, and it doesn't affect the CPUs and I/O resources on
>>> other nodes. You could implement a "numa_node=" or similar parameter
>>> that would allow you to ignore remote memory, CPUs, and I/O.
>>
>> Currently, I only need to ignore the memory. If we need to ignore a node,
>> "numa_node=" or similar parameter is a better choice.
>
> Doesn't the end user have to know the memory map of the system to use
> "max_addr="? How do you know what value to supply? Do you have to
> attempt a boot once to discover the highest address on node 0? What
> if node 0 and node 1 memory are interleaved, so there's some node 1
> memory below the highest node 0 address?
>

Currently, our plan is to avoid asking end users to fix their boot options by hand
even if the memory size per node changes. We'll ship hardware which has a
_fixed_ physical address range for each node regardless of the equipped memory size.
The addresses will be written in the hardware manual, or we'll ship some tool with the hardware.
Of course, we disable interleaving between nodes.

IIUC, the memory layout can change because the hardware error detection logic can
turn off a DIMM before boot. So, if we use memmap=, which requires precise knowledge
of the memory map, the system admin needs to modify it whenever the problem happens.

Problem happens => reboot (disable some DIMM) => remove memmap= option to avoid
trouble => check memory layout again => fix memmap= => reboot again.
These reboots take a long time because systems which have dynamic partitioning tend to
be big... so we'd like to have some _relaxed_ way to specify the region of memory.

Problem happens => reboot (disable some DIMM) => no changes required
(because we have enough memory hole between Node0 and Node1.)

BTW, what do you think about the mem= boot option, which currently works as max_addr=?
This has sometimes caused trouble at our support desk, for example:
Q. I specified the mem=8G boot option but it seems the system has only 7GB....
A. It's because of the PCI configuration area in the 3G-4G address range...

Even if our requirement can be covered by the current mem= option, I'd like to have
a max_addr= option and make the mem= option as sane as it is on ia64.

Thanks,
-Kame

2012-06-13 02:23:58

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

(2012/06/13 1:10), H. Peter Anvin wrote:
> If what you care about is nodes, why not have an option to specify a map?
>

At the start of our project, we discussed passing the node ID as a boot option.
But we found that the node ID is determined solely by the PXM ID order in the SRAT. That means
we and our firmware team would need to take care of the order of the SRAT. But that node-ID
vs. SRAT relationship is determined only by the implementation; there is no spec, so
we thought we couldn't rely on it in the future.

The second idea was to specify the PXM. But with a hardware-partitioning system,
a dynamic implementation of the SRAT for a partition is very confusing...

Then, the alternative idea was to use the mem= boot option. Because our partitioned system
has a fixed address range for each node, it works well.

But now we know the mem= boot option is buggy... it acts as a max_addr= option, and we
are concerned that "someone may fix the mem= option to be as sane as ia64's, because it's buggy".

We'd like to fix the mem= boot option ourselves and preserve the old behavior with a max_addr=
boot option, which ia64 already has.

Thanks,
-Kame

2012-06-13 03:30:11

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On 06/12/2012 07:21 PM, Kamezawa Hiroyuki wrote:
>
> But now, we know mem= boot option is buggy....it acts as max_addr=
> option, we have concerns that 'someone may fix mem= option as sane as ia64. because
> it's buggy".
>
> We'd like to fix mem= boot option by ourselves and preserve old behavior
> with max_addr= boot option, which ia64 has.
>

Now I'm *really* confused.

Realistically, there is no point in the old mem= behavior of assuming a
contiguous chunk of memory up to that point; it simply doesn't match how
modern hardware is constructed. Your notion that ia64 is "sane" is
probably more of "outdated" in my opinion.

As such, the current behavior for mem= seems like the right thing and
the change was intentional (not to mention has been in place since
kernel 2.5.65, back in 2003); it also solves your requirements. If you
are concerned about it, it would make more sense to make sure it is
documented as intentional.

In fact, it looks like IA64 introduced a divergence when the max_addr=
patch was introduced in 2004. You're basically proposing the same
divergence for x86 now; talk about having the tail wag the dog.

Sorry. NAK.

-hpa

2012-06-13 05:23:03

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

(2012/06/13 12:29), H. Peter Anvin wrote:
> On 06/12/2012 07:21 PM, Kamezawa Hiroyuki wrote:
>>
>> But now, we know mem= boot option is buggy....it acts as max_addr=
>> option, we have concerns that 'someone may fix mem= option as sane as ia64. because
>> it's buggy".
>>
>> We'd like to fix mem= boot option by ourselves and preserve old behavior
>> with max_addr= boot option, which ia64 has.
>>
>
> Now I'm *really* confused.
>
> Realistically, there is no point in the old mem= behavior of assuming a
> contiguous chunk of memory up to that point; it simply doesn't match how
> modern hardware is constructed. Your notion that ia64 is "sane" is
> probably more of "outdated" in my opinion.
>
> As such, the current behavior for mem= seems like the right thing and
> the change was intentional (not to mention has been in place since
> kernel 2.5.65, back in 2003); it also solves your requirements. If you
> are concerned about it, it would make more sense to make sure it is
> documented as intentional.
>
> In fact, it looks like IA64 introduced a divergence when the max_addr=
> patch was introduced in 2004. You're basically proposing the same
> divergence for x86 now; talk about having the tail wag the dog.
>
> Sorry. NAK.
>

Hmm, then, is it ok to post a patch for fixing kernel-param

mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
[X86-32] Use together with memmap= to avoid physical
address space collisions. Without memmap= PCI devices
could be placed at addresses belonging to unused RAM.

to explain that it works as a limit on the max address, and to implement the current mem= behavior
in the x86-64/efi code?

Thanks,
-Kame



2012-06-13 05:54:42

by Wen Congyang

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

At 06/13/2012 11:29 AM, H. Peter Anvin Wrote:
> On 06/12/2012 07:21 PM, Kamezawa Hiroyuki wrote:
>>
>> But now, we know mem= boot option is buggy....it acts as max_addr=
>> option, we have concerns that 'someone may fix mem= option as sane as ia64. because
>> it's buggy".
>>
>> We'd like to fix mem= boot option by ourselves and preserve old behavior
>> with max_addr= boot option, which ia64 has.
>>
>
> Now I'm *really* confused.
>
> Realistically, there is no point in the old mem= behavior of assuming a
> contiguous chunk of memory up to that point; it simply doesn't match how
> modern hardware is constructed. Your notion that ia64 is "sane" is
> probably more of "outdated" in my opinion.
>
> As such, the current behavior for mem= seems like the right thing and
> the change was intentional (not to mention has been in place since
> kernel 2.5.65, back in 2003); it also solves your requirements. If you
> are concerned about it, it would make more sense to make sure it is
> documented as intentional.


Here is the document(Documentation/kernel-parameters.txt):

mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory

The implementation of mem= on ia64 matches the description in the document, but
the implementation of mem= on x86 does not match the description.

Now, which should we fix: the document or the implementation?

Another problem is that mem= does not work if the user specifies the add_efi_memmap
option. I think we should also fix this problem.

Thanks
Wen Congyang
>
> In fact, it looks like IA64 introduced a divergence when the max_addr=
> patch was introduced in 2004. You're basically proposing the same
> divergence for x86 now; talk about having the tail wag the dog.
>
> Sorry. NAK.
>
> -hpa
>
>

2012-06-13 14:23:59

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On 06/12/2012 08:55 PM, Kamezawa Hiroyuki wrote:
>>> Currently, I only need to ignore the memory. If we need to ignore a
>>> node,
>>> "numa_node=" or similar parameter is a better choice.
>>
>> Doesn't the end user have to know the memory map of the system to use
>> "max_addr="? How do you know what value to supply? Do you have to
>> attempt a boot once to discover the highest address on node 0? What
>> if node 0 and node 1 memory are interleaved, so there's some node 1
>> memory below the highest node 0 address?
>
> Current our plan is to avoid asking end-user to fix their boot option by
> hand even if memory size per node is changed. We'll ship a hardware, which has
> _fixed_ physical address range per each node regardless of equipped
> memory size.

I.E. you'll be configuring this yourself when you ship hardware.

You're adding an option because you consider it less confusing for your
end users who are digging into kernel parameters, but you will set this
new option for your users because they haven't got the information to
set it themselves?

> Problem happens => reboot (disable some DIMM) => remove memmap= option
> for avoiding
> trouble => check memory layout again =>fix mem_map= => reboot again.
> This reboot takes much time because the system which have
> Dynamic-partitioning tends to
> be big....so, we'd like to have some _relaxed_ way to specify the region
> of memory.
>
> Problem happens => reboot (disable some DIMM) => no changes required
> (because we have enough memory hole between Node0 and Node1.)

I'm guessing the above means "or you'll be providing some tool that does
it when they install/remove memory in the hardware"...

> BTW, how do you think about mem= boot option which works as max_addr=,
> now ?
> This caused troubles some times on our support-desk, saying
> Q. I specified mem=8G boot option but it seems the system has only 7GB....
> A. it's because of PCI configuration area on 3G-4G address range...

So you're saying there are already two ways to do this, but you want to
add a third to be less confusing for end users who are modifying the
linux kernel boot parameters by hand using information only you can
supply to them?

I'm confused...

> Even if our requirement can be covered current mem= option, I'd like to
> have max_addr= option and make mem= option to be sane as ia64.

"sane as ia64".

Ok, I've read that phrase five times and the words still don't fit together.

I'm going to admit defeat on my attempt to understand this thread, and
move on...

Rob
--
GNU/Linux isn't: Linux=GPLv2, GNU=GPLv3+, they can't share code.
Either it's "mere aggregation", or a license violation. Pick one.

2012-06-14 02:09:28

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

(2012/06/13 13:59), Rob Landley wrote:
> On 06/12/2012 08:55 PM, Kamezawa Hiroyuki wrote:
>>>> Currently, I only need to ignore the memory. If we need to ignore a
>>>> node,
>>>> "numa_node=" or similar parameter is a better choice.
>>>
>>> Doesn't the end user have to know the memory map of the system to use
>>> "max_addr="? How do you know what value to supply? Do you have to
>>> attempt a boot once to discover the highest address on node 0? What
>>> if node 0 and node 1 memory are interleaved, so there's some node 1
>>> memory below the highest node 0 address?
>>
>> Current our plan is to avoid asking end-user to fix their boot option by
>> hand even if memory size per node is changed. We'll ship a hardware, which has
>> _fixed_ physical address range per each node regardless of equipped
>> memory size.
>
> I.E. you'll be configuring this yourself when you ship hardware.
>

yes.

> You're adding an option because you consider it less confusing for your
> end users who are digging into kernel parameters, but you will set this
> new option for your users because they haven't got the information to
> set it themselves?
>

My users don't need to know about hardware settings and the meaning of
kernel params. They'll just do as we ask to do.


>> Problem happens => reboot (disable some DIMM) => remove memmap= option
>> for avoiding
>> trouble => check memory layout again =>fix mem_map= => reboot again.
>> This reboot takes much time because the system which have
>> Dynamic-partitioning tends to
>> be big....so, we'd like to have some _relaxed_ way to specify the region
>> of memory.
>>
>> Problem happens => reboot (disable some DIMM) => no changes required
>> (because we have enough memory hole between Node0 and Node1.)
>
> I'm guessing the above means "or you'll be providing some tool that does
> it when they install/remove memory in the hardware"...
>
>> BTW, how do you think about mem= boot option which works as max_addr=,
>> now ?
>> This caused troubles some times on our support-desk, saying
>> Q. I specified mem=8G boot option but it seems the system has only 7GB....
>> A. it's because of PCI configuration area on 3G-4G address range...
>
> So you're saying there are already two ways to do this, but you want to
> add a third to be less confusing for end users who are modifying the
> linux kernel boot parameters by hand using information only you can
> supply to them?
>
> I'm confused...
>

I'm just saying the current mem= implementation seems buggy because the spec. and
impl. don't match. So, we're just afraid that someone other than us
will fix it and break our assumption about how mem= works. It's dangerous to
build a production system on a feature where the spec. and impl. don't match.
So, we proposed adding the max_addr= option to avoid that situation.

Reading the thread, it seems the maintainers say that the current 'mem=' has been working
for many years and is not buggy; it works as expected. It seems no one
will be able to change the implementation. Then, we're okay with using mem=.
We'll just implement the mem= option for the efi environment, and try to fix the spec.
if possible.

Thanks,
-Kame








2012-06-14 20:00:39

by Rob Landley

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] x86: add max_addr boot option

On 06/13/2012 09:06 PM, Kamezawa Hiroyuki wrote:
>> You're adding an option because you consider it less confusing for your
>> end users who are digging into kernel parameters, but you will set this
>> new option for your users because they haven't got the information to
>> set it themselves?
>>
>
> My users don't need to know about hardware settings and the meaning of
> kernel params. They'll just do as we ask to do.

So you're adding a new feature that only you will use, because the
existing way of doing it confuses... you.

>> So you're saying there are already two ways to do this, but you want to
>> add a third to be less confusing for end users who are modifying the
>> linux kernel boot parameters by hand using information only you can
>> supply to them?
>>
>> I'm confused...
>>
>
> I'm just saying current mem= implemenation seems buggy because spec. and
> impl. doesn't match. So, we're just afraid that someone other than us
> will fix it and break our assumption how mem= works. It's dangerous to
> build a production on a feature where spec. and impl. doesn't match.
> So, we proposed to add max_addr= option for avoiding that situation.

So fix the spec, or fix the implementation. Don't add a random new
duplicate way to do the same thing because you're afraid that open
source code might change, but somehow the new code you propose to add
won't (presumably due to being so profoundly uninteresting to the rest
of the world that nobody will notice it's there).

Sigh. On arm you can go "mem=size@start", which can be repeated. I.E.
you can, on the kernel command line, tell it where all the chunks of
physical memory it should use actually live. Letting x86 do that might
be nice. Adding a clipping option to the normal memory probing, so
memory probing has to fail in a certain specific way in order for this
to even apply? Not so much...

Rob
--
GNU/Linux isn't: Linux=GPLv2, GNU=GPLv3+, they can't share code.
Either it's "mere aggregation", or a license violation. Pick one.