This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.
The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.
Signed-off-by: Bernhard Walle <[email protected]>
---
arch/x86/kernel/e820_64.c | 3 ++-
arch/x86/kernel/efi_64.c | 3 ++-
arch/x86/kernel/mpparse.c | 5 +++--
arch/x86/mm/init_64.c | 17 ++++++++++++-----
include/asm-x86/proto.h | 2 +-
5 files changed, 20 insertions(+), 10 deletions(-)
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
continue;
printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
}
}
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
void __init efi_reserve_bootmem(void)
{
reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
+ memmap.nr_map * memmap.desc_size,
+ BOOTMEM_DEFAULT);
}
void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -729,10 +729,11 @@ static int __init smp_scan_config(unsign
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
if (mpf->mpf_physptr)
reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
return 1;
}
Files a/arch/x86/kernel/setup_64.o and b/arch/x86/kernel/setup_64.o differ
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
+ int ret;
if (pfn >= end_pfn) {
/*
@@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return -EFAULT;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -823,9 +824,13 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
+ if (ret != 0)
+ return ret;
+
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
@@ -834,6 +839,8 @@ void __init reserve_bootmem_generic(unsi
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
Files a/arch/x86/mm/init_64.o and b/arch/x86/mm/init_64.o differ
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
extern void ia32_cstar_target(void);
extern void ia32_sysenter_target(void);
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
extern void syscall32_cpu_init(void);
On Sun, Jun 08, 2008 at 03:46:30PM +0200, Bernhard Walle wrote:
>This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
>already has been added in reserve_bootmem() with commit
>72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
>
>It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
>change the behaviour. Since the change is x86-specific, I don't think it's
>necessary to add a new API for migration. There are only 4 users of that
>function.
>
>The change is necessary for the next patch, using reserve_bootmem_generic()
>for crashkernel reservation.
>
>
>Signed-off-by: Bernhard Walle <[email protected]>
>
>---
> arch/x86/kernel/e820_64.c | 3 ++-
> arch/x86/kernel/efi_64.c | 3 ++-
> arch/x86/kernel/mpparse.c | 5 +++--
> arch/x86/mm/init_64.c | 17 ++++++++++++-----
> include/asm-x86/proto.h | 2 +-
> 5 files changed, 20 insertions(+), 10 deletions(-)
>
>--- a/arch/x86/kernel/e820_64.c
>+++ b/arch/x86/kernel/e820_64.c
>@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
> continue;
> printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
> final_start, final_end - 1, r->name);
>- reserve_bootmem_generic(final_start, final_end - final_start);
>+ reserve_bootmem_generic(final_start, final_end - final_start,
>+ BOOTMEM_DEFAULT);
> }
> }
>
>--- a/arch/x86/kernel/efi_64.c
>+++ b/arch/x86/kernel/efi_64.c
>@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
> void __init efi_reserve_bootmem(void)
> {
> reserve_bootmem_generic((unsigned long)memmap.phys_map,
>- memmap.nr_map * memmap.desc_size);
>+ memmap.nr_map * memmap.desc_size,
>+ BOOTMEM_DEFAULT);
> }
Just one comment.
Since 'reserve_bootmem_generic' is changed from 'void' to 'int',
we should check its return value for failure when possible, right?
--
Hi, I'm a .signature virus, please copy/paste me to help me spread
all over the world.
* WANG Cong <[email protected]> [2008-06-08 22:26]:
>
> Since 'reserve_bootmem_generic' is changed from 'void' to 'int',
> we should check its return value for failure when possible, right?
That may make sense here, but that's unrelated to my change.
Just because the error *can* be caught by checking the return value
doesn't mean that it *must* be caught always. It was silently ignored
before in the efi_reserve_bootmem() function before, and so is it now.
No behaviour change.
Bernhard
--
Bernhard Walle, SUSE LINUX Products GmbH, Architecture Maintenance
Hi,
Bernhard Walle <[email protected]> writes:
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
> }
> #endif
>
> -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
> +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
> {
> #ifdef CONFIG_NUMA
> int nid, next_nid;
> #endif
> unsigned long pfn = phys >> PAGE_SHIFT;
> + int ret;
>
> if (pfn >= end_pfn) {
> /*
> @@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
> * firmware tables:
> */
> if (pfn < max_pfn_mapped)
> - return;
> + return -EFAULT;
This seemed to be `just do nothing' behaviour. Wouldn't 0 be more
correct here? Or something else so there is a difference between the
path that does not print a warning (the one below) and the path that
does?
>
> printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
> phys, len);
> - return;
> + return -EFAULT;
> }
Hannes
Hi,
Bernhard Walle <[email protected]> writes:
> This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
> already has been added in reserve_bootmem() with commit
> 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
>
> It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
> change the behaviour. Since the change is x86-specific, I don't think it's
> necessary to add a new API for migration. There are only 4 users of that
> function.
>
> The change is necessary for the next patch, using reserve_bootmem_generic()
> for crashkernel reservation.
>
>
> Signed-off-by: Bernhard Walle <[email protected]>
>
> ---
> arch/x86/kernel/e820_64.c | 3 ++-
> arch/x86/kernel/efi_64.c | 3 ++-
> arch/x86/kernel/mpparse.c | 5 +++--
> arch/x86/mm/init_64.c | 17 ++++++++++++-----
> include/asm-x86/proto.h | 2 +-
> 5 files changed, 20 insertions(+), 10 deletions(-)
>
> --- a/arch/x86/kernel/e820_64.c
> +++ b/arch/x86/kernel/e820_64.c
> @@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
> continue;
> printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
> final_start, final_end - 1, r->name);
> - reserve_bootmem_generic(final_start, final_end - final_start);
> + reserve_bootmem_generic(final_start, final_end - final_start,
> + BOOTMEM_DEFAULT);
> }
> }
>
> --- a/arch/x86/kernel/efi_64.c
> +++ b/arch/x86/kernel/efi_64.c
> @@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
> void __init efi_reserve_bootmem(void)
> {
> reserve_bootmem_generic((unsigned long)memmap.phys_map,
> - memmap.nr_map * memmap.desc_size);
> + memmap.nr_map * memmap.desc_size,
> + BOOTMEM_DEFAULT);
> }
>
> void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
> --- a/arch/x86/kernel/mpparse.c
> +++ b/arch/x86/kernel/mpparse.c
> @@ -729,10 +729,11 @@ static int __init smp_scan_config(unsign
> if (!reserve)
> return 1;
>
> - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
> + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
> + BOOTMEM_DEFAULT);
> if (mpf->mpf_physptr)
> reserve_bootmem_generic(mpf->mpf_physptr,
> - PAGE_SIZE);
> + PAGE_SIZE, BOOTMEM_DEFAULT);
> #endif
> return 1;
> }
> Files a/arch/x86/kernel/setup_64.o and b/arch/x86/kernel/setup_64.o differ
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
> }
> #endif
>
> -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
> +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
> {
> #ifdef CONFIG_NUMA
> int nid, next_nid;
> #endif
> unsigned long pfn = phys >> PAGE_SHIFT;
> + int ret;
>
> if (pfn >= end_pfn) {
> /*
> @@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
> * firmware tables:
> */
> if (pfn < max_pfn_mapped)
> - return;
> + return -EFAULT;
>
> printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
> phys, len);
> - return;
> + return -EFAULT;
> }
>
> /* Should check here against the e820 map to avoid double free */
> @@ -823,9 +824,13 @@ void __init reserve_bootmem_generic(unsi
> nid = phys_to_nid(phys);
> next_nid = phys_to_nid(phys + len - 1);
> if (nid == next_nid)
> - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
> + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
> else
> - reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
> + ret = reserve_bootmem(phys, len, flags);
> +
> + if (ret != 0)
> + return ret;
> +
> #else
> reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
^^^^^^^^^^^^^^^ flags?
And you ignore the return value here.
Hannes
On Mon, Jun 09, 2008 at 12:01:15AM +0200, Johannes Weiner wrote:
> Hi,
>
> Bernhard Walle <[email protected]> writes:
>
> > --- a/arch/x86/mm/init_64.c
> > +++ b/arch/x86/mm/init_64.c
> > @@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
> > }
> > #endif
> >
> > -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
> > +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
> > {
> > #ifdef CONFIG_NUMA
> > int nid, next_nid;
> > #endif
> > unsigned long pfn = phys >> PAGE_SHIFT;
> > + int ret;
> >
> > if (pfn >= end_pfn) {
> > /*
> > @@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
> > * firmware tables:
> > */
> > if (pfn < max_pfn_mapped)
> > - return;
> > + return -EFAULT;
>
> This seemed to be `just do nothing' behaviour. Wouldn't 0 be more
> correct here? Or something else so there is a difference between the
> path that does not print a warning (the one below) and the path that
> does?
Bernard,
This is interesting. IIUC, end_pfn represents end of physical RAM and
max_pfn_mapped represents, end of other tables like ACPI which are
mapped in higher regions.
Kdump first kernel always tries to reserve just physical RAM and nothing
else. So I am not sure what does above code do. Try to reserve a memory
which is not RAM but is in the region less than highest mapped entity and
in that case return silently without any warning. In what case do we
exercise this path?
Thanks
Vivek
* Vivek Goyal [2008-06-09 09:22]:
>
> Kdump first kernel always tries to reserve just physical RAM and nothing
> else. So I am not sure what does above code do. Try to reserve a memory
> which is not RAM but is in the region less than highest mapped entity and
> in that case return silently without any warning. In what case do we
> exercise this path?
I don't know. That code has been introduced in
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=5e58a02a8f6a7a1c9ae41f39286bcd3aea0d6f24
Ccing Andi.
IMO we should not print any warning in that function, leaving the error
handling to the caller.
Bernhard
--
Bernhard Walle, SUSE LINUX Products GmbH, Architecture Maintenance
* Johannes Weiner [2008-06-09 00:01]:
>
> > /*
> > @@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
> > * firmware tables:
> > */
> > if (pfn < max_pfn_mapped)
> > - return;
> > + return -EFAULT;
>
> This seemed to be `just do nothing' behaviour. Wouldn't 0 be more
> correct here? Or something else so there is a difference between the
> path that does not print a warning (the one below) and the path that
> does?
Well, I don't think that we should return success when memory
allocation fails. For kdump, I think if the memory has not been
reserved, then the function should failed, for whatever reason it
failed. Because we cannot load the crashkernel.
So IMO the code should look like
[...]
int ret;
if (pfn >= end_pfn)
return -EFAULT;
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
nid = phys_to_nid(phys);
[...]
Bernhard
--
Bernhard Walle, SUSE LINUX Products GmbH, Architecture Maintenance
* Johannes Weiner [2008-06-09 00:06]:
>
> > +
> > #else
> > reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
> ^^^^^^^^^^^^^^^ flags?
> And you ignore the return value here.
Thanks for catching that. Updated patch:
------------------------------------------------------------------
From: Bernhard Walle <[email protected]>
Subject: Add 'flags' parameter to reserve_bootmem_generic()
This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.
The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.
Signed-off-by: Bernhard Walle <[email protected]>
---
arch/x86/kernel/e820_64.c | 3 ++-
arch/x86/kernel/efi_64.c | 3 ++-
arch/x86/kernel/mpparse.c | 5 +++--
arch/x86/mm/init_64.c | 19 +++++++++++++------
include/asm-x86/proto.h | 2 +-
5 files changed, 21 insertions(+), 11 deletions(-)
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
continue;
printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
}
}
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
void __init efi_reserve_bootmem(void)
{
reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
+ memmap.nr_map * memmap.desc_size,
+ BOOTMEM_DEFAULT);
}
void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -729,10 +729,11 @@ static int __init smp_scan_config(unsign
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
if (mpf->mpf_physptr)
reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
return 1;
}
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
+ int ret;
if (pfn >= end_pfn) {
/*
@@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return -EFAULT;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -823,17 +824,23 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
#else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
#endif
+ if (ret != 0)
+ return ret;
+
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
extern void ia32_cstar_target(void);
extern void ia32_sysenter_target(void);
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
extern void syscall32_cpu_init(void);
Bernhard Walle wrote:
> * Vivek Goyal [2008-06-09 09:22]:
>> Kdump first kernel always tries to reserve just physical RAM and nothing
>> else. So I am not sure what does above code do. Try to reserve a memory
>> which is not RAM but is in the region less than highest mapped entity and
>> in that case return silently without any warning. In what case do we
>> exercise this path?
>
> I don't know. That code has been introduced in
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=5e58a02a8f6a7a1c9ae41f39286bcd3aea0d6f24
>
> Ccing Andi.
>
> IMO we should not print any warning in that function, leaving the error
> handling to the caller.
Don't remember the details. Perhaps Amul does (cc'ed)
-Andi
* Amul Shah <[email protected]> [2008-06-09 15:50]:
>
> > Don't remember the details. Perhaps Amul does (cc'ed)
> >
> > -Andi
> >
>
> The short story is that the kexec kernel was panicking when trying to
> reserve the MP tables. The panic occurs because the MP tables resided
> in a reserved memory area above the highest address (80MB phys at that
> time) in the user defined E820 map used by the kexec kernel.
>
> I had placed my code to affect only MP table reservation (see patch
> below) because it is unique to just that code path. Andi decided a
> generalized approach would be better in case other vendors had similar
> issues.
Ok, in that case it makes indeed sense to just return success here.
Here's my third version of that patch:
-------------------------------------------------------------------------
From: Bernhard Walle <[email protected]>
Subject: Add 'flags' parameter to reserve_bootmem_generic()
This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.
The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.
Signed-off-by: Bernhard Walle <[email protected]>
---
arch/x86/kernel/e820_64.c | 3 ++-
arch/x86/kernel/efi_64.c | 3 ++-
arch/x86/kernel/mpparse.c | 5 +++--
arch/x86/mm/init_64.c | 19 +++++++++++++------
include/asm-x86/proto.h | 2 +-
5 files changed, 21 insertions(+), 11 deletions(-)
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
continue;
printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
}
}
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
void __init efi_reserve_bootmem(void)
{
reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
+ memmap.nr_map * memmap.desc_size,
+ BOOTMEM_DEFAULT);
}
void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -729,10 +729,11 @@ static int __init smp_scan_config(unsign
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
if (mpf->mpf_physptr)
reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
return 1;
}
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -798,12 +798,13 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
+ int ret;
if (pfn >= end_pfn) {
/*
@@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return 0;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -823,17 +824,23 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
#else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
#endif
+ if (ret != 0)
+ return ret;
+
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
extern void ia32_cstar_target(void);
extern void ia32_sysenter_target(void);
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
extern void syscall32_cpu_init(void);
On Mon, 2008-06-09 at 18:39 +0200, Andi Kleen wrote:
> Bernhard Walle wrote:
> > * Vivek Goyal [2008-06-09 09:22]:
> >> Kdump first kernel always tries to reserve just physical RAM and nothing
> >> else. So I am not sure what does above code do. Try to reserve a memory
> >> which is not RAM but is in the region less than highest mapped entity and
> >> in that case return silently without any warning. In what case do we
> >> exercise this path?
> >
> > I don't know. That code has been introduced in
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=5e58a02a8f6a7a1c9ae41f39286bcd3aea0d6f24
> >
> > Ccing Andi.
> >
> > IMO we should not print any warning in that function, leaving the error
> > handling to the caller.
>
> Don't remember the details. Perhaps Amul does (cc'ed)
>
> -Andi
>
The short story is that the kexec kernel was panicking when trying to
reserve the MP tables. The panic occurs because the MP tables resided
in a reserved memory area above the highest address (80MB phys at that
time) in the user defined E820 map used by the kexec kernel.
I had placed my code to affect only MP table reservation (see patch
below) because it is unique to just that code path. Andi decided a
generalized approach would be better in case other vendors had similar
issues.
Vivek asked if I was using a user defined memory map for the kexec
kernel. I was using one, but the top of memory was being defined as
80MB physical (end_pfn). The "exactmap" option parsing is clobbering
the variable end_pfn_map. I suggested using the saved_pfn_map variable.
In the end Andi's patch was the best, so it stuck.
I took a quick look at the current code base and it would still panic
when reserving the MP table. The function smp_scan_config does the
reservation. I did not track down how the BUG_ON in
reserve_bootmem_core corresponds to end_pfn.
thanks,
Amul
Here is my original email (http://lkml.org/lkml/2006/11/2/285):
The kdump crash kernel panics when it tries to reserve the MP Config
tables on an ES7000.
The MP Config table is located above 1MB of physical memory in a
reserved memory area. It is located outside the first 1MB area because
the tables are too large, 240k.
The crash kernel is given a user defined memory map with E820 reserved
and ACPI areas passed in by kexec tools and a usable area from 16MB
physical to 80MB physical. This user defined map causes the top of
memory to be set as 80MB.
The ACPI tables and MP Tables reside higher in memory. When reserving
memory with reserve_bootmem_generic, the function has a BUG panic if the
memory location to reserve is above the top of memory. The MP table is
above the top of memory in a user defined memory map.
This patch will ignore reserving the MP tables if the MP table resides
in an area already reserved in the E820.
I have two alternate patches that accomplish the same effect if this
patch is not acceptable.
1. avoid reserving the MP tables if a user defined memory map or if a
user defined memory limit ("mem=") is used.
2. avoid reserving the MP tables if a kernel parameter is passed in to
ignore MP table reservation.
diff -Naur linux-2.6.19-rc4/arch/x86_64/kernel/e820.c linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/e820.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/e820.c 2006-11-02 17:56:01.000000000 -0500
@@ -351,6 +351,53 @@
}
}
+int __init e820_reserved(unsigned long target_phys)
+{
+ int i;
+ unsigned long section_begin_phys, section_end_phys;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ // if it is usable memory, ignore it
+ if (e820.map[i].type == E820_RAM )
+ continue;
+
+ section_begin_phys = e820.map[i].addr;
+ section_end_phys = e820.map[i].addr + e820.map[i].size;
+
+ // if its NOT within the memory range, ignore it
+ if (!(section_begin_phys < target_phys &&
+ target_phys < section_end_phys))
+ continue;
+
+ printk(KERN_DEBUG "MP Tables at %lx in %016lx - %016lx",
+ target_phys, section_begin_phys, section_end_phys);
+
+ switch (e820.map[i].type) {
+ case E820_RESERVED:
+ printk(KERN_DEBUG "(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk(KERN_DEBUG "(ACPI data)\n");
+ printk(KERN_DEBUG "WARNING: MP Tables located in");
+ printk(KERN_DEBUG "ACPI Data Area\n");
+ break;
+ case E820_NVS:
+ printk(KERN_DEBUG "(ACPI NVS)\n");
+ printk(KERN_DEBUG "WARNING: MP Tables located in");
+ printk(KERN_DEBUG "ACPI NVS Area\n");
+ break;
+ default:
+ printk(KERN_DEBUG "(type %u)\n", e820.map[i].type);
+ printk(KERN_ERR "WARNING: MP Tables located in");
+ printk(KERN_ERR "Unkown Memory Area!\n");
+ printk(KERN_ERR "Reservations are disallowed.\n");
+ return 0;
+ }
+ return 1;
+ }
+ return 0;
+}
+
/*
* Sanitize the BIOS e820 map.
*
diff -Naur linux-2.6.19-rc4/arch/x86_64/kernel/mpparse.c linux-2.6.19-rc4-az/arch/x86_64/kernel/mpparse.c
--- linux-2.6.19-rc4/arch/x86_64/kernel/mpparse.c 2006-10-31 17:38:41.000000000 -0500
+++ linux-2.6.19-rc4-az/arch/x86_64/kernel/mpparse.c 2006-11-02 17:25:10.000000000 -0500
@@ -23,6 +23,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
+#include <asm/e820.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -543,7 +544,7 @@
smp_found_config = 1;
reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
- if (mpf->mpf_physptr)
+ if (mpf->mpf_physptr && e820_reserved(mpf->mpf_physptr))
reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
mpf_found = mpf;
return 1;
diff -Naur linux-2.6.19-rc4/include/asm-x86_64/e820.h linux-2.6.19-rc4-az/include/asm-x86_64/e820.h
--- linux-2.6.19-rc4/include/asm-x86_64/e820.h 2006-10-31 17:39:24.000000000 -0500
+++ linux-2.6.19-rc4-az/include/asm-x86_64/e820.h 2006-11-02 17:25:10.000000000 -0500
@@ -44,6 +44,7 @@
extern void e820_reserve_resources(void);
extern void e820_mark_nosave_regions(void);
extern void e820_print_map(char *who);
+extern int e820_reserved(unsigned long target_phys);
extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
On Mon, Jun 09, 2008 at 10:17:32PM +0200, Bernhard Walle wrote:
> * Amul Shah <[email protected]> [2008-06-09 15:50]:
> >
> > > Don't remember the details. Perhaps Amul does (cc'ed)
> > >
> > > -Andi
> > >
> >
> > The short story is that the kexec kernel was panicking when trying to
> > reserve the MP tables. The panic occurs because the MP tables resided
> > in a reserved memory area above the highest address (80MB phys at that
> > time) in the user defined E820 map used by the kexec kernel.
> >
> > I had placed my code to affect only MP table reservation (see patch
> > below) because it is unique to just that code path. Andi decided a
> > generalized approach would be better in case other vendors had similar
> > issues.
>
> Ok, in that case it makes indeed sense to just return success here.
> Here's my third version of that patch:
Hi Bernhard,
[..]
> -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
> +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
> {
> #ifdef CONFIG_NUMA
> int nid, next_nid;
> #endif
> unsigned long pfn = phys >> PAGE_SHIFT;
> + int ret;
>
> if (pfn >= end_pfn) {
> /*
> @@ -811,11 +812,11 @@ void __init reserve_bootmem_generic(unsi
> * firmware tables:
> */
> if (pfn < max_pfn_mapped)
> - return;
> + return 0;
>
Can you please put some more explanation comment here to explain that
why it is ok to return with success, despite the fact that we never
reserved any memory.
One comment is already there, but it would be nice if we could expand
that a bit to give more context. Like during normal boot there we need
to make sure "MP tables" memory is reserved but once kdump kernel is
booted, same "MP tables" memory might be beyond "end_pfn" and
reserving code would not know about this special case. So it is ok to
return with success. (Something similar).
Thanks
Vivek
Hi Vivek,
* Vivek Goyal <[email protected]> [2008-06-09 16:29]:
>
> Can you please put some more explanation comment here to explain that
> why it is ok to return with success, despite the fact that we never
> reserved any memory.
Do you think that's ok?
When booting the kdump kernel, the MP tables (for example,
or other firmware-reserved memory) of the BIOS are beyond
end_pfn. (kexec-tools adds exactmap parameters to the kernel
so that the E820 map is no longer used.) Therefore, it's ok
to return "success" here. For normal boot, the MP tables
must be reserved normally.
Bernhard
------------------------------------------------------------------------------
From: Bernhard Walle <[email protected]>
Subject: Add 'flags' parameter to reserve_bootmem_generic()
This patch adds a 'flags' parameter to reserve_bootmem_generic() like it
already has been added in reserve_bootmem() with commit
72a7fe3967dbf86cb34e24fbf1d957fe24d2f246.
It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively
change the behaviour. Since the change is x86-specific, I don't think it's
necessary to add a new API for migration. There are only 4 users of that
function.
The change is necessary for the next patch, using reserve_bootmem_generic()
for crashkernel reservation.
Signed-off-by: Bernhard Walle <[email protected]>
---
arch/x86/kernel/e820_64.c | 3 ++-
arch/x86/kernel/efi_64.c | 3 ++-
arch/x86/kernel/mpparse.c | 5 +++--
arch/x86/mm/init_64.c | 27 +++++++++++++++++++--------
include/asm-x86/proto.h | 2 +-
5 files changed, 27 insertions(+), 13 deletions(-)
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigne
continue;
printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
}
}
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void)
void __init efi_reserve_bootmem(void)
{
reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
+ memmap.nr_map * memmap.desc_size,
+ BOOTMEM_DEFAULT);
}
void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -729,10 +729,11 @@ static int __init smp_scan_config(unsign
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
if (mpf->mpf_physptr)
reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
return 1;
}
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -798,24 +798,29 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
+ int ret;
if (pfn >= end_pfn) {
/*
- * This can happen with kdump kernels when accessing
- * firmware tables:
+ * When booting the kdump kernel, the MP tables (for example,
+ * or other firmware-reserved memory) of the BIOS are beyond
+ * end_pfn. (kexec-tools adds exactmap parameters to the kernel
+ * so that the E820 map is no longer used.) Therefore, it's ok
+ * to return "success" here. For normal boot, the MP tables
+ * must be reserved normally.
*/
if (pfn < max_pfn_mapped)
- return;
+ return 0;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -823,17 +828,23 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
#else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
#endif
+ if (ret != 0)
+ return ret;
+
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
}
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
--- a/include/asm-x86/proto.h
+++ b/include/asm-x86/proto.h
@@ -14,7 +14,7 @@ extern void ia32_syscall(void);
extern void ia32_cstar_target(void);
extern void ia32_sysenter_target(void);
-extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
+extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags);
extern void syscall32_cpu_init(void);
On Mon, Jun 09, 2008 at 10:42:11PM +0200, Bernhard Walle wrote:
> Hi Vivek,
>
> * Vivek Goyal <[email protected]> [2008-06-09 16:29]:
> >
> > Can you please put some more explanation comment here to explain that
> > why it is ok to return with success, despite the fact that we never
> > reserved any memory.
>
> Do you think that's ok?
>
> When booting the kdump kernel, the MP tables (for example,
> or other firmware-reserved memory) of the BIOS are beyond
> end_pfn. (kexec-tools adds exactmap parameters to the kernel
> so that the E820 map is no longer used.) Therefore, it's ok
> to return "success" here. For normal boot, the MP tables
> must be reserved normally.
>
Looks good. Thanks
Vivek
* Vivek Goyal <[email protected]> [2008-06-09 16:54]:
>
> On Mon, Jun 09, 2008 at 10:42:11PM +0200, Bernhard Walle wrote:
> > Hi Vivek,
> >
> > * Vivek Goyal <[email protected]> [2008-06-09 16:29]:
> > >
> > > Can you please put some more explanation comment here to explain that
> > > why it is ok to return with success, despite the fact that we never
> > > reserved any memory.
> >
> > Do you think that's ok?
> >
> > When booting the kdump kernel, the MP tables (for example,
> > or other firmware-reserved memory) of the BIOS are beyond
> > end_pfn. (kexec-tools adds exactmap parameters to the kernel
> > so that the E820 map is no longer used.) Therefore, it's ok
> > to return "success" here. For normal boot, the MP tables
> > must be reserved normally.
> >
>
> Looks good. Thanks
Should I resend the whole patch series to Linus for inclusion?
Bernhard
--
Bernhard Walle, SUSE LINUX Products GmbH, Architecture Maintenance
On Mon, Jun 09, 2008 at 10:57:34PM +0200, Bernhard Walle wrote:
> * Vivek Goyal <[email protected]> [2008-06-09 16:54]:
> >
> > On Mon, Jun 09, 2008 at 10:42:11PM +0200, Bernhard Walle wrote:
> > > Hi Vivek,
> > >
> > > * Vivek Goyal <[email protected]> [2008-06-09 16:29]:
> > > >
> > > > Can you please put some more explanation comment here to explain that
> > > > why it is ok to return with success, despite the fact that we never
> > > > reserved any memory.
> > >
> > > Do you think that's ok?
> > >
> > > When booting the kdump kernel, the MP tables (for example,
> > > or other firmware-reserved memory) of the BIOS are beyond
> > > end_pfn. (kexec-tools adds exactmap parameters to the kernel
> > > so that the E820 map is no longer used.) Therefore, it's ok
> > > to return "success" here. For normal boot, the MP tables
> > > must be reserved normally.
> > >
> >
> > Looks good. Thanks
>
> Should I resend the whole patch series to Linus for inclusion?
>
I am really not sure. :-). In the past, Andrew picked up the patches, kept
in his tree for couple of days and if something is important, he will push
it out to Linus.
Thanks
Vivek
* Vivek Goyal <[email protected]> [2008-06-09 17:00]:
>
> I am really not sure. :-). In the past, Andrew picked up the patches, kept
> in his tree for couple of days and if something is important, he will push
> it out to Linus.
Ok, I'm a bit confused about linux-next, and the new x86 tree from
Ingo/Thomas and how to merge patches. So, I'll wait a few days and then
resend the patches with Andrew in Cc.
Bernhard
--
Bernhard Walle, SUSE LINUX Products GmbH, Architecture Maintenance