2015-11-27 06:11:22

by Izumi, Taku

[permalink] [raw]
Subject: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now utilizes this information and allocates
boot-time memory from the reliable region.

My requirement is:
- allocate kernel memory from reliable region
- allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

My idea is to extend the existing "kernelcore" option and
introduce a kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
the non-reliable region will be arranged into ZONE_MOVABLE.

Earlier discussions are at:
https://lkml.org/lkml/2015/10/9/24
https://lkml.org/lkml/2015/10/15/9

For example, suppose 2-nodes system with the following memory
range:
node 0 [mem 0x0000000000001000-0x000000109fffffff]
node 1 [mem 0x00000010a0000000-0x000000209fffffff]

and the following ranges are marked as reliable:
[0x0000000000000000-0x0000000100000000]
[0x0000000100000000-0x0000000180000000]
[0x0000000800000000-0x0000000880000000]
[0x00000010a0000000-0x0000001120000000]
[0x00000017a0000000-0x0000001820000000]

If you specify kernelcore=reliable, ZONE_NORMAL and ZONE_MOVABLE
are arranged as below:

- node 0:
ZONE_NORMAL : [0x0000000100000000-0x00000010a0000000]
ZONE_MOVABLE: [0x0000000180000000-0x00000010a0000000]
- node 1:
ZONE_NORMAL : [0x00000010a0000000-0x00000020a0000000]
ZONE_MOVABLE: [0x0000001120000000-0x00000020a0000000]

In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL
are treated as absent pages, and vice versa.

v1 -> v2:
Refine so that the above example case can also be
handled properly.


Taku Izumi (2):
mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()
mm: Introduce kernelcore=reliable option

Documentation/kernel-parameters.txt | 9 ++-
mm/page_alloc.c | 140 +++++++++++++++++++++++++++++++-----
2 files changed, 131 insertions(+), 18 deletions(-)

--
1.8.3.1


2015-11-27 06:02:01

by Izumi, Taku

[permalink] [raw]
Subject: [PATCH v2 1/2] mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()

Currently each zone's zone_start_pfn is calculated at
free_area_init_core(). However zone's range is fixed at
the time when invoking zone_spanned_pages_in_node().

This patch changes the code so that each zone->zone_start_pfn is
calculated in zone_spanned_pages_in_node().

Signed-off-by: Taku Izumi <[email protected]>
---
mm/page_alloc.c | 30 +++++++++++++++++++-----------
1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66..acb0b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,31 +4928,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;

/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);

/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;

/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}

/*
@@ -5017,6 +5017,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
return zones_size[zone_type];
@@ -5047,15 +5049,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,

for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;

size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;

@@ -5176,7 +5185,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;

pgdat_resize_init(pgdat);
@@ -5192,6 +5200,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;

size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5260,7 +5269,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}

--
1.8.3.1

2015-11-27 06:02:39

by Izumi, Taku

[permalink] [raw]
Subject: [PATCH v2 2/2] mm: Introduce kernelcore=reliable option

This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

v1 -> v2:
- Refine so that the following case also can be
handled properly:

Node X: |MMMMMM------MMMMMM--------|
(legend) M: mirrored -: not mirrored

In this case, ZONE_NORMAL and ZONE_MOVABLE are
arranged as below:

Node X: |--------------------------|
|ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
|ooooooxxxxxxoooooooo| ZONE_MOVABLE
(legend) o: present x: absent

Signed-off-by: Taku Izumi <[email protected]>
---
Documentation/kernel-parameters.txt | 9 ++-
mm/page_alloc.c | 110 ++++++++++++++++++++++++++++++++++--
2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f8aae63..ed44c2c8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

keepinitrd [HW,ARM]

- kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
+ kernelcore= Format: nn[KMG] | "reliable"
+ [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations. The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.

+ Instead of specifying the amount of memory (nn[KMG]),
+ you can specify "reliable" option. In case "reliable"
+ option is specified, reliable memory is used for
+ non-movable allocations and remaining memory is used
+ for Movable pages.
+
kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index acb0b4e..006a3d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -251,6 +251,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
+ struct memblock_region *r = NULL, *tmp;

if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
+
+ /*
+ * if not reliable_kernelcore and ZONE_MOVABLE exists,
+ * range from zone_movable_pfn[nid] to end of each node
+ * should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
+ */
+ if (!reliable_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL &&
+ pfn >= zone_movable_pfn[nid])
+ continue;
+
+ /*
+ * check given memblock attribute by firmware which
+ * can affect kernel memory layout.
+ * if zone==ZONE_MOVABLE but memory is mirrored,
+ * it's an overlapped memmap init. skip it.
+ */
+ if (reliable_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r ||
+ pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
+ continue;
+ }
+ }
}

/*
@@ -4909,11 +4943,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);

- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -4998,6 +5027,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;

/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5009,7 +5039,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (reliable_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp((ulong)PFN_DOWN(r->base),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp((ulong)PFN_DOWN(r->base + r->size),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}

#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5507,6 +5569,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}

/*
+ * If kernelcore=reliable is specified, ignore movablecore option
+ */
+ if (reliable_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = PFN_DOWN(r->base);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5766,6 +5858,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=reliable */
+ if (parse_option_str(p, "reliable")) {
+ reliable_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}

--
1.8.3.1

2015-12-08 00:25:37

by Tony Luck

[permalink] [raw]
Subject: Re: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

Sorry for the slow turnaround testing this.

This version seems to do better with my quirky system.
Summary of /proc/zoneinfo now looks like this:

$ ./zoneinfo
Node Normal Movable DMA DMA32
0 17090.04 85687.43 14.93 1677.41
1 17949.70 81490.98
2 17911.66 85675.00
3 17936.42 85313.32

which gets close to the mirror numbers reported in early part of boot:

[ 0.000000] efi: Memory: 81050M/420096M mirrored memory

SUM(Normal) = 70887.82

There are ~8GB of "struct page" allocated from boot time allocator,
which covers most of the difference in the values.

-Tony

2015-12-08 00:31:14

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

On Sat, 28 Nov 2015 00:03:55 +0900 Taku Izumi <[email protected]> wrote:

> Xeon E7 v3 based systems supports Address Range Mirroring
> and UEFI BIOS complied with UEFI spec 2.5 can notify which
> ranges are reliable (mirrored) via EFI memory map.
> Now Linux kernel utilize its information and allocates
> boot time memory from reliable region.
>
> My requirement is:
> - allocate kernel memory from reliable region
> - allocate user memory from non-reliable region
>
> In order to meet my requirement, ZONE_MOVABLE is useful.
> By arranging non-reliable range into ZONE_MOVABLE,
> reliable memory is only used for kernel allocations.
>
> My idea is to extend existing "kernelcore" option and
> introduces kernelcore=reliable option. By specifying
> "reliable" instead of specifying the amount of memory,
> non-reliable region will be arranged into ZONE_MOVABLE.

It is unfortunate that the kernel presently refers to this memory as
"mirrored", but this patchset introduces the new term "reliable". I
think it would be better if we use "mirrored" throughout.

Of course, mirroring isn't the only way to get reliable memory.
Perhaps if a part of the system memory has ECC correction then this
also can be accessed using "reliable", in which case your proposed
naming makes sense. reliable == mirrored || ecc?



Secondly, does this patchset mean that kernelcore=reliable and
kernelcore=100M are exclusive? Or can the user specify
"kernelcore=reliable,kernelcore=100M" to use 100M of reliable memory
for kernelcore?

This is unclear from the documentation and I suggest that this be
spelled out.

2015-12-08 08:08:08

by Izumi, Taku

[permalink] [raw]
Subject: RE: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

Dear Tony,

Thanks for testing!

Dear Andrew,


> > Xeon E7 v3 based systems supports Address Range Mirroring
> > and UEFI BIOS complied with UEFI spec 2.5 can notify which
> > ranges are reliable (mirrored) via EFI memory map.
> > Now Linux kernel utilize its information and allocates
> > boot time memory from reliable region.
> >
> > My requirement is:
> > - allocate kernel memory from reliable region
> > - allocate user memory from non-reliable region
> >
> > In order to meet my requirement, ZONE_MOVABLE is useful.
> > By arranging non-reliable range into ZONE_MOVABLE,
> > reliable memory is only used for kernel allocations.
> >
> > My idea is to extend existing "kernelcore" option and
> > introduces kernelcore=reliable option. By specifying
> > "reliable" instead of specifying the amount of memory,
> > non-reliable region will be arranged into ZONE_MOVABLE.
>
> It is unfortunate that the kernel presently refers to this memory as
> "mirrored", but this patchset introduces the new term "reliable". I
> think it would be better if we use "mirrored" throughout.
> Of course, mirroring isn't the only way to get reliable memory.

YES. "mirroring" is not the only way.
So, in my opinion, we should change "mirrored" into "reliable" in order
to match terms of UEFI 2.5 spec.

> Perhaps if a part of the system memory has ECC correction then this
> also can be accessed using "reliable", in which case your proposed
> naming makes sense. reliable == mirrored || ecc?

"reliable" is better.

But, I'm willing to change "reliable" into "mirrored".

Otherwise, I will keep "kernelcore=reliable" and add the following minimal fix as
a separate patch:

diff a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -134,7 +134,7 @@ void __init efi_find_mirror(void)
}
}
if (mirror_size)
- pr_info("Memory: %lldM/%lldM mirrored memory\n",
+ pr_info("Memory: %lldM/%lldM reliable memory\n",
mirror_size>>20, total_size>>20);
}


Which do you think is better?
- change into kernelcore="mirrored"
- keep kernelcore="reliable" and add a minimal printk fix

>
> Secondly, does this patchset mean that kernelcore=reliable and
> kernelcore=100M are exclusive? Or can the user specify
> "kernelcore=reliable,kernelcore=100M" to use 100M of reliable memory
> for kernelcore?

No, these are exclusive.
>
> This is unclear from the documentation and I suggest that this be
> spelled out.

Thanks. I'll update its document.

Sincerely,
Taku Izumi

2015-12-08 16:11:09

by Tony Luck

[permalink] [raw]
Subject: Re: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

On Tue, Dec 8, 2015 at 12:07 AM, Izumi, Taku <[email protected]> wrote:
> Which do you think is beter ?
> - change into kernelcore="mirrored"
> - keep kernelcore="reliable" and minmal printk fix

UEFI came up with the "reliable" wording (as a more generic term ...
as Andrew said
it could cover differences in ECC modes, or some alternate memory
technology that
has lower error rates).

But I personally like "mirror" more ... it matches current
implementation. Of course
I'll look silly if some future system does something other than mirror.

-Tony

2015-12-08 23:53:14

by Izumi, Taku

[permalink] [raw]
Subject: RE: [PATCH v2 0/2] mm: Introduce kernelcore=reliable option

Dear Tony,


> > Which do you think is beter ?
> > - change into kernelcore="mirrored"
> > - keep kernelcore="reliable" and minmal printk fix
>
> UEFI came up with the "reliable" wording (as a more generic term ...
> as Andrew said
> it could cover differences in ECC modes, or some alternate memory
> technology that
> has lower error rates).
>
> But I personally like "mirror" more ... it matches current
> implementation. Of course
> I'll look silly if some future system does something other than mirror.
>

Okay, I'll change the option name into kernelcore=mirror.

Sincerely,
Taku Izumi
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2015-12-09 02:26:07

by Xishi Qiu

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] mm: Introduce kernelcore=reliable option

On 2015/11/27 23:04, Taku Izumi wrote:

> This patch extends existing "kernelcore" option and
> introduces kernelcore=reliable option. By specifying
> "reliable" instead of specifying the amount of memory,
> non-reliable region will be arranged into ZONE_MOVABLE.
>
> v1 -> v2:
> - Refine so that the following case also can be
> handled properly:
>
> Node X: |MMMMMM------MMMMMM--------|
> (legend) M: mirrored -: not mirrrored
>
> In this case, ZONE_NORMAL and ZONE_MOVABLE are
> arranged like bellow:
>
> Node X: |--------------------------|
> |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
> |ooooooxxxxxxoooooooo| ZONE_MOVABLE
> (legend) o: present x: absent
>
> Signed-off-by: Taku Izumi <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 9 ++-
> mm/page_alloc.c | 110 ++++++++++++++++++++++++++++++++++--
> 2 files changed, 112 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index f8aae63..ed44c2c8 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>
> keepinitrd [HW,ARM]
>
> - kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
> + kernelcore= Format: nn[KMG] | "reliable"
> + [KNL,X86,IA-64,PPC] This parameter
> specifies the amount of memory usable by the kernel
> for non-movable allocations. The requested amount is
> spread evenly throughout all nodes in the system. The
> @@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> use the HighMem zone if it exists, and the Normal
> zone if it does not.
>
> + Instead of specifying the amount of memory (nn[KMS]),
> + you can specify "reliable" option. In case "reliable"
> + option is specified, reliable memory is used for
> + non-movable allocations and remaining memory is used
> + for Movable pages.
> +
> kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
> Format: <Controller#>[,poll interval]
> The controller # is the number of the ehci usb debug
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index acb0b4e..006a3d8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -251,6 +251,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
> static unsigned long __initdata required_kernelcore;
> static unsigned long __initdata required_movablecore;
> static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
> +static bool reliable_kernelcore;
>
> /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
> int movable_zone;
> @@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> unsigned long pfn;
> struct zone *z;
> unsigned long nr_initialised = 0;
> + struct memblock_region *r = NULL, *tmp;
>
> if (highest_memmap_pfn < end_pfn - 1)
> highest_memmap_pfn = end_pfn - 1;
> @@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> if (!update_defer_init(pgdat, pfn, end_pfn,
> &nr_initialised))
> break;
> +
> + /*
> + * if not reliable_kernelcore and ZONE_MOVABLE exists,
> + * range from zone_movable_pfn[nid] to end of each node
> + * should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
> + */
> + if (!reliable_kernelcore && zone_movable_pfn[nid])
> + if (zone == ZONE_NORMAL &&
> + pfn >= zone_movable_pfn[nid])
> + continue;
> +
> + /*
> + * check given memblock attribute by firmware which
> + * can affect kernel memory layout.
> + * if zone==ZONE_MOVABLE but memory is mirrored,
> + * it's an overlapped memmap init. skip it.
> + */
> + if (reliable_kernelcore && zone == ZONE_MOVABLE) {
> + if (!r ||
> + pfn >= memblock_region_memory_end_pfn(r)) {
> + for_each_memblock(memory, tmp)
> + if (pfn < memblock_region_memory_end_pfn(tmp))
> + break;
> + r = tmp;
> + }
> + if (pfn >= memblock_region_memory_base_pfn(r) &&
> + memblock_is_mirror(r)) {
> + /* already initialized as NORMAL */
> + pfn = memblock_region_memory_end_pfn(r);
> + continue;
> + }
> + }

Hi Taku,

It has checked this case: zone==ZONE_MOVABLE but memory is mirrored,
but how about another case: zone==ZONE_NORMAL but memory is not mirrored?

Node X: |--------------------------|
|ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
|ooooooxxxxxxoooooooo| ZONE_MOVABLE
(legend) o: present x: absent

Thanks,
Xishi Qiu

> }
>
> /*
> @@ -4909,11 +4943,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
> *zone_end_pfn = min(node_end_pfn,
> arch_zone_highest_possible_pfn[movable_zone]);
>
> - /* Adjust for ZONE_MOVABLE starting within this range */
> - } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
> - *zone_end_pfn > zone_movable_pfn[nid]) {
> - *zone_end_pfn = zone_movable_pfn[nid];
> -
> /* Check if this whole range is within ZONE_MOVABLE */
> } else if (*zone_start_pfn >= zone_movable_pfn[nid])
> *zone_start_pfn = *zone_end_pfn;
> @@ -4998,6 +5027,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
> unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
> unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
> unsigned long zone_start_pfn, zone_end_pfn;
> + unsigned long nr_absent;
>
> /* When hotadd a new node from cpu_up(), the node should be empty */
> if (!node_start_pfn && !node_end_pfn)
> @@ -5009,7 +5039,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
> adjust_zone_range_for_zone_movable(nid, zone_type,
> node_start_pfn, node_end_pfn,
> &zone_start_pfn, &zone_end_pfn);
> - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
> + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
> +
> + /*
> + * ZONE_MOVABLE handling.
> + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
> + * and vice versa.
> + */
> + if (zone_movable_pfn[nid]) {
> + if (reliable_kernelcore) {
> + unsigned long start_pfn, end_pfn;
> + struct memblock_region *r;
> +
> + for_each_memblock(memory, r) {
> + start_pfn = clamp((ulong)PFN_DOWN(r->base),
> + zone_start_pfn, zone_end_pfn);
> + end_pfn = clamp((ulong)PFN_DOWN(r->base + r->size),
> + zone_start_pfn, zone_end_pfn);
> +
> + if (zone_type == ZONE_MOVABLE &&
> + memblock_is_mirror(r))
> + nr_absent += end_pfn - start_pfn;
> +
> + if (zone_type == ZONE_NORMAL &&
> + !memblock_is_mirror(r))
> + nr_absent += end_pfn - start_pfn;
> + }
> + } else {
> + if (zone_type == ZONE_NORMAL)
> + nr_absent += node_end_pfn - zone_movable_pfn[nid];
> + }
> + }
> +
> + return nr_absent;
> }
>
> #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
> @@ -5507,6 +5569,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
> }
>
> /*
> + * If kernelcore=reliable is specified, ignore movablecore option
> + */
> + if (reliable_kernelcore) {
> + bool mem_below_4gb_not_mirrored = false;
> +
> + for_each_memblock(memory, r) {
> + if (memblock_is_mirror(r))
> + continue;
> +
> + nid = r->nid;
> +
> + usable_startpfn = PFN_DOWN(r->base);
> +
> + if (usable_startpfn < 0x100000) {
> + mem_below_4gb_not_mirrored = true;
> + continue;
> + }
> +
> + zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
> + min(usable_startpfn, zone_movable_pfn[nid]) :
> + usable_startpfn;
> + }
> +
> + if (mem_below_4gb_not_mirrored)
> + pr_warn("This configuration results in unmirrored kernel memory.");
> +
> + goto out2;
> + }
> +
> + /*
> * If movablecore=nn[KMG] was specified, calculate what size of
> * kernelcore that corresponds so that memory usable for
> * any allocation type is evenly spread. If both kernelcore
> @@ -5766,6 +5858,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
> */
> static int __init cmdline_parse_kernelcore(char *p)
> {
> + /* parse kernelcore=reliable */
> + if (parse_option_str(p, "reliable")) {
> + reliable_kernelcore = true;
> + return 0;
> + }
> +
> return cmdline_parse_core(p, &required_kernelcore);
> }
>


2015-12-09 02:40:32

by Xishi Qiu

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] mm: Introduce kernelcore=reliable option

On 2015/12/9 10:25, Xishi Qiu wrote:

> On 2015/11/27 23:04, Taku Izumi wrote:
>
>> This patch extends existing "kernelcore" option and
>> introduces kernelcore=reliable option. By specifying
>> "reliable" instead of specifying the amount of memory,
>> non-reliable region will be arranged into ZONE_MOVABLE.
>>
>> v1 -> v2:
>> - Refine so that the following case also can be
>> handled properly:
>>
>> Node X: |MMMMMM------MMMMMM--------|
>> (legend) M: mirrored -: not mirrrored
>>
>> In this case, ZONE_NORMAL and ZONE_MOVABLE are
>> arranged like bellow:
>>
>> Node X: |--------------------------|
>> |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
>> |ooooooxxxxxxoooooooo| ZONE_MOVABLE
>> (legend) o: present x: absent
>>
>> Signed-off-by: Taku Izumi <[email protected]>
>> ---
>> Documentation/kernel-parameters.txt | 9 ++-
>> mm/page_alloc.c | 110 ++++++++++++++++++++++++++++++++++--
>> 2 files changed, 112 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
>> index f8aae63..ed44c2c8 100644
>> --- a/Documentation/kernel-parameters.txt
>> +++ b/Documentation/kernel-parameters.txt
>> @@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>>
>> keepinitrd [HW,ARM]
>>
>> - kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
>> + kernelcore= Format: nn[KMG] | "reliable"
>> + [KNL,X86,IA-64,PPC] This parameter
>> specifies the amount of memory usable by the kernel
>> for non-movable allocations. The requested amount is
>> spread evenly throughout all nodes in the system. The
>> @@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>> use the HighMem zone if it exists, and the Normal
>> zone if it does not.
>>
>> + Instead of specifying the amount of memory (nn[KMS]),
>> + you can specify "reliable" option. In case "reliable"
>> + option is specified, reliable memory is used for
>> + non-movable allocations and remaining memory is used
>> + for Movable pages.
>> +
>> kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
>> Format: <Controller#>[,poll interval]
>> The controller # is the number of the ehci usb debug
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index acb0b4e..006a3d8 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -251,6 +251,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
>> static unsigned long __initdata required_kernelcore;
>> static unsigned long __initdata required_movablecore;
>> static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
>> +static bool reliable_kernelcore;
>>
>> /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
>> int movable_zone;
>> @@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>> unsigned long pfn;
>> struct zone *z;
>> unsigned long nr_initialised = 0;
>> + struct memblock_region *r = NULL, *tmp;
>>
>> if (highest_memmap_pfn < end_pfn - 1)
>> highest_memmap_pfn = end_pfn - 1;
>> @@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
>> if (!update_defer_init(pgdat, pfn, end_pfn,
>> &nr_initialised))
>> break;
>> +
>> + /*
>> + * if not reliable_kernelcore and ZONE_MOVABLE exists,
>> + * range from zone_movable_pfn[nid] to end of each node
>> + * should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
>> + */
>> + if (!reliable_kernelcore && zone_movable_pfn[nid])
>> + if (zone == ZONE_NORMAL &&
>> + pfn >= zone_movable_pfn[nid])
>> + continue;
>> +
>> + /*
>> + * check given memblock attribute by firmware which
>> + * can affect kernel memory layout.
>> + * if zone==ZONE_MOVABLE but memory is mirrored,
>> + * it's an overlapped memmap init. skip it.
>> + */
>> + if (reliable_kernelcore && zone == ZONE_MOVABLE) {
>> + if (!r ||
>> + pfn >= memblock_region_memory_end_pfn(r)) {
>> + for_each_memblock(memory, tmp)
>> + if (pfn < memblock_region_memory_end_pfn(tmp))
>> + break;
>> + r = tmp;
>> + }
>> + if (pfn >= memblock_region_memory_base_pfn(r) &&
>> + memblock_is_mirror(r)) {
>> + /* already initialized as NORMAL */
>> + pfn = memblock_region_memory_end_pfn(r);
>> + continue;
>> + }
>> + }
>
> Hi Taku,
>
> It has checked this case: zone==ZONE_MOVABLE but memory is mirrored,
> but how about another case: zone==ZONE_NORMAL but memory is not mirrored?
>
> Node X: |--------------------------|
> |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
> |ooooooxxxxxxoooooooo| ZONE_MOVABLE
> (legend) o: present x: absent
>
> Thanks,
> Xishi Qiu
>

Hi Taku,

memmap_init_zone() initializes the normal zone first and then the movable
zone, and the second pass re-initializes pages that were already initialized
as part of the normal zone, so there is no need to check the other case, right?

I think this is a little confusing and wastes time.

Thanks,
Xishi Qiu

2015-12-09 03:21:12

by Izumi, Taku

[permalink] [raw]
Subject: RE: [PATCH v2 2/2] mm: Introduce kernelcore=reliable option

Dear Xishi,

Thanks for reviewing.

> -----Original Message-----
> From: Xishi Qiu [mailto:[email protected]]
> Sent: Wednesday, December 09, 2015 11:26 AM
> To: Izumi, Taku/泉 拓
> Cc: [email protected]; [email protected]; [email protected]; Kamezawa, Hiroyuki/亀澤 寛之; [email protected];
> [email protected]; [email protected]; [email protected]
> Subject: Re: [PATCH v2 2/2] mm: Introduce kernelcore=reliable option
>
> On 2015/11/27 23:04, Taku Izumi wrote:
>
> > This patch extends existing "kernelcore" option and
> > introduces kernelcore=reliable option. By specifying
> > "reliable" instead of specifying the amount of memory,
> > non-reliable region will be arranged into ZONE_MOVABLE.
> >
> > v1 -> v2:
> > - Refine so that the following case also can be
> > handled properly:
> >
> > Node X: |MMMMMM------MMMMMM--------|
> > (legend) M: mirrored -: not mirrored
> >
> > In this case, ZONE_NORMAL and ZONE_MOVABLE are
> > arranged as shown below:
> >
> > Node X: |--------------------------|
> > |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
> > |ooooooxxxxxxoooooooo| ZONE_MOVABLE
> > (legend) o: present x: absent
> >
> > Signed-off-by: Taku Izumi <[email protected]>
> > ---
> > Documentation/kernel-parameters.txt | 9 ++-
> > mm/page_alloc.c | 110 ++++++++++++++++++++++++++++++++++--
> > 2 files changed, 112 insertions(+), 7 deletions(-)
> >
> > diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> > index f8aae63..ed44c2c8 100644
> > --- a/Documentation/kernel-parameters.txt
> > +++ b/Documentation/kernel-parameters.txt
> > @@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> >
> > keepinitrd [HW,ARM]
> >
> > - kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
> > + kernelcore= Format: nn[KMG] | "reliable"
> > + [KNL,X86,IA-64,PPC] This parameter
> > specifies the amount of memory usable by the kernel
> > for non-movable allocations. The requested amount is
> > spread evenly throughout all nodes in the system. The
> > @@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> > use the HighMem zone if it exists, and the Normal
> > zone if it does not.
> >
> > + Instead of specifying the amount of memory (nn[KMG]),
> > + you can specify "reliable" option. In case "reliable"
> > + option is specified, reliable memory is used for
> > + non-movable allocations and remaining memory is used
> > + for Movable pages.
> > +
> > kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
> > Format: <Controller#>[,poll interval]
> > The controller # is the number of the ehci usb debug
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index acb0b4e..006a3d8 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -251,6 +251,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
> > static unsigned long __initdata required_kernelcore;
> > static unsigned long __initdata required_movablecore;
> > static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
> > +static bool reliable_kernelcore;
> >
> > /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
> > int movable_zone;
> > @@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> > unsigned long pfn;
> > struct zone *z;
> > unsigned long nr_initialised = 0;
> > + struct memblock_region *r = NULL, *tmp;
> >
> > if (highest_memmap_pfn < end_pfn - 1)
> > highest_memmap_pfn = end_pfn - 1;
> > @@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
> > if (!update_defer_init(pgdat, pfn, end_pfn,
> > &nr_initialised))
> > break;
> > +
> > + /*
> > + * if not reliable_kernelcore and ZONE_MOVABLE exists,
> > + * range from zone_movable_pfn[nid] to end of each node
> > + * should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
> > + */
> > + if (!reliable_kernelcore && zone_movable_pfn[nid])
> > + if (zone == ZONE_NORMAL &&
> > + pfn >= zone_movable_pfn[nid])
> > + continue;
> > +
> > + /*
> > + * check given memblock attribute by firmware which
> > + * can affect kernel memory layout.
> > + * if zone==ZONE_MOVABLE but memory is mirrored,
> > + * it's an overlapped memmap init. skip it.
> > + */
> > + if (reliable_kernelcore && zone == ZONE_MOVABLE) {
> > + if (!r ||
> > + pfn >= memblock_region_memory_end_pfn(r)) {
> > + for_each_memblock(memory, tmp)
> > + if (pfn < memblock_region_memory_end_pfn(tmp))
> > + break;
> > + r = tmp;
> > + }
> > + if (pfn >= memblock_region_memory_base_pfn(r) &&
> > + memblock_is_mirror(r)) {
> > + /* already initialized as NORMAL */
> > + pfn = memblock_region_memory_end_pfn(r);
> > + continue;
> > + }
> > + }
>
> Hi Taku,
>
> It has checked this case: zone==ZONE_MOVABLE but memory is mirrored,
> but how about another case: zone==ZONE_NORMAL but memory is not mirrored?

Both cases are handled.

>
> Node X: |--------------------------|
> |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
> |ooooooxxxxxxoooooooo| ZONE_MOVABLE
> (legend) o: present x: absent


          A     B                    C
Node X:   |--------------------------|
          |ooooooxxxxxxooooooxxxxxxxx| ZONE_NORMAL
                |ooooooxxxxxxoooooooo| ZONE_MOVABLE
(legend) o: present x: absent

ZONE_NORMAL, zone_start_pfn: A, size(spanned_pages): C-A
ZONE_MOVABLE, zone_start_pfn: B, size(spanned_pages): C-B
A: node_start_pfn
B: zone_movable_pfn[nid]

First, memmap_init_zone() is invoked with zone = ZONE_NORMAL, so
all pages are initialized as ZONE_NORMAL, as shown below:

A                          C
|NNNNNNNNNNNNNNNNNNNNNNNNNN| ZONE_NORMAL

Then memmap_init_zone() is invoked with zone = ZONE_MOVABLE:
pages that belong to ZONE_MOVABLE are reinitialized as ZONE_MOVABLE, and
the others are skipped (so they remain ZONE_NORMAL), as shown below:

      B                    C
      |MMMMMM------MMMMMMMM| ZONE_MOVABLE

So the pages finally end up mapped as shown below:

A     B                    C
|NNNNNNMMMMMMNNNNNNMMMMMMMM|


Sincerely,
Taku Izumi

>
> > }
> >
> > /*
> > @@ -4909,11 +4943,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
> > *zone_end_pfn = min(node_end_pfn,
> > arch_zone_highest_possible_pfn[movable_zone]);
> >
> > - /* Adjust for ZONE_MOVABLE starting within this range */
> > - } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
> > - *zone_end_pfn > zone_movable_pfn[nid]) {
> > - *zone_end_pfn = zone_movable_pfn[nid];
> > -
> > /* Check if this whole range is within ZONE_MOVABLE */
> > } else if (*zone_start_pfn >= zone_movable_pfn[nid])
> > *zone_start_pfn = *zone_end_pfn;
> > @@ -4998,6 +5027,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
> > unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
> > unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
> > unsigned long zone_start_pfn, zone_end_pfn;
> > + unsigned long nr_absent;
> >
> > /* When hotadd a new node from cpu_up(), the node should be empty */
> > if (!node_start_pfn && !node_end_pfn)
> > @@ -5009,7 +5039,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
> > adjust_zone_range_for_zone_movable(nid, zone_type,
> > node_start_pfn, node_end_pfn,
> > &zone_start_pfn, &zone_end_pfn);
> > - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
> > + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
> > +
> > + /*
> > + * ZONE_MOVABLE handling.
> > + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
> > + * and vice versa.
> > + */
> > + if (zone_movable_pfn[nid]) {
> > + if (reliable_kernelcore) {
> > + unsigned long start_pfn, end_pfn;
> > + struct memblock_region *r;
> > +
> > + for_each_memblock(memory, r) {
> > + start_pfn = clamp((ulong)PFN_DOWN(r->base),
> > + zone_start_pfn, zone_end_pfn);
> > + end_pfn = clamp((ulong)PFN_DOWN(r->base + r->size),
> > + zone_start_pfn, zone_end_pfn);
> > +
> > + if (zone_type == ZONE_MOVABLE &&
> > + memblock_is_mirror(r))
> > + nr_absent += end_pfn - start_pfn;
> > +
> > + if (zone_type == ZONE_NORMAL &&
> > + !memblock_is_mirror(r))
> > + nr_absent += end_pfn - start_pfn;
> > + }
> > + } else {
> > + if (zone_type == ZONE_NORMAL)
> > + nr_absent += node_end_pfn - zone_movable_pfn[nid];
> > + }
> > + }
> > +
> > + return nr_absent;
> > }
> >
> > #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
> > @@ -5507,6 +5569,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
> > }
> >
> > /*
> > + * If kernelcore=reliable is specified, ignore movablecore option
> > + */
> > + if (reliable_kernelcore) {
> > + bool mem_below_4gb_not_mirrored = false;
> > +
> > + for_each_memblock(memory, r) {
> > + if (memblock_is_mirror(r))
> > + continue;
> > +
> > + nid = r->nid;
> > +
> > + usable_startpfn = PFN_DOWN(r->base);
> > +
> > + if (usable_startpfn < 0x100000) {
> > + mem_below_4gb_not_mirrored = true;
> > + continue;
> > + }
> > +
> > + zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
> > + min(usable_startpfn, zone_movable_pfn[nid]) :
> > + usable_startpfn;
> > + }
> > +
> > + if (mem_below_4gb_not_mirrored)
> > + pr_warn("This configuration results in unmirrored kernel memory.");
> > +
> > + goto out2;
> > + }
> > +
> > + /*
> > * If movablecore=nn[KMG] was specified, calculate what size of
> > * kernelcore that corresponds so that memory usable for
> > * any allocation type is evenly spread. If both kernelcore
> > @@ -5766,6 +5858,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
> > */
> > static int __init cmdline_parse_kernelcore(char *p)
> > {
> > + /* parse kernelcore=reliable */
> > + if (parse_option_str(p, "reliable")) {
> > + reliable_kernelcore = true;
> > + return 0;
> > + }
> > +
> > return cmdline_parse_core(p, &required_kernelcore);
> > }
> >
>
>