This patchset provides a boot option for the user to specify the ZONE_MOVABLE memory
map for each node in the system.
movablecore_map=nn[KMG]@ss[KMG]
This option ensures that the memory range from ss to ss+nn is movable memory
(an example command line is given after the list below).
1) If the range falls within a single node, then the memory from ss to the end of
   that node will be ZONE_MOVABLE.
2) If the range covers two or more nodes, then the memory from ss to the end of
   the first node will be ZONE_MOVABLE, and all the following nodes will only
   have ZONE_MOVABLE.
3) If no range falls in a node, then the node will have no ZONE_MOVABLE
   unless kernelcore or movablecore is specified.
4) This option can be specified at most MAX_NUMNODES times.
5) If kernelcore or movablecore is also specified, movablecore_map will take
   priority over them.
6) This option does not conflict with the memmap option.
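For example, a hypothetical command line (the sizes and addresses here are
purely illustrative, not taken from this patchset's testing):

    movablecore_map=4G@8G movablecore_map=2G@20G

This asks the kernel to treat the 4G of memory starting at the 8G physical
address, and the 2G starting at 20G, as ZONE_MOVABLE, following rules 1)-3)
above for whichever nodes those ranges fall in.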
Tang Chen (4):
page_alloc: add movable_memmap kernel parameter
page_alloc: Sanitize movablecore_map.
page_alloc: Limit movable zone areas with movablecore_map parameter
page_alloc: Bootmem limit with movablecore_map
Yasuaki Ishimatsu (1):
x86: get pg_data_t's memory from other node
Documentation/kernel-parameters.txt | 17 +++
arch/x86/mm/numa.c | 9 +-
include/linux/memblock.h | 1 +
include/linux/mm.h | 11 ++
mm/memblock.c | 43 ++++++-
mm/page_alloc.c | 233 ++++++++++++++++++++++++++++++++++-
6 files changed, 307 insertions(+), 7 deletions(-)
From: Yasuaki Ishimatsu <[email protected]>
If the system creates a movable node, in which all of the node's memory is
allocated as ZONE_MOVABLE, setup_node_data() cannot allocate memory for the
node's pg_data_t from that node.
So when memblock_alloc_nid() fails, setup_node_data() retries with
memblock_alloc().
Signed-off-by: Yasuaki Ishimatsu <[email protected]>
Signed-off-by: Lai Jiangshan <[email protected]>
Signed-off-by: Tang Chen <[email protected]>
Reviewed-by: Wen Congyang <[email protected]>
Tested-by: Lin Feng <[email protected]>
---
arch/x86/mm/numa.c | 9 +++++++--
1 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be..ae2e76e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -224,9 +224,14 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
} else {
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
+ printk(KERN_WARNING "Cannot find %zu bytes in node %d\n",
nd_size, nid);
- return;
+ nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in other node\n",
+ nd_size);
+ return;
+ }
}
nd = __va(nd_pa);
}
--
1.7.1
This patch makes sure that bootmem will not allocate memory from areas that
may be ZONE_MOVABLE. The map info comes from the movablecore_map boot option.
Signed-off-by: Tang Chen <[email protected]>
Reviewed-by: Wen Congyang <[email protected]>
Tested-by: Lin Feng <[email protected]>
---
include/linux/memblock.h | 1 +
mm/memblock.c | 43 ++++++++++++++++++++++++++++++++++++++++---
2 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d452ee1..6e25597 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock {
extern struct memblock memblock;
extern int memblock_debug;
+extern struct movablecore_map movablecore_map;
#define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
diff --git a/mm/memblock.c b/mm/memblock.c
index 6259055..0f74c73 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -19,6 +19,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/mm.h>
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -99,8 +100,9 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
{
- phys_addr_t this_start, this_end, cand;
+ phys_addr_t this_start, this_end, map_start, map_end, cand;
u64 i;
+ int curr = movablecore_map.nr_map;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -114,12 +116,47 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
+restart:
if (this_end < size)
continue;
cand = round_down(this_end - size, align);
- if (cand >= this_start)
- return cand;
+ if (cand < this_start)
+ continue;
+
+ /*
+ * We start in reverse order to find out if [cand, this_end) is
+ * in a movablecore_map range.
+ */
+ while (--curr >= 0) {
+ map_start =
+ movablecore_map.map[curr].start << PAGE_SHIFT;
+ map_end =
+ movablecore_map.map[curr].end << PAGE_SHIFT;
+
+ /*
+ * Find the previous range of [this_start, this_end).
+ * Since memory is allocated in reverse order, we need
+ * to make sure this_end is after the end of the range.
+ */
+ if (this_end <= map_end)
+ continue;
+
+ /* [cand, this_end) and range are not overlapped. */
+ if (cand >= map_end)
+ return cand;
+ else {
+ /* Otherwise, goto the previous range. */
+ this_end = map_start;
+ goto restart;
+ }
+ }
+
+ /*
+ * If movablecore_map has not been initialized yet,
+ * just return cand.
+ */
+ return cand;
}
return 0;
}
--
1.7.1
If kernelcore or movablecore is specified at the same time as movablecore_map,
movablecore_map will take priority over them.
Signed-off-by: Tang Chen <[email protected]>
Reviewed-by: Wen Congyang <[email protected]>
Tested-by: Lin Feng <[email protected]>
---
mm/page_alloc.c | 29 +++++++++++++++++++++++++++--
1 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae29970..c8dfb1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4774,7 +4774,7 @@ static unsigned long __init early_calculate_totalpages(void)
static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
- unsigned long usable_startpfn;
+ unsigned long usable_startpfn, node_movable_limit;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
@@ -4803,7 +4803,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
required_kernelcore = max(required_kernelcore, corepages);
}
- /* If kernelcore was not specified, there is no ZONE_MOVABLE */
if (!required_kernelcore)
goto out;
@@ -4817,6 +4816,9 @@ restart:
for_each_node_state(nid, N_HIGH_MEMORY) {
unsigned long start_pfn, end_pfn;
+ node_movable_limit = zone_movable_pfn[nid];
+ zone_movable_pfn[nid] = 0;
+
/*
* Recalculate kernelcore_node if the division per node
* now exceeds what is necessary to satisfy the requested
@@ -4840,6 +4842,29 @@ restart:
if (start_pfn >= end_pfn)
continue;
+ /*
+ * If movablecore_map was specified with kernelcore
+ * or movablecore, it will have higher priority to be
+ * satisfied.
+ */
+ if (start_pfn >= node_movable_limit) {
+ /*
+ * Here, we meet the ZONE_MOVABLE boundary
+ * specified by movablecore_map. We should
+ * not spread any more, but keep the rest
+ * of kernelcore_remaining and break out.
+ * And also, usable_nodes should be decreased.
+ */
+ usable_nodes--;
+ break;
+ }
+
+ /*
+ * If ZONE_MOVABLE start_pfn is in the range, we need
+ * to shrink end_pfn to ZONE_MOVABLE start_pfn.
+ */
+ end_pfn = min(end_pfn, node_movable_limit);
+
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
--
1.7.1
zone_movable_pfn is initialized to 0. This patch sets its elements to the
first pfn of ZONE_MOVABLE of the corresponding node. The map info comes from
the movablecore_map boot option. zone_movable_pfn[nid] == 0 means the node has
no ZONE_MOVABLE.
Signed-off-by: Tang Chen <[email protected]>
Reviewed-by: Wen Congyang <[email protected]>
Tested-by: Lin Feng <[email protected]>
---
mm/page_alloc.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 58 insertions(+), 0 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 198106f..ae29970 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4323,6 +4323,59 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
+/**
+ * sanitize_zone_movable_pfn - Sanitize the zone_movable_pfn array.
+ *
+ * zone_movable_pfn is initialized as 0. This function will try to get the
+ * first ZONE_MOVABLE pfn of each node from movablecore_map, and assign
+ * them to zone_movable_pfn.
+ * zone_movable_pfn[nid] == 0 means the node has no ZONE_MOVABLE.
+ *
+ * Note: Each range is represented as [start_pfn, end_pfn)
+ */
+static void __meminit sanitize_zone_movable_pfn(void)
+{
+ int i = 0, j = 0, nid;
+ unsigned long start_pfn, end_pfn, movable_start, tmp_start;
+
+ if (!movablecore_map.nr_map)
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ /* Assume there is no ZONE_MOVABLE on the node at first */
+ movable_start = ULONG_MAX;
+
+ while (j < movablecore_map.nr_map) {
+ if (movablecore_map.map[j].start >= end_pfn)
+ break;
+ if (movablecore_map.map[j].end <= start_pfn) {
+ j++;
+ continue;
+ }
+
+ movable_start = max(start_pfn, movablecore_map.map[j].start);
+ if (!zone_movable_pfn[nid])
+ zone_movable_pfn[nid] = ULONG_MAX;
+
+ /*
+ * Sections covering two or more nodes
+ * should not be skipped.
+ */
+ if (movablecore_map.map[j].end < end_pfn)
+ j++;
+
+ break;
+ }
+
+ /*
+ * The start_pfn of ZONE_MOVABLE is either the minimum pfn
+ * specified by movablecore_map, or the end of the node,
+ * which means the node has no ZONE_MOVABLE.
+ */
+ zone_movable_pfn[nid] = min(movable_start, zone_movable_pfn[nid]);
+ }
+}
+
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
@@ -4341,6 +4394,10 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
return zholes_size[zone_type];
}
+static void __meminit sanitize_zone_movable_pfn()
+{
+}
+
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4906,6 +4963,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ sanitize_zone_movable_pfn();
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
--
1.7.1
This patch adds functions to parse the movablecore_map boot option. Since the
option can be specified more than once, all the ranges are stored in the
global movablecore_map.map array.
The array is kept sorted by start_pfn in monotonically increasing order, and
all overlapping ranges are merged.
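As an illustration (the values are hypothetical), specifying
"movablecore_map=1G@2G movablecore_map=1G@2560M" gives two overlapping
ranges, 0x80000000-0xc0000000 and 0xa0000000-0xe0000000, which are merged
into a single movablecore_map.map entry covering 0x80000000-0xe0000000.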
Signed-off-by: Tang Chen <[email protected]>
Reviewed-by: Wen Congyang <[email protected]>
Tested-by: Lin Feng <[email protected]>
---
Documentation/kernel-parameters.txt | 17 ++++
include/linux/mm.h | 11 +++
mm/page_alloc.c | 146 +++++++++++++++++++++++++++++++++++
3 files changed, 174 insertions(+), 0 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9776f06..0718976 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1620,6 +1620,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that the amount of memory usable for all allocations
is not too small.
+ movablecore_map=nn[KMG]@ss[KMG]
+ [KNL,X86,IA-64,PPC] This parameter is similar to
+ memmap except it specifies the memory map of
+ ZONE_MOVABLE.
+ If more areas are all within one node, then from
+ lowest ss to the end of the node will be ZONE_MOVABLE.
+ If an area covers two or more nodes, the area from
+ ss to the end of the 1st node will be ZONE_MOVABLE,
+ and all the rest nodes will only have ZONE_MOVABLE.
+ If memmap is specified at the same time, the
+ movablecore_map will be limited within the memmap
+ areas. If kernelcore or movablecore is also specified,
+ movablecore_map will have higher priority to be
+ satisfied. So the administrator should be careful that
+ the amount of movablecore_map areas are not too large.
+ Otherwise kernel won't have enough memory to start.
+
MTD_Partition= [MTD]
Format: <name>,<region-number>,<size>,<offset>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa06804..e4541b4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1328,6 +1328,17 @@ extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
extern void sparse_memory_present_with_active_regions(int nid);
+#define MOVABLECORE_MAP_MAX MAX_NUMNODES
+struct movablecore_entry {
+ unsigned long start; /* start pfn of memory segment */
+ unsigned long end; /* end pfn of memory segment */
+};
+
+struct movablecore_map {
+ __u32 nr_map;
+ struct movablecore_entry map[MOVABLECORE_MAP_MAX];
+};
+
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b74de6..198106f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -198,6 +198,9 @@ static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory segments, will also be used by memblock subsystem. */
+struct movablecore_map movablecore_map;
+
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
@@ -4986,6 +4989,149 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
+/**
+ * insert_movablecore_map - Insert a memory range in to movablecore_map.map.
+ * @start_pfn: start pfn of the range
+ * @end_pfn: end pfn of the range
+ *
+ * This function will also merge the overlapped ranges, and sort the array
+ * by start_pfn in monotonic increasing order.
+ */
+static void __init insert_movablecore_map(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int i, pos_start, pos_end, remove;
+ bool merge = true;
+
+ if (!movablecore_map.nr_map) {
+ movablecore_map.map[0].start = start_pfn;
+ movablecore_map.map[0].end = end_pfn;
+ movablecore_map.nr_map++;
+ return;
+ }
+
+ /*
+ * pos_start at the 1st overlapped segment if merge_start is true,
+ * or at the next unoverlapped segment if merge_start is false.
+ */
+ for (pos_start = 0; pos_start < movablecore_map.nr_map; pos_start++)
+ if (start_pfn <= movablecore_map.map[pos_start].end) {
+ if (end_pfn < movablecore_map.map[pos_start].start)
+ merge = false;
+ break;
+ }
+
+ /*
+ * pos_end at the last overlapped segment if merge_end is true,
+ * or at the next unoverlapped segment if merge_start is false.
+ */
+ for (pos_end = pos_start; pos_end < movablecore_map.nr_map; pos_end++) {
+ if (end_pfn < movablecore_map.map[pos_end].start) {
+ if (pos_end > 0 && start_pfn > movablecore_map.map[pos_end-1].end)
+ merge = false;
+ else
+ pos_end--;
+ break;
+ }
+ }
+ if (pos_end == movablecore_map.nr_map && merge)
+ pos_end--;
+
+ if (pos_start == movablecore_map.nr_map)
+ merge = false;
+
+ if (merge) {
+ remove = pos_end - pos_start;
+
+ movablecore_map.map[pos_start].start =
+ min(start_pfn, movablecore_map.map[pos_start].start);
+ movablecore_map.map[pos_start].end =
+ max(end_pfn, movablecore_map.map[pos_end].end);
+
+ if (remove == 0)
+ goto out;
+
+ for (i = pos_start+1; i < movablecore_map.nr_map; i++) {
+ movablecore_map.map[i].start =
+ movablecore_map.map[i+remove].start;
+ movablecore_map.map[i].end =
+ movablecore_map.map[i+remove].end;
+ }
+
+ movablecore_map.nr_map -= remove;
+ } else {
+ for (i = movablecore_map.nr_map; i > pos_start; i--) {
+ movablecore_map.map[i].start =
+ movablecore_map.map[i-1].start;
+ movablecore_map.map[i].end =
+ movablecore_map.map[i-1].end;
+ }
+
+ movablecore_map.map[pos_start].start = start_pfn;
+ movablecore_map.map[pos_start].end = end_pfn;
+ movablecore_map.nr_map++;
+ }
+}
+
+/**
+ * movablecore_map_add_region - Add a memory range into movablecore_map.
+ * @start: physical start address of range
+ * @end: physical end address of range
+ *
+ * This function transform the physical address into pfn, and then add the
+ * range into movablecore_map by calling insert_movablecore_map().
+ */
+static void __init movablecore_map_add_region(u64 start, u64 size)
+{
+ unsigned long start_pfn, end_pfn;
+
+ if (start + size <= start)
+ return;
+
+ if (movablecore_map.nr_map >= ARRAY_SIZE(movablecore_map.map)) {
+ pr_err("movable_memory_map: too many entries;"
+ " ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start,
+ (unsigned long long) (start + size - 1));
+ return;
+ }
+
+ start_pfn = PFN_DOWN(start);
+ end_pfn = PFN_UP(start + size);
+ insert_movablecore_map(start_pfn, end_pfn);
+}
+
+/*
+ * movablecore_map=nn[KMG]@ss[KMG] sets the region of memory to be used as
+ * movable memory.
+ */
+static int __init cmdline_parse_movablecore_map(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!p)
+ goto err;
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ goto err;
+
+ if (*p == '@') {
+ oldp = p + 1;
+ start_at = memparse(p+1, &p);
+ if (p == oldp || *p != '\0')
+ goto err;
+
+ movablecore_map_add_region(start_at, mem_size);
+ return 0;
+ }
+err:
+ return -EINVAL;
+}
+early_param("movablecore_map", cmdline_parse_movablecore_map);
+
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/**
--
1.7.1
On Mon, 19 Nov 2012 22:27:21 +0800
Tang Chen <[email protected]> wrote:
> This patchset provide a boot option for user to specify ZONE_MOVABLE memory
> map for each node in the system.
>
> movablecore_map=nn[KMG]@ss[KMG]
>
> This option make sure memory range from ss to ss+nn is movable memory.
> 1) If the range is involved in a single node, then from ss to the end of
> the node will be ZONE_MOVABLE.
> 2) If the range covers two or more nodes, then from ss to the end of
> the node will be ZONE_MOVABLE, and all the other nodes will only
> have ZONE_MOVABLE.
> 3) If no range is in the node, then the node will have no ZONE_MOVABLE
> unless kernelcore or movablecore is specified.
> 4) This option could be specified at most MAX_NUMNODES times.
> 5) If kernelcore or movablecore is also specified, movablecore_map will have
> higher priority to be satisfied.
> 6) This option has no conflict with memmap option.
This doesn't describe the problem which the patchset solves. I can
kinda see where it's coming from, but it would be nice to have it all
spelled out, please.
- What is wrong with the kernel as it stands?
- What are the possible ways of solving this?
- Describe the chosen way, explain why it is superior to alternatives
The amount of manual system configuration in this proposal looks quite
high. Adding kernel boot parameters really is a last resort. Why was
it unavoidable here?
On 2012-11-20 4:53, Andrew Morton wrote:
> On Mon, 19 Nov 2012 22:27:21 +0800
> Tang Chen <[email protected]> wrote:
>
>> This patchset provide a boot option for user to specify ZONE_MOVABLE memory
>> map for each node in the system.
>>
>> movablecore_map=nn[KMG]@ss[KMG]
>>
>> This option make sure memory range from ss to ss+nn is movable memory.
>> 1) If the range is involved in a single node, then from ss to the end of
>> the node will be ZONE_MOVABLE.
>> 2) If the range covers two or more nodes, then from ss to the end of
>> the node will be ZONE_MOVABLE, and all the other nodes will only
>> have ZONE_MOVABLE.
>> 3) If no range is in the node, then the node will have no ZONE_MOVABLE
>> unless kernelcore or movablecore is specified.
>> 4) This option could be specified at most MAX_NUMNODES times.
>> 5) If kernelcore or movablecore is also specified, movablecore_map will have
>> higher priority to be satisfied.
>> 6) This option has no conflict with memmap option.
>
> This doesn't describe the problem which the patchset solves. I can
> kinda see where it's coming from, but it would be nice to have it all
> spelled out, please.
>
> - What is wrong with the kernel as it stands?
> - What are the possible ways of solving this?
> - Describe the chosen way, explain why it is superior to alternatives
>
> The amount of manual system configuration in this proposal looks quite
> high. Adding kernel boot parameters really is a last resort. Why was
> it unavoidable here?
Agreed, manual configuration should be a last resort.
We should ask the BIOS to provide more help with hotplug functionality,
so that it works out of the box on platforms with hotplug capabilities.
For CPU/memory/node hotplug, I feel the backward compatibility burden on the OS
should be minor, so why not ask the BIOS for better hotplug support?
We could shape the interfaces between the BIOS and the OS to support system device
hotplug.
Thanks
Gerry
2012/11/20 5:53, Andrew Morton wrote:
> On Mon, 19 Nov 2012 22:27:21 +0800
> Tang Chen <[email protected]> wrote:
>
>> This patchset provide a boot option for user to specify ZONE_MOVABLE memory
>> map for each node in the system.
>>
>> movablecore_map=nn[KMG]@ss[KMG]
>>
>> This option make sure memory range from ss to ss+nn is movable memory.
>> 1) If the range is involved in a single node, then from ss to the end of
>> the node will be ZONE_MOVABLE.
>> 2) If the range covers two or more nodes, then from ss to the end of
>> the node will be ZONE_MOVABLE, and all the other nodes will only
>> have ZONE_MOVABLE.
>> 3) If no range is in the node, then the node will have no ZONE_MOVABLE
>> unless kernelcore or movablecore is specified.
>> 4) This option could be specified at most MAX_NUMNODES times.
>> 5) If kernelcore or movablecore is also specified, movablecore_map will have
>> higher priority to be satisfied.
>> 6) This option has no conflict with memmap option.
>
> This doesn't describe the problem which the patchset solves. I can
> kinda see where it's coming from, but it would be nice to have it all
> spelled out, please.
>
> - What is wrong with the kernel as it stands?
If we hot remove memory, that memory cannot contain kernel memory,
because Linux currently cannot migrate kernel memory. Therefore,
we have to guarantee that the hot removed memory contains only movable
memory.
Linux has two boot options, kernelcore= and movablecore=, for
creating movable memory. These boot options can specify the amount
of memory to use as kernel or movable memory. Using them, we can
create ZONE_MOVABLE, which has only movable memory.
But this does not fulfill a requirement of memory hot remove, because
even if we specify the boot options, movable memory is distributed
across the nodes evenly. So when we want to hot remove memory whose
range is 0x80000000-0xc0000000, we have no way to specify
that memory as movable memory.
So we propose a new feature which specifies the memory range to use as
movable memory.
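As an illustration (the values are only an example), the proposed option lets
the administrator name that range directly:

    movablecore_map=1G@2G

which covers physical addresses 0x80000000-0xc0000000, whereas kernelcore=
and movablecore= can only request "some amount" of movable memory without
saying where it should live.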
> - What are the possible ways of solving this?
I considered two ways to specify movable memory.
1. use firmware information
2. use boot option
1. use firmware information
According to the ACPI 5.0 spec, the SRAT table has a Memory Affinity
Structure, and the structure has a Hot Pluggable field. See "5.2.16.2 Memory
Affinity Structure". If we use this information, we might be able to
specify movable memory via firmware. For example, if the Hot Pluggable
field is set, Linux could treat the memory as movable memory (a small
illustrative sketch is given after item 2 below).
2. use boot option
This is our proposal. A new boot option can specify the memory range to use
as movable memory.
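For reference, here is a minimal sketch (not part of this patchset; the
helper mark_range_movable() is hypothetical) of how the firmware information
from option 1 could be consumed, assuming the kernel's existing
struct acpi_srat_mem_affinity and ACPI_SRAT_MEM_HOT_PLUGGABLE definitions:

    #include <linux/init.h>
    #include <linux/types.h>
    #include <linux/acpi.h>

    /* Mark a SRAT memory affinity range as movable if firmware allows it. */
    static void __init srat_mark_hotplug_movable(struct acpi_srat_mem_affinity *ma)
    {
            u64 start = ma->base_address;
            u64 end = start + ma->length;

            if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
                    return;

            /* "Hot Pluggable" field, ACPI 5.0, section 5.2.16.2 */
            if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
                    mark_range_movable(start, end); /* hypothetical helper */
    }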
> - Describe the chosen way, explain why it is superior to alternatives
We chose the second way because, with the first way, users cannot easily
change the memory range to use as movable memory. We think that if we create
movable memory, a performance regression may occur due to NUMA. In that case,
the user can turn off the feature easily if we provide a boot option.
And with a boot option, the user can also easily select which memory
to use as movable memory.
Thanks,
Yasuaki Ishimatsu
>
> The amount of manual system configuration in this proposal looks quite
> high. Adding kernel boot parameters really is a last resort. Why was
> it unavoidable here?
>
On 11/20/2012 07:07 PM, Yasuaki Ishimatsu wrote:
> 2012/11/20 5:53, Andrew Morton wrote:
>> On Mon, 19 Nov 2012 22:27:21 +0800
>> Tang Chen <[email protected]> wrote:
>>
>>> This patchset provide a boot option for user to specify ZONE_MOVABLE
>>> memory
>>> map for each node in the system.
>>>
>>> movablecore_map=nn[KMG]@ss[KMG]
>>>
>>> This option make sure memory range from ss to ss+nn is movable memory.
>>> 1) If the range is involved in a single node, then from ss to the
>>> end of
>>> the node will be ZONE_MOVABLE.
>>> 2) If the range covers two or more nodes, then from ss to the end of
>>> the node will be ZONE_MOVABLE, and all the other nodes will only
>>> have ZONE_MOVABLE.
>>> 3) If no range is in the node, then the node will have no ZONE_MOVABLE
>>> unless kernelcore or movablecore is specified.
>>> 4) This option could be specified at most MAX_NUMNODES times.
>>> 5) If kernelcore or movablecore is also specified, movablecore_map
>>> will have
>>> higher priority to be satisfied.
>>> 6) This option has no conflict with memmap option.
>>
>> This doesn't describe the problem which the patchset solves. I can
>> kinda see where it's coming from, but it would be nice to have it all
>> spelled out, please.
>>
>
>> - What is wrong with the kernel as it stands?
>
> If we hot remove a memroy, the memory cannot have kernel memory,
> because Linux cannot migrate kernel memory currently. Therefore,
> we have to guarantee that the hot removed memory has only movable
> memoroy.
>
> Linux has two boot options, kernelcore= and movablecore=, for
> creating movable memory. These boot options can specify the amount
> of memory use as kernel or movable memory. Using them, we can
> create ZONE_MOVABLE which has only movable memory.
>
> But it does not fulfill a requirement of memory hot remove, because
> even if we specify the boot options, movable memory is distributed
> in each node evenly. So when we want to hot remove memory which
> memory range is 0x80000000-0c0000000, we have no way to specify
> the memory as movable memory.
Could you explain why we can't specify the memory as movable memory in this
case?
>
> So we proposed a new feature which specifies memory range to use as
> movable memory.
>
>> - What are the possible ways of solving this?
>
> I thought 2 ways to specify movable memory.
> 1. use firmware information
> 2. use boot option
>
> 1. use firmware information
> According to ACPI spec 5.0, SRAT table has memory affinity structure
> and the structure has Hot Pluggable Filed. See "5.2.16.2 Memory
> Affinity Structure". If we use the information, we might be able to
> specify movable memory by firmware. For example, if Hot Pluggable
> Filed is enabled, Linux sets the memory as movable memory.
>
> 2. use boot option
> This is our proposal. New boot option can specify memory range to use
> as movable memory.
>
>> - Describe the chosen way, explain why it is superior to alternatives
>
> We chose second way, because if we use first way, users cannot change
> memory range to use as movable memory easily. We think if we create
> movable memory, performance regression may occur by NUMA. In this case,
Could you explain in detail why the regression occurs?
> user can turn off the feature easily if we prepare the boot option.
> And if we prepare the boot optino, the user can select which memory
> to use as movable memory easily.
>
> Thanks,
> Yasuaki Ishimatsu
>
>>
>> The amount of manual system configuration in this proposal looks quite
>> high. Adding kernel boot parameters really is a last resort. Why was
>> it unavoidable here?
>>
>
>
Hi Jaegeuk,
2012/11/20 20:25, Jaegeuk Hanse wrote:
> On 11/20/2012 07:07 PM, Yasuaki Ishimatsu wrote:
>> 2012/11/20 5:53, Andrew Morton wrote:
>>> On Mon, 19 Nov 2012 22:27:21 +0800
>>> Tang Chen <[email protected]> wrote:
>>>
>>>> This patchset provide a boot option for user to specify ZONE_MOVABLE memory
>>>> map for each node in the system.
>>>>
>>>> movablecore_map=nn[KMG]@ss[KMG]
>>>>
>>>> This option make sure memory range from ss to ss+nn is movable memory.
>>>> 1) If the range is involved in a single node, then from ss to the end of
>>>> the node will be ZONE_MOVABLE.
>>>> 2) If the range covers two or more nodes, then from ss to the end of
>>>> the node will be ZONE_MOVABLE, and all the other nodes will only
>>>> have ZONE_MOVABLE.
>>>> 3) If no range is in the node, then the node will have no ZONE_MOVABLE
>>>> unless kernelcore or movablecore is specified.
>>>> 4) This option could be specified at most MAX_NUMNODES times.
>>>> 5) If kernelcore or movablecore is also specified, movablecore_map will have
>>>> higher priority to be satisfied.
>>>> 6) This option has no conflict with memmap option.
>>>
>>> This doesn't describe the problem which the patchset solves. I can
>>> kinda see where it's coming from, but it would be nice to have it all
>>> spelled out, please.
>>>
>>
>>> - What is wrong with the kernel as it stands?
>>
>> If we hot remove a memroy, the memory cannot have kernel memory,
>> because Linux cannot migrate kernel memory currently. Therefore,
>> we have to guarantee that the hot removed memory has only movable
>> memoroy.
>>
>> Linux has two boot options, kernelcore= and movablecore=, for
>> creating movable memory. These boot options can specify the amount
>> of memory use as kernel or movable memory. Using them, we can
>> create ZONE_MOVABLE which has only movable memory.
>>
>> But it does not fulfill a requirement of memory hot remove, because
>> even if we specify the boot options, movable memory is distributed
>> in each node evenly. So when we want to hot remove memory which
>> memory range is 0x80000000-0c0000000, we have no way to specify
>> the memory as movable memory.
>
> Could you explain why can't specify the memory as movable memory in this case?
To create movable memory, Linux provides two boot options, kernelcore=
and movablecore=. These boot options specify the amount of memory, not the
memory range, to use as kernel or movable memory. So when we use these boot
options, we cannot control where the movable memory is placed.
>
>>
>> So we proposed a new feature which specifies memory range to use as
>> movable memory.
>>
>>> - What are the possible ways of solving this?
>>
>> I thought 2 ways to specify movable memory.
>> 1. use firmware information
>> 2. use boot option
>>
>> 1. use firmware information
>> According to ACPI spec 5.0, SRAT table has memory affinity structure
>> and the structure has Hot Pluggable Filed. See "5.2.16.2 Memory
>> Affinity Structure". If we use the information, we might be able to
>> specify movable memory by firmware. For example, if Hot Pluggable
>> Filed is enabled, Linux sets the memory as movable memory.
>>
>> 2. use boot option
>> This is our proposal. New boot option can specify memory range to use
>> as movable memory.
>>
>>> - Describe the chosen way, explain why it is superior to alternatives
>>
>> We chose second way, because if we use first way, users cannot change
>> memory range to use as movable memory easily. We think if we create
>> movable memory, performance regression may occur by NUMA. In this case,
>
> Could you explain why regression occur in details?
Using the boot option, we can create a movable node which has only
movable memory. So if we create a new task, the kernel memory and the
movable memory (anonymous pages and page cache) of the task are allocated
from different nodes. In this case, a performance regression may occur.
Thanks,
Yasuaki Ishimatsu
>
>> user can turn off the feature easily if we prepare the boot option.
>> And if we prepare the boot optino, the user can select which memory
>> to use as movable memory easily.
>>
>> Thanks,
>> Yasuaki Ishimatsu
>>
>>>
>>> The amount of manual system configuration in this proposal looks quite
>>> high. Adding kernel boot parameters really is a last resort. Why was
>>> it unavoidable here?
>>>
>>
>>
>
Hi Tang,
The patch has two stray whitespace characters (marked below).
2012/11/19 23:27, Tang Chen wrote:
> This patch adds functions to parse movablecore_map boot option. Since the
> option could be specified more then once, all the maps will be stored in
> the global variable movablecore_map.map array.
>
> And also, we keep the array in monotonic increasing order by start_pfn.
> And merge all overlapped ranges.
>
> Signed-off-by: Tang Chen <[email protected]>
> Reviewed-by: Wen Congyang <[email protected]>
> Tested-by: Lin Feng <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 17 ++++
> include/linux/mm.h | 11 +++
> mm/page_alloc.c | 146 +++++++++++++++++++++++++++++++++++
> 3 files changed, 174 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 9776f06..0718976 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1620,6 +1620,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> that the amount of memory usable for all allocations
> is not too small.
>
> + movablecore_map=nn[KMG]@ss[KMG]
> + [KNL,X86,IA-64,PPC] This parameter is similar to
> + memmap except it specifies the memory map of
> + ZONE_MOVABLE.
> + If more areas are all within one node, then from
> + lowest ss to the end of the node will be ZONE_MOVABLE.
> + If an area covers two or more nodes, the area from
> + ss to the end of the 1st node will be ZONE_MOVABLE,
> + and all the rest nodes will only have ZONE_MOVABLE.
^ here
> + If memmap is specified at the same time, the
> + movablecore_map will be limited within the memmap
> + areas. If kernelcore or movablecore is also specified,
> + movablecore_map will have higher priority to be
> + satisfied. So the administrator should be careful that
> + the amount of movablecore_map areas are not too large.
> + Otherwise kernel won't have enough memory to start.
> +
> MTD_Partition= [MTD]
> Format: <name>,<region-number>,<size>,<offset>
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa06804..e4541b4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1328,6 +1328,17 @@ extern void free_bootmem_with_active_regions(int nid,
> unsigned long max_low_pfn);
> extern void sparse_memory_present_with_active_regions(int nid);
>
> +#define MOVABLECORE_MAP_MAX MAX_NUMNODES
> +struct movablecore_entry {
> + unsigned long start; /* start pfn of memory segment */
> + unsigned long end; /* end pfn of memory segment */
> +};
> +
> +struct movablecore_map {
> + __u32 nr_map;
> + struct movablecore_entry map[MOVABLECORE_MAP_MAX];
> +};
> +
> #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
>
> #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5b74de6..198106f 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -198,6 +198,9 @@ static unsigned long __meminitdata nr_all_pages;
> static unsigned long __meminitdata dma_reserve;
>
> #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
> +/* Movable memory segments, will also be used by memblock subsystem. */
> +struct movablecore_map movablecore_map;
> +
> static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
> static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
> static unsigned long __initdata required_kernelcore;
> @@ -4986,6 +4989,149 @@ static int __init cmdline_parse_movablecore(char *p)
> early_param("kernelcore", cmdline_parse_kernelcore);
> early_param("movablecore", cmdline_parse_movablecore);
>
> +/**
> + * insert_movablecore_map - Insert a memory range in to movablecore_map.map.
> + * @start_pfn: start pfn of the range
> + * @end_pfn: end pfn of the range
> + *
> + * This function will also merge the overlapped ranges, and sort the array
> + * by start_pfn in monotonic increasing order.
> + */
> +static void __init insert_movablecore_map(unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + int i, pos_start, pos_end, remove;
> + bool merge = true;
> +
> + if (!movablecore_map.nr_map) {
> + movablecore_map.map[0].start = start_pfn;
> + movablecore_map.map[0].end = end_pfn;
> + movablecore_map.nr_map++;
> + return;
> + }
> +
> + /*
> + * pos_start at the 1st overlapped segment if merge_start is true,
> + * or at the next unoverlapped segment if merge_start is false.
> + */
> + for (pos_start = 0; pos_start < movablecore_map.nr_map; pos_start++)
> + if (start_pfn <= movablecore_map.map[pos_start].end) {
> + if (end_pfn < movablecore_map.map[pos_start].start)
> + merge = false;
> + break;
> + }
> +
> + /*
> + * pos_end at the last overlapped segment if merge_end is true,
> + * or at the next unoverlapped segment if merge_start is false.
> + */
> + for (pos_end = pos_start; pos_end < movablecore_map.nr_map; pos_end++) {
> + if (end_pfn < movablecore_map.map[pos_end].start) {
> + if (pos_end > 0 && start_pfn > movablecore_map.map[pos_end-1].end)
> + merge = false;
> + else
> + pos_end--;
> + break;
> + }
> + }
> + if (pos_end == movablecore_map.nr_map && merge)
> + pos_end--;
> +
> + if (pos_start == movablecore_map.nr_map)
> + merge = false;
> +
> + if (merge) {
> + remove = pos_end - pos_start;
> +
> + movablecore_map.map[pos_start].start =
> + min(start_pfn, movablecore_map.map[pos_start].start);
> + movablecore_map.map[pos_start].end =
^ here
Thanks,
Yasuaki Ishimatsu
> + max(end_pfn, movablecore_map.map[pos_end].end);
> +
> + if (remove == 0)
> + goto out;
> +
> + for (i = pos_start+1; i < movablecore_map.nr_map; i++) {
> + movablecore_map.map[i].start =
> + movablecore_map.map[i+remove].start;
> + movablecore_map.map[i].end =
> + movablecore_map.map[i+remove].end;
> + }
> +
> + movablecore_map.nr_map -= remove;
> + } else {
> + for (i = movablecore_map.nr_map; i > pos_start; i--) {
> + movablecore_map.map[i].start =
> + movablecore_map.map[i-1].start;
> + movablecore_map.map[i].end =
> + movablecore_map.map[i-1].end;
> + }
> +
> + movablecore_map.map[pos_start].start = start_pfn;
> + movablecore_map.map[pos_start].end = end_pfn;
> + movablecore_map.nr_map++;
> + }
> +}
> +
> +/**
> + * movablecore_map_add_region - Add a memory range into movablecore_map.
> + * @start: physical start address of range
> + * @end: physical end address of range
> + *
> + * This function transform the physical address into pfn, and then add the
> + * range into movablecore_map by calling insert_movablecore_map().
> + */
> +static void __init movablecore_map_add_region(u64 start, u64 size)
> +{
> + unsigned long start_pfn, end_pfn;
> +
> + if (start + size <= start)
> + return;
> +
> + if (movablecore_map.nr_map >= ARRAY_SIZE(movablecore_map.map)) {
> + pr_err("movable_memory_map: too many entries;"
> + " ignoring [mem %#010llx-%#010llx]\n",
> + (unsigned long long) start,
> + (unsigned long long) (start + size - 1));
> + return;
> + }
> +
> + start_pfn = PFN_DOWN(start);
> + end_pfn = PFN_UP(start + size);
> + insert_movablecore_map(start_pfn, end_pfn);
> +}
> +
> +/*
> + * movablecore_map=nn[KMG]@ss[KMG] sets the region of memory to be used as
> + * movable memory.
> + */
> +static int __init cmdline_parse_movablecore_map(char *p)
> +{
> + char *oldp;
> + u64 start_at, mem_size;
> +
> + if (!p)
> + goto err;
> +
> + oldp = p;
> + mem_size = memparse(p, &p);
> + if (p == oldp)
> + goto err;
> +
> + if (*p == '@') {
> + oldp = p + 1;
> + start_at = memparse(p+1, &p);
> + if (p == oldp || *p != '\0')
> + goto err;
> +
> + movablecore_map_add_region(start_at, mem_size);
> + return 0;
> + }
> +err:
> + return -EINVAL;
> +}
> +early_param("movablecore_map", cmdline_parse_movablecore_map);
> +
> #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
>
> /**
>
Hi Tang,
2012/11/19 23:27, Tang Chen wrote:
> From: Yasuaki Ishimatsu <[email protected]>
>
> If system can create movable node which all memory of the
> node is allocated as ZONE_MOVABLE, setup_node_data() cannot
> allocate memory for the node's pg_data_t.
> So when memblock_alloc_nid() fails, setup_node_data() retries
> memblock_alloc().
>
> Signed-off-by: Yasuaki Ishimatsu <[email protected]>
> Signed-off-by: Lai Jiangshan <[email protected]>
> Signed-off-by: Tang Chen <[email protected]>
> Reviewed-by: Wen Congyang <[email protected]>
> Tested-by: Lin Feng <[email protected]>
> ---
> arch/x86/mm/numa.c | 9 +++++++--
> 1 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 2d125be..ae2e76e 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -224,9 +224,14 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
> } else {
> nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
> if (!nd_pa) {
> - pr_err("Cannot find %zu bytes in node %d\n",
> + printk(KERN_WARNING "Cannot find %zu bytes in node %d\n",
> nd_size, nid)
Please change to use pr_warn().
Thanks,
Yasuaki Ishimatsu
> - return;
> + nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);
> + if (!nd_pa) {
> + pr_err("Cannot find %zu bytes in other node\n",
> + nd_size);
> + return;
> + }
> }
> nd = __va(nd_pa);
> }
>
Hi Ishimatsu-san,
Thanks for the comments.
And I also found some algorithm problems in patches 2 and 3.
I am working on it, and a v2 patchset is coming soon. :)
Thanks.
On 11/21/2012 01:46 PM, Yasuaki Ishimatsu wrote:
> Hi Tang,
>
> 2012/11/19 23:27, Tang Chen wrote:
>> From: Yasuaki Ishimatsu<[email protected]>
>>
>> If system can create movable node which all memory of the
>> node is allocated as ZONE_MOVABLE, setup_node_data() cannot
>> allocate memory for the node's pg_data_t.
>> So when memblock_alloc_nid() fails, setup_node_data() retries
>> memblock_alloc().
>>
>> Signed-off-by: Yasuaki Ishimatsu<[email protected]>
>> Signed-off-by: Lai Jiangshan<[email protected]>
>> Signed-off-by: Tang Chen<[email protected]>
>> Reviewed-by: Wen Congyang<[email protected]>
>> Tested-by: Lin Feng<[email protected]>
>> ---
>> arch/x86/mm/numa.c | 9 +++++++--
>> 1 files changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 2d125be..ae2e76e 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -224,9 +224,14 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
>> } else {
>> nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
>> if (!nd_pa) {
>> - pr_err("Cannot find %zu bytes in node %d\n",
>
>> + printk(KERN_WARNING "Cannot find %zu bytes in node %d\n",
>> nd_size, nid)
>
> Please change to use pr_warn().
>
> Thanks,
> Yasuaki Ishimatsu
>
>> - return;
>> + nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);
>> + if (!nd_pa) {
>> + pr_err("Cannot find %zu bytes in other node\n",
>> + nd_size);
>> + return;
>> + }
>> }
>> nd = __va(nd_pa);
>> }
>>
>
>
>
Hi Tang,
When I applied the patch, the following error occurred.
mm/page_alloc.c: In function ‘insert_movablecore_map’:
mm/page_alloc.c:5061: error: label ‘out’ used but not defined
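One minimal way to resolve this (just a sketch of a possible fix, not
necessarily what v2 will do) is to return instead of jumping to the missing
label, since remove == 0 means no entries need to be shifted out:

    -		if (remove == 0)
    -			goto out;
    +		/* Nothing to shift when the new range merged into one entry. */
    +		if (remove == 0)
    +			return;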
Thanks,
Yasuaki Ishimatsu
2012/11/19 23:27, Tang Chen wrote:
> This patch adds functions to parse movablecore_map boot option. Since the
> option could be specified more then once, all the maps will be stored in
> the global variable movablecore_map.map array.
>
> And also, we keep the array in monotonic increasing order by start_pfn.
> And merge all overlapped ranges.
>
> Signed-off-by: Tang Chen <[email protected]>
> Reviewed-by: Wen Congyang <[email protected]>
> Tested-by: Lin Feng <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 17 ++++
> include/linux/mm.h | 11 +++
> mm/page_alloc.c | 146 +++++++++++++++++++++++++++++++++++
> 3 files changed, 174 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 9776f06..0718976 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1620,6 +1620,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> that the amount of memory usable for all allocations
> is not too small.
>
> + movablecore_map=nn[KMG]@ss[KMG]
> + [KNL,X86,IA-64,PPC] This parameter is similar to
> + memmap except it specifies the memory map of
> + ZONE_MOVABLE.
> + If more areas are all within one node, then from
> + lowest ss to the end of the node will be ZONE_MOVABLE.
> + If an area covers two or more nodes, the area from
> + ss to the end of the 1st node will be ZONE_MOVABLE,
> + and all the rest nodes will only have ZONE_MOVABLE.
> + If memmap is specified at the same time, the
> + movablecore_map will be limited within the memmap
> + areas. If kernelcore or movablecore is also specified,
> + movablecore_map will have higher priority to be
> + satisfied. So the administrator should be careful that
> + the amount of movablecore_map areas are not too large.
> + Otherwise kernel won't have enough memory to start.
> +
> MTD_Partition= [MTD]
> Format: <name>,<region-number>,<size>,<offset>
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa06804..e4541b4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1328,6 +1328,17 @@ extern void free_bootmem_with_active_regions(int nid,
> unsigned long max_low_pfn);
> extern void sparse_memory_present_with_active_regions(int nid);
>
> +#define MOVABLECORE_MAP_MAX MAX_NUMNODES
> +struct movablecore_entry {
> + unsigned long start; /* start pfn of memory segment */
> + unsigned long end; /* end pfn of memory segment */
> +};
> +
> +struct movablecore_map {
> + __u32 nr_map;
> + struct movablecore_entry map[MOVABLECORE_MAP_MAX];
> +};
> +
> #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
>
> #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5b74de6..198106f 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -198,6 +198,9 @@ static unsigned long __meminitdata nr_all_pages;
> static unsigned long __meminitdata dma_reserve;
>
> #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
> +/* Movable memory segments, will also be used by memblock subsystem. */
> +struct movablecore_map movablecore_map;
> +
> static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
> static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
> static unsigned long __initdata required_kernelcore;
> @@ -4986,6 +4989,149 @@ static int __init cmdline_parse_movablecore(char *p)
> early_param("kernelcore", cmdline_parse_kernelcore);
> early_param("movablecore", cmdline_parse_movablecore);
>
> +/**
> + * insert_movablecore_map - Insert a memory range in to movablecore_map.map.
> + * @start_pfn: start pfn of the range
> + * @end_pfn: end pfn of the range
> + *
> + * This function will also merge the overlapped ranges, and sort the array
> + * by start_pfn in monotonic increasing order.
> + */
> +static void __init insert_movablecore_map(unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + int i, pos_start, pos_end, remove;
> + bool merge = true;
> +
> + if (!movablecore_map.nr_map) {
> + movablecore_map.map[0].start = start_pfn;
> + movablecore_map.map[0].end = end_pfn;
> + movablecore_map.nr_map++;
> + return;
> + }
> +
> + /*
> + * pos_start at the 1st overlapped segment if merge_start is true,
> + * or at the next unoverlapped segment if merge_start is false.
> + */
> + for (pos_start = 0; pos_start < movablecore_map.nr_map; pos_start++)
> + if (start_pfn <= movablecore_map.map[pos_start].end) {
> + if (end_pfn < movablecore_map.map[pos_start].start)
> + merge = false;
> + break;
> + }
> +
> + /*
> + * pos_end at the last overlapped segment if merge_end is true,
> + * or at the next unoverlapped segment if merge_start is false.
> + */
> + for (pos_end = pos_start; pos_end < movablecore_map.nr_map; pos_end++) {
> + if (end_pfn < movablecore_map.map[pos_end].start) {
> + if (pos_end > 0 && start_pfn > movablecore_map.map[pos_end-1].end)
> + merge = false;
> + else
> + pos_end--;
> + break;
> + }
> + }
> + if (pos_end == movablecore_map.nr_map && merge)
> + pos_end--;
> +
> + if (pos_start == movablecore_map.nr_map)
> + merge = false;
> +
> + if (merge) {
> + remove = pos_end - pos_start;
> +
> + movablecore_map.map[pos_start].start =
> + min(start_pfn, movablecore_map.map[pos_start].start);
> + movablecore_map.map[pos_start].end =
> + max(end_pfn, movablecore_map.map[pos_end].end);
> +
> + if (remove == 0)
> + goto out;
> +
> + for (i = pos_start+1; i < movablecore_map.nr_map; i++) {
> + movablecore_map.map[i].start =
> + movablecore_map.map[i+remove].start;
> + movablecore_map.map[i].end =
> + movablecore_map.map[i+remove].end;
> + }
> +
> + movablecore_map.nr_map -= remove;
> + } else {
> + for (i = movablecore_map.nr_map; i > pos_start; i--) {
> + movablecore_map.map[i].start =
> + movablecore_map.map[i-1].start;
> + movablecore_map.map[i].end =
> + movablecore_map.map[i-1].end;
> + }
> +
> + movablecore_map.map[pos_start].start = start_pfn;
> + movablecore_map.map[pos_start].end = end_pfn;
> + movablecore_map.nr_map++;
> + }
> +}
> +
> +/**
> + * movablecore_map_add_region - Add a memory range into movablecore_map.
> + * @start: physical start address of range
> + * @end: physical end address of range
> + *
> + * This function transform the physical address into pfn, and then add the
> + * range into movablecore_map by calling insert_movablecore_map().
> + */
> +static void __init movablecore_map_add_region(u64 start, u64 size)
> +{
> + unsigned long start_pfn, end_pfn;
> +
> + if (start + size <= start)
> + return;
> +
> + if (movablecore_map.nr_map >= ARRAY_SIZE(movablecore_map.map)) {
> + pr_err("movable_memory_map: too many entries;"
> + " ignoring [mem %#010llx-%#010llx]\n",
> + (unsigned long long) start,
> + (unsigned long long) (start + size - 1));
> + return;
> + }
> +
> + start_pfn = PFN_DOWN(start);
> + end_pfn = PFN_UP(start + size);
> + insert_movablecore_map(start_pfn, end_pfn);
> +}
> +
> +/*
> + * movablecore_map=nn[KMG]@ss[KMG] sets the region of memory to be used as
> + * movable memory.
> + */
> +static int __init cmdline_parse_movablecore_map(char *p)
> +{
> + char *oldp;
> + u64 start_at, mem_size;
> +
> + if (!p)
> + goto err;
> +
> + oldp = p;
> + mem_size = memparse(p, &p);
> + if (p == oldp)
> + goto err;
> +
> + if (*p == '@') {
> + oldp = p + 1;
> + start_at = memparse(p+1, &p);
> + if (p == oldp || *p != '\0')
> + goto err;
> +
> + movablecore_map_add_region(start_at, mem_size);
> + return 0;
> + }
> +err:
> + return -EINVAL;
> +}
> +early_param("movablecore_map", cmdline_parse_movablecore_map);
> +
> #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
>
> /**
>
Hi Tang,
2012/11/21 14:58, Tang Chen wrote:
> Hi Ishimatsu-san,
>
> Thanks for the comments.
>
> And I also found the some algorithm problems in patch2 ~ patch3.
> I am working on it, and a v2 patchset is coming soon. :)
O.K.
I'm waiting for the new patchset.
Thanks,
Yasuaki Ishimatsu
>
> Thanks.
>
> On 11/21/2012 01:46 PM, Yasuaki Ishimatsu wrote:
>> Hi Tang,
>>
>> 2012/11/19 23:27, Tang Chen wrote:
>>> From: Yasuaki Ishimatsu<[email protected]>
>>>
>>> If system can create movable node which all memory of the
>>> node is allocated as ZONE_MOVABLE, setup_node_data() cannot
>>> allocate memory for the node's pg_data_t.
>>> So when memblock_alloc_nid() fails, setup_node_data() retries
>>> memblock_alloc().
>>>
>>> Signed-off-by: Yasuaki Ishimatsu<[email protected]>
>>> Signed-off-by: Lai Jiangshan<[email protected]>
>>> Signed-off-by: Tang Chen<[email protected]>
>>> Reviewed-by: Wen Congyang<[email protected]>
>>> Tested-by: Lin Feng<[email protected]>
>>> ---
>>> arch/x86/mm/numa.c | 9 +++++++--
>>> 1 files changed, 7 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>>> index 2d125be..ae2e76e 100644
>>> --- a/arch/x86/mm/numa.c
>>> +++ b/arch/x86/mm/numa.c
>>> @@ -224,9 +224,14 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
>>> } else {
>>> nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
>>> if (!nd_pa) {
>>> - pr_err("Cannot find %zu bytes in node %d\n",
>>
>>> + printk(KERN_WARNING "Cannot find %zu bytes in node %d\n",
>>> nd_size, nid)
>>
>> Please change to use pr_warn().
>>
>> Thanks,
>> Yasuaki Ishimatsu
>>
>>> - return;
>>> + nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);
>>> + if (!nd_pa) {
>>> + pr_err("Cannot find %zu bytes in other node\n",
>>> + nd_size);
>>> + return;
>>> + }
>>> }
>>> nd = __va(nd_pa);
>>> }
>>>
>>
>>
>>
>