2018-07-02 02:16:31

by Pavel Tatashin

[permalink] [raw]
Subject: [PATCH v3 0/2] sparse_init rewrite

Changelog:
v3 - v2
- Fixed two issues found by Baoquan He
v1 - v2
- Addressed comments from Oscar Salvador

In sparse_init() we allocate two large buffers to temporarily hold usemap and
memmap for the whole machine. However, we can avoid doing that if we
change sparse_init() to operate on a per-node basis instead of doing it for
the whole machine beforehand.

As shown by Baoquan
http://lkml.kernel.org/r/[email protected]

The buffers are large enough to cause the machine to fail to boot on small
memory systems.

These patches should be applied on top of Baoquan's work, as
CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER is removed in that work.

For the ease of review, I split this work so the first patch only adds new
interfaces, the second patch enables them, and removes the old ones.

Pavel Tatashin (2):
mm/sparse: add sparse_init_nid()
mm/sparse: start using sparse_init_nid(), and remove old code

include/linux/mm.h | 9 +-
mm/sparse-vmemmap.c | 44 ++++---
mm/sparse.c | 279 +++++++++++++++-----------------------------
3 files changed, 125 insertions(+), 207 deletions(-)

--
2.18.0



2018-07-02 02:14:56

by Pavel Tatashin

[permalink] [raw]
Subject: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code

Change sparse_init() to only find the pnum ranges that belong to a specific
node and call sparse_init_nid() for that range from sparse_init().

Delete all the code that became obsolete with this change.

Signed-off-by: Pavel Tatashin <[email protected]>
---
include/linux/mm.h | 5 -
mm/sparse-vmemmap.c | 39 --------
mm/sparse.c | 220 ++++----------------------------------------
3 files changed, 17 insertions(+), 247 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 85530fdfb1f2..a7438be90658 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2646,11 +2646,6 @@ extern int randomize_va_space;
const char * arch_vma_name(struct vm_area_struct *vma);
void print_vma_addr(char *prefix, unsigned long rip);

-void sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count,
- int nodeid);
struct page * sparse_populate_node(unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b3e325962306..4edc877cfe82 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -273,45 +273,6 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
return map;
}

-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
- void *vmemmap_buf_start;
- int nr_consumed_maps = 0;
-
- size = ALIGN(size, PMD_SIZE);
- vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
- PMD_SIZE, __pa(MAX_DMA_ADDRESS));
-
- if (vmemmap_buf_start) {
- vmemmap_buf = vmemmap_buf_start;
- vmemmap_buf_end = vmemmap_buf_start + size * map_count;
- }
-
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
-
- map_map[nr_consumed_maps] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[nr_consumed_maps++])
- continue;
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- }
-
- if (vmemmap_buf_start) {
- /* need to free left buf */
- memblock_free_early(__pa(vmemmap_buf),
- vmemmap_buf_end - vmemmap_buf);
- vmemmap_buf = NULL;
- vmemmap_buf_end = NULL;
- }
-}
-
struct page * __init sparse_populate_node(unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count,
diff --git a/mm/sparse.c b/mm/sparse.c
index c18d92b8ab9b..f6da0b07947b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -200,11 +200,10 @@ static inline int next_present_section_nr(int section_nr)
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))

-/*
- * Record how many memory sections are marked as present
- * during system bootup.
- */
-static int __initdata nr_present_sections;
+static inline unsigned long first_present_section_nr(void)
+{
+ return next_present_section_nr(-1);
+}

/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
@@ -235,7 +234,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_IS_ONLINE;
section_mark_present(ms);
- nr_present_sections++;
}
}
}
@@ -377,34 +375,6 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

-static void __init sparse_early_usemaps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long usemap_count, int nodeid)
-{
- void *usemap;
- unsigned long pnum;
- unsigned long **usemap_map = (unsigned long **)data;
- int size = usemap_size();
- int nr_consumed_maps = 0;
-
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
- size * usemap_count);
- if (!usemap) {
- pr_warn("%s: allocation failed\n", __func__);
- return;
- }
-
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[nr_consumed_maps] = usemap;
- usemap += size;
- check_usemap_section_nr(nodeid, usemap_map[nr_consumed_maps]);
- nr_consumed_maps++;
- }
-}
-
#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap)
@@ -418,44 +388,6 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- void *map;
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
- int nr_consumed_maps;
-
- size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid_raw(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
- if (map) {
- nr_consumed_maps = 0;
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- map_map[nr_consumed_maps] = map;
- map += size;
- nr_consumed_maps++;
- }
- return;
- }
-
- /* fallback */
- nr_consumed_maps = 0;
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- map_map[nr_consumed_maps] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[nr_consumed_maps++])
- continue;
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- }
-}

static unsigned long section_map_size(void)
{
@@ -495,73 +427,15 @@ struct page * __init sparse_populate_node_section(struct page *map_base,
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

-static void __init sparse_early_mem_maps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- struct page **map_map = (struct page **)data;
- sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
- map_count, nodeid);
-}
-
void __weak __meminit vmemmap_populate_print_last(void)
{
}

-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
- * @unit_size: size of map unit
- */
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
- (void *, unsigned long, unsigned long,
- unsigned long, int), void *data,
- int data_unit_size)
-{
- unsigned long pnum;
- unsigned long map_count;
- int nodeid_begin = 0;
- unsigned long pnum_begin = 0;
-
- for_each_present_section_nr(0, pnum) {
- struct mem_section *ms;
-
- ms = __nr_to_section(pnum);
- nodeid_begin = sparse_early_nid(ms);
- pnum_begin = pnum;
- break;
- }
- map_count = 1;
- for_each_present_section_nr(pnum_begin + 1, pnum) {
- struct mem_section *ms;
- int nodeid;
-
- ms = __nr_to_section(pnum);
- nodeid = sparse_early_nid(ms);
- if (nodeid == nodeid_begin) {
- map_count++;
- continue;
- }
- /* ok, we need to take cake of from pnum_begin to pnum - 1*/
- alloc_func(data, pnum_begin, pnum,
- map_count, nodeid_begin);
- /* new start, update count etc*/
- nodeid_begin = nodeid;
- pnum_begin = pnum;
- data += map_count * data_unit_size;
- map_count = 1;
- }
- /* ok, last chunk */
- alloc_func(data, pnum_begin, __highest_present_section_nr+1,
- map_count, nodeid_begin);
-}
-
/*
* Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
* And number of present sections in this node is map_count.
*/
-void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
@@ -617,87 +491,27 @@ void __init sparse_init_nid(int nid, unsigned long pnum_begin,
*/
void __init sparse_init(void)
{
- unsigned long pnum;
- struct page *map;
- struct page **map_map;
- unsigned long *usemap;
- unsigned long **usemap_map;
- int size, size2;
- int nr_consumed_maps = 0;
-
- /* see include/linux/mmzone.h 'struct mem_section' definition */
- BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+ unsigned long pnum_begin = first_present_section_nr();
+ int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+ unsigned long pnum_end, map_count = 1;

/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();

- /*
- * map is using big page (aka 2M in x86 64 bit)
- * usemap is less one page (aka 24 bytes)
- * so alloc 2M (with 2M align) and 24 bytes in turn will
- * make next 2M slip to one more 2M later.
- * then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continuously.
- *
- * powerpc need to call sparse_init_one_section right after each
- * sparse_early_mem_map_alloc, so allocate usemap_map at first.
- */
- size = sizeof(unsigned long *) * nr_present_sections;
- usemap_map = memblock_virt_alloc(size, 0);
- if (!usemap_map)
- panic("can not allocate usemap_map\n");
- alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map,
- sizeof(usemap_map[0]));
-
- size2 = sizeof(struct page *) * nr_present_sections;
- map_map = memblock_virt_alloc(size2, 0);
- if (!map_map)
- panic("can not allocate map_map\n");
- alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map,
- sizeof(map_map[0]));
-
- /* The numner of present sections stored in nr_present_sections
- * are kept the same since mem sections are marked as present in
- * memory_present(). In this for loop, we need check which sections
- * failed to allocate memmap or usemap, then clear its
- * ->section_mem_map accordingly. During this process, we need
- * increase 'nr_consumed_maps' whether its allocation of memmap
- * or usemap failed or not, so that after we handle the i-th
- * memory section, can get memmap and usemap of (i+1)-th section
- * correctly. */
- for_each_present_section_nr(0, pnum) {
- struct mem_section *ms;
-
- if (nr_consumed_maps >= nr_present_sections) {
- pr_err("nr_consumed_maps goes beyond nr_present_sections\n");
- break;
- }
- ms = __nr_to_section(pnum);
- usemap = usemap_map[nr_consumed_maps];
- if (!usemap) {
- ms->section_mem_map = 0;
- nr_consumed_maps++;
- continue;
- }
+ for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+ int nid = sparse_early_nid(__nr_to_section(pnum_end));

- map = map_map[nr_consumed_maps];
- if (!map) {
- ms->section_mem_map = 0;
- nr_consumed_maps++;
+ if (nid == nid_begin) {
+ map_count++;
continue;
}
-
- sparse_init_one_section(__nr_to_section(pnum), pnum, map,
- usemap);
- nr_consumed_maps++;
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ nid_begin = nid;
+ pnum_begin = pnum_end;
+ map_count = 1;
}
-
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
vmemmap_populate_print_last();
-
- memblock_free_early(__pa(map_map), size2);
- memblock_free_early(__pa(usemap_map), size);
}

#ifdef CONFIG_MEMORY_HOTPLUG
--
2.18.0


2018-07-02 02:15:10

by Pavel Tatashin

[permalink] [raw]
Subject: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

sparse_init() requires temporarily allocating two large buffers:
usemap_map and map_map. Baoquan He has identified that these buffers are so
large that Linux is not bootable on small memory machines, such as a kdump
boot. The buffers are especially large when CONFIG_X86_5LEVEL is set, as
they are scaled to the maximum physical memory size.

Baoquan provided a fix, which reduces the sizes of these buffers, but it
is much better to get rid of them entirely.

Add a new way to initialize sparse memory: sparse_init_nid(), which only
operates within one memory node, and thus allocates memory either in large
contiguous block or allocates section by section. This eliminates the need
for use of temporary buffers.

For simplified bisecting and review, the new interface is going to be
enabled as well as old code removed in the next patch.

Signed-off-by: Pavel Tatashin <[email protected]>
Reviewed-by: Oscar Salvador <[email protected]>
---
include/linux/mm.h | 8 ++++
mm/sparse-vmemmap.c | 49 ++++++++++++++++++++++++
mm/sparse.c | 91 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 148 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0fbb9ffe380..85530fdfb1f2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2651,6 +2651,14 @@ void sparse_mem_maps_populate_node(struct page **map_map,
unsigned long pnum_end,
unsigned long map_count,
int nodeid);
+struct page * sparse_populate_node(unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nid);
+struct page * sparse_populate_node_section(struct page *map_base,
+ unsigned long map_index,
+ unsigned long pnum,
+ int nid);

struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index e1a54ba411ec..b3e325962306 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -311,3 +311,52 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
vmemmap_buf_end = NULL;
}
}
+
+struct page * __init sparse_populate_node(unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nid)
+{
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+ unsigned long pnum, map_index = 0;
+ void *vmemmap_buf_start;
+
+ size = ALIGN(size, PMD_SIZE) * map_count;
+ vmemmap_buf_start = __earlyonly_bootmem_alloc(nid, size,
+ PMD_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (vmemmap_buf_start) {
+ vmemmap_buf = vmemmap_buf_start;
+ vmemmap_buf_end = vmemmap_buf_start + size;
+ }
+
+ for (pnum = pnum_begin; map_index < map_count; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ if (!sparse_mem_map_populate(pnum, nid, NULL))
+ break;
+ map_index++;
+ BUG_ON(pnum >= pnum_end);
+ }
+
+ if (vmemmap_buf_start) {
+ /* need to free left buf */
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
+ vmemmap_buf = NULL;
+ vmemmap_buf_end = NULL;
+ }
+ return pfn_to_page(section_nr_to_pfn(pnum_begin));
+}
+
+/*
+ * Return map for pnum section. sparse_populate_node() has populated memory map
+ * in this node, we simply do pnum to struct page conversion.
+ */
+struct page * __init sparse_populate_node_section(struct page *map_base,
+ unsigned long map_index,
+ unsigned long pnum,
+ int nid)
+{
+ return pfn_to_page(section_nr_to_pfn(pnum));
+}
diff --git a/mm/sparse.c b/mm/sparse.c
index d18e2697a781..c18d92b8ab9b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -456,6 +456,43 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
__func__);
}
}
+
+static unsigned long section_map_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+}
+
+/*
+ * Try to allocate all struct pages for this node, if this fails, we will
+ * be allocating one section at a time in sparse_populate_node_section().
+ */
+struct page * __init sparse_populate_node(unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nid)
+{
+ return memblock_virt_alloc_try_nid_raw(section_map_size() * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+/*
+ * Return map for pnum section. map_base is not NULL if we could allocate map
+ * for this node together. Otherwise we allocate one section at a time.
+ * map_index is the index of pnum in this node counting only present sections.
+ */
+struct page * __init sparse_populate_node_section(struct page *map_base,
+ unsigned long map_index,
+ unsigned long pnum,
+ int nid)
+{
+ if (map_base) {
+ unsigned long offset = section_map_size() * map_index;
+
+ return (struct page *)((char *)map_base + offset);
+ }
+ return sparse_mem_map_populate(pnum, nid, NULL);
+}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void __init sparse_early_mem_maps_alloc_node(void *data,
@@ -520,6 +557,60 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
map_count, nodeid_begin);
}

+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
+ * And number of present sections in this node is map_count.
+ */
+void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count)
+{
+ unsigned long pnum, usemap_longs, *usemap, map_index;
+ struct page *map, *map_base;
+
+ usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ usemap_size() *
+ map_count);
+ if (!usemap) {
+ pr_err("%s: usemap allocation failed", __func__);
+ goto failed;
+ }
+ map_base = sparse_populate_node(pnum_begin, pnum_end,
+ map_count, nid);
+ map_index = 0;
+ for_each_present_section_nr(pnum_begin, pnum) {
+ if (pnum >= pnum_end)
+ break;
+
+ BUG_ON(map_index == map_count);
+ map = sparse_populate_node_section(map_base, map_index,
+ pnum, nid);
+ if (!map) {
+ pr_err("%s: memory map backing failed. Some memory will not be available.",
+ __func__);
+ pnum_begin = pnum;
+ goto failed;
+ }
+ check_usemap_section_nr(nid, usemap);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ usemap);
+ map_index++;
+ usemap += usemap_longs;
+ }
+ return;
+failed:
+ /* We failed to allocate, mark all the following pnums as not present */
+ for_each_present_section_nr(pnum_begin, pnum) {
+ struct mem_section *ms;
+
+ if (pnum >= pnum_end)
+ break;
+ ms = __nr_to_section(pnum);
+ ms->section_mem_map = 0;
+ }
+}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
--
2.18.0


2018-07-02 02:17:46

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

Hi Pavel,

Thanks for your quick fix. You might have missed another comment to v2
patch 1/2 which is at the bottom.

On 07/01/18 at 10:04pm, Pavel Tatashin wrote:
> +/*
> + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
> + * And number of present sections in this node is map_count.
> + */
> +void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count)
> +{
> + unsigned long pnum, usemap_longs, *usemap, map_index;
> + struct page *map, *map_base;
> +
> + usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
> + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
> + usemap_size() *
> + map_count);
> + if (!usemap) {
> + pr_err("%s: usemap allocation failed", __func__);
> + goto failed;
> + }
> + map_base = sparse_populate_node(pnum_begin, pnum_end,
> + map_count, nid);
> + map_index = 0;
> + for_each_present_section_nr(pnum_begin, pnum) {
> + if (pnum >= pnum_end)
> + break;
> +
> + BUG_ON(map_index == map_count);
> + map = sparse_populate_node_section(map_base, map_index,
> + pnum, nid);
> + if (!map) {

Here, I think it might be not right to jump to 'failed' directly if one
section of the node failed to populate memmap. I think the original code
is only skipping the section which memmap failed to populate by marking
it as not present with "ms->section_mem_map = 0".

> + pr_err("%s: memory map backing failed. Some memory will not be available.",
> + __func__);
> + pnum_begin = pnum;
> + goto failed;
> + }
> + check_usemap_section_nr(nid, usemap);
> + sparse_init_one_section(__nr_to_section(pnum), pnum, map,
> + usemap);
> + map_index++;
> + usemap += usemap_longs;
> + }
> + return;
> +failed:
> + /* We failed to allocate, mark all the following pnums as not present */
> + for_each_present_section_nr(pnum_begin, pnum) {
> + struct mem_section *ms;
> +
> + if (pnum >= pnum_end)
> + break;
> + ms = __nr_to_section(pnum);
> + ms->section_mem_map = 0;
> + }
> +}
> +
> /*
> * Allocate the accumulated non-linear sections, allocate a mem_map
> * for each and record the physical to section mapping.
> --
> 2.18.0
>

2018-07-02 02:24:48

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

> Here, I think it might be not right to jump to 'failed' directly if one
> section of the node failed to populate memmap. I think the original code
> is only skipping the section which memmap failed to populate by marking
> it as not present with "ms->section_mem_map = 0".
>

Hi Baoquan,

Thank you for a careful review. This is an intended change compared to
the original code. Because we operate per-node now, if we fail to
allocate a single section, in this node, it means we also will fail to
allocate all the consequent sections in the same node and no need to
check them anymore. In the original code we could not simply bailout,
because we still might have valid entries in the following nodes.
Similarly, sparse_init() will call sparse_init_nid() for the next node
even if previous node failed to setup all the memory.

Pavel

2018-07-02 02:50:01

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/01/18 at 10:18pm, Pavel Tatashin wrote:
> > Here, I think it might be not right to jump to 'failed' directly if one
> > section of the node failed to populate memmap. I think the original code
> > is only skipping the section which memmap failed to populate by marking
> > it as not present with "ms->section_mem_map = 0".
> >
>
> Hi Baoquan,
>
> Thank you for a careful review. This is an intended change compared to
> the original code. Because we operate per-node now, if we fail to
> allocate a single section, in this node, it means we also will fail to
> allocate all the consequent sections in the same node and no need to
> check them anymore. In the original code we could not simply bailout,
> because we still might have valid entries in the following nodes.
> Similarly, sparse_init() will call sparse_init_nid() for the next node
> even if previous node failed to setup all the memory.

Hmm, say the node we are handling is node5, and there are 100 sections.
If you allocate memmap for section at one time, you have succeeded to
handle for the first 99 sections, now the 100th failed, so you will mark
all sections on node5 as not present. And the allocation failure is only
for single section memmap allocation case.

Please think about the vmemmap case, it will map the struct page pages
to vmemmap, and will populate page tables for them to map. That is a
long walk, not only memmory allocation, and page table checking and
populating, one section failing to populate memmap doesn't mean all the
consequent sections also failed. I think the original code is
reasonable.

Thanks
Baoquan

2018-07-02 03:30:23

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On Sun, Jul 1, 2018 at 10:31 PM Baoquan He <[email protected]> wrote:
>
> On 07/01/18 at 10:18pm, Pavel Tatashin wrote:
> > > Here, I think it might be not right to jump to 'failed' directly if one
> > > section of the node failed to populate memmap. I think the original code
> > > is only skipping the section which memmap failed to populate by marking
> > > it as not present with "ms->section_mem_map = 0".
> > >
> >
> > Hi Baoquan,
> >
> > Thank you for a careful review. This is an intended change compared to
> > the original code. Because we operate per-node now, if we fail to
> > allocate a single section, in this node, it means we also will fail to
> > allocate all the consequent sections in the same node and no need to
> > check them anymore. In the original code we could not simply bailout,
> > because we still might have valid entries in the following nodes.
> > Similarly, sparse_init() will call sparse_init_nid() for the next node
> > even if previous node failed to setup all the memory.
>
> Hmm, say the node we are handling is node5, and there are 100 sections.
> If you allocate memmap for section at one time, you have succeeded to
> handle for the first 99 sections, now the 100th failed, so you will mark
> all sections on node5 as not present. And the allocation failure is only
> for single section memmap allocation case.

No, unless I am missing something, that's not how code works:

463 if (!map) {
464 pr_err("%s: memory map backing failed.
Some memory will not be available.",
465 __func__);
466 pnum_begin = pnum;
467 goto failed;
468 }

476 failed:
477 /* We failed to allocate, mark all the following pnums as
not present */
478 for_each_present_section_nr(pnum_begin, pnum) {

We continue from the pnum that failed as we set pnum_begin to pnum,
and mark all the consequent sections as not-present.

The only change compared to the original code is that once we found an
empty pnum we stop checking the consequent pnums in this node, as we
know they are empty as well, because there is no more memory in this
node to allocate from.

Pavel

2018-07-02 03:51:41

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/01/18 at 10:04pm, Pavel Tatashin wrote:
> +/*
> + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
> + * And number of present sections in this node is map_count.
> + */
> +void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count)
> +{
> + unsigned long pnum, usemap_longs, *usemap, map_index;
> + struct page *map, *map_base;
> +
> + usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
> + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
> + usemap_size() *
> + map_count);
> + if (!usemap) {
> + pr_err("%s: usemap allocation failed", __func__);

Wondering if we can provide more useful information for better debugging
if failed. E.g here tell on what nid the usemap allocation failed.

> + goto failed;
> + }
> + map_base = sparse_populate_node(pnum_begin, pnum_end,
> + map_count, nid);
> + map_index = 0;
> + for_each_present_section_nr(pnum_begin, pnum) {
> + if (pnum >= pnum_end)
> + break;
> +
> + BUG_ON(map_index == map_count);
> + map = sparse_populate_node_section(map_base, map_index,
> + pnum, nid);
> + if (!map) {
> + pr_err("%s: memory map backing failed. Some memory will not be available.",
> + __func__);
And here tell nid and the memory section nr failed.

> + pnum_begin = pnum;
> + goto failed;
> + }
> + check_usemap_section_nr(nid, usemap);
> + sparse_init_one_section(__nr_to_section(pnum), pnum, map,
> + usemap);
> + map_index++;
> + usemap += usemap_longs;
> + }
> + return;
> +failed:
> + /* We failed to allocate, mark all the following pnums as not present */
> + for_each_present_section_nr(pnum_begin, pnum) {
> + struct mem_section *ms;
> +
> + if (pnum >= pnum_end)
> + break;
> + ms = __nr_to_section(pnum);
> + ms->section_mem_map = 0;
> + }
> +}
> +
> /*
> * Allocate the accumulated non-linear sections, allocate a mem_map
> * for each and record the physical to section mapping.
> --
> 2.18.0
>

2018-07-02 03:51:41

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/01/18 at 10:43pm, Pavel Tatashin wrote:
> On Sun, Jul 1, 2018 at 10:31 PM Baoquan He <[email protected]> wrote:
> >
> > On 07/01/18 at 10:18pm, Pavel Tatashin wrote:
> > > > Here, I think it might be not right to jump to 'failed' directly if one
> > > > section of the node failed to populate memmap. I think the original code
> > > > is only skipping the section which memmap failed to populate by marking
> > > > it as not present with "ms->section_mem_map = 0".
> > > >
> > >
> > > Hi Baoquan,
> > >
> > > Thank you for a careful review. This is an intended change compared to
> > > the original code. Because we operate per-node now, if we fail to
> > > allocate a single section, in this node, it means we also will fail to
> > > allocate all the consequent sections in the same node and no need to
> > > check them anymore. In the original code we could not simply bailout,
> > > because we still might have valid entries in the following nodes.
> > > Similarly, sparse_init() will call sparse_init_nid() for the next node
> > > even if previous node failed to setup all the memory.
> >
> > Hmm, say the node we are handling is node5, and there are 100 sections.
> > If you allocate memmap for section at one time, you have succeeded to
> > handle for the first 99 sections, now the 100th failed, so you will mark
> > all sections on node5 as not present. And the allocation failure is only
> > for single section memmap allocation case.
>
> No, unless I am missing something, that's not how code works:
>
> 463 if (!map) {
> 464 pr_err("%s: memory map backing failed.
> Some memory will not be available.",
> 465 __func__);
> 466 pnum_begin = pnum;
> 467 goto failed;
> 468 }
>
> 476 failed:
> 477 /* We failed to allocate, mark all the following pnums as
> not present */
> 478 for_each_present_section_nr(pnum_begin, pnum) {
>
> We continue from the pnum that failed as we set pnum_begin to pnum,
> and mark all the consequent sections as not-present.

Ah, yes, I misunderstood it, sorry for that.

Then I have only one concern, for vmemmap case, if one section doesn't
succeed to populate its memmap, do we need to skip all the remaining
sections in that node?

>
> The only change compared to the original code is that once we found an
> empty pnum we stop checking the consequent pnums in this node, as we
> know they are empty as well, because there is no more memory in this
> node to allocate from.
>

2018-07-02 03:53:59

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

> > + if (!usemap) {
> > + pr_err("%s: usemap allocation failed", __func__);
>
> Wondering if we can provide more useful information for better debugging
> if failed. E.g here tell on what nid the usemap allocation failed.
>
> > + pnum, nid);
> > + if (!map) {
> > + pr_err("%s: memory map backing failed. Some memory will not be available.",
> > + __func__);
> And here tell nid and the memory section nr failed.

Sure, I will wait for more comments, if any, and add more info to the
error messages in the next revision.

Thank you,
Pavel

2018-07-02 03:54:00

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

> Ah, yes, I misunderstood it, sorry for that.
>
> Then I have only one concern, for vmemmap case, if one section doesn't
> succeed to populate its memmap, do we need to skip all the remaining
> sections in that node?

Yes, in sparse_populate_node() we have the following:

294 for (pnum = pnum_begin; map_index < map_count; pnum++) {
295 if (!present_section_nr(pnum))
296 continue;
297 if (!sparse_mem_map_populate(pnum, nid, NULL))
298 break;

So, on the first failure, we even stop trying to populate other
sections. No more memory to do so.

Pavel

2018-07-02 03:54:53

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

> > So, on the first failure, we even stop trying to populate other
> > sections. No more memory to do so.
>
> This is the thing I worry about. In old sparse_mem_maps_populate_node()
> you can see, when not present or failed to populate, just continue. This
> is the main difference between yours and the old code. The key logic is
> changed here.
>

I do not see how we can succeed after the first failure. We still
allocate from the same node:

sparse_mem_map_populate() may fail only if we could not allocate large
enough buffer vmemmap_buf_start earlier.

This means that in:
sparse_mem_map_populate()
vmemmap_populate()
vmemmap_populate_hugepages()
vmemmap_alloc_block_buf() (no buffer, so call allocator)
vmemmap_alloc_block(size, node);
__earlyonly_bootmem_alloc(node, size, size, __pa(MAX_DMA_ADDRESS));
memblock_virt_alloc_try_nid_raw() -> Nothing changes for
this call to succeed. So, all consequent calls to
sparse_mem_map_populate() in this node will fail as well.

> >
> Forgot mentioning it's the vervion in mm/sparse-vmemmap.c

Sorry, I do not understand what is vervion.

Thank you,
Pavel

2018-07-02 03:55:12

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/01/18 at 11:03pm, Pavel Tatashin wrote:
> > Ah, yes, I misunderstood it, sorry for that.
> >
> > Then I have only one concern, for vmemmap case, if one section doesn't
> > succeed to populate its memmap, do we need to skip all the remaining
> > sections in that node?
>
> Yes, in sparse_populate_node() we have the following:
>
> 294 for (pnum = pnum_begin; map_index < map_count; pnum++) {
> 295 if (!present_section_nr(pnum))
> 296 continue;
> 297 if (!sparse_mem_map_populate(pnum, nid, NULL))
> 298 break;
>
> So, on the first failure, we even stop trying to populate other
> sections. No more memory to do so.

This is the thing I worry about. In old sparse_mem_maps_populate_node()
you can see, when not present or failed to populate, just continue. This
is the main difference between yours and the old code. The key logic is
changed here.

void __init sparse_mem_maps_populate_node(struct page **map_map,
unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count, int nodeid)
{
...
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
struct mem_section *ms;

if (!present_section_nr(pnum))
continue;

map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
__func__);
ms->section_mem_map = 0;
}
...
}


2018-07-02 03:56:10

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/01/18 at 11:28pm, Pavel Tatashin wrote:
> > > So, on the first failure, we even stop trying to populate other
> > > sections. No more memory to do so.
> >
> > This is the thing I worry about. In old sparse_mem_maps_populate_node()
> > you can see, when not present or failed to populate, just continue. This
> > is the main difference between yours and the old code. The key logic is
> > changed here.
> >
>
> I do not see how we can succeed after the first failure. We still
> allocate from the same node:
>
> sparse_mem_map_populate() may fail only if we could not allocate large
> enough buffer vmemmap_buf_start earlier.
>
> This means that in:
> sparse_mem_map_populate()
> vmemmap_populate()
> vmemmap_populate_hugepages()
> vmemmap_alloc_block_buf() (no buffer, so call allocator)
> vmemmap_alloc_block(size, node);
> __earlyonly_bootmem_alloc(node, size, size, __pa(MAX_DMA_ADDRESS));
> memblock_virt_alloc_try_nid_raw() -> Nothing changes for
> this call to succeed. So, all subsequent calls to
> sparse_mem_map_populate() in this node will fail as well.

Yes, you are right, it's improvement. Thanks.

>
> > >
> > Forgot mentioning it's the vervion in mm/sparse-vmemmap.c
>
> Sorry, I do not understand what is vervion.

Typo, 'version', should be. Sorry for that.


2018-07-02 04:42:39

by Baoquan He

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/02/18 at 11:14am, Baoquan He wrote:
> On 07/01/18 at 11:03pm, Pavel Tatashin wrote:
> > > Ah, yes, I misunderstood it, sorry for that.
> > >
> > > Then I have only one concern, for vmemmap case, if one section doesn't
> > > succeed to populate its memmap, do we need to skip all the remaining
> > > sections in that node?
> >
> > Yes, in sparse_populate_node() we have the following:
> >
> > 294 for (pnum = pnum_begin; map_index < map_count; pnum++) {
> > 295 if (!present_section_nr(pnum))
> > 296 continue;
> > 297 if (!sparse_mem_map_populate(pnum, nid, NULL))
> > 298 break;
> >
> > So, on the first failure, we even stop trying to populate other
> > sections. No more memory to do so.
>
> This is the thing I worry about. In old sparse_mem_maps_populate_node()
> you can see, when not present or failed to populate, just continue. This
> is the main difference between yours and the old code. The key logic is
> changed here.
>
Forgot mentioning it's the vervion in mm/sparse-vmemmap.c

> void __init sparse_mem_maps_populate_node(struct page **map_map,
> unsigned long pnum_begin,
> unsigned long pnum_end,
> unsigned long map_count, int nodeid)
> {
> ...
> for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
> struct mem_section *ms;
>
> if (!present_section_nr(pnum))
> continue;
>
> map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
> if (map_map[pnum])
> continue;
> ms = __nr_to_section(pnum);
> pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
> __func__);
> ms->section_mem_map = 0;
> }
> ...
> }
>

2018-07-02 14:06:06

by Oscar Salvador

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code

On Sun, Jul 01, 2018 at 10:04:17PM -0400, Pavel Tatashin wrote:
> Change sparse_init() to only find the pnum ranges that belong to a specific
> node and call sparse_init_nid() for that range from sparse_init().
>
> Delete all the code that became obsolete with this change.
>
> Signed-off-by: Pavel Tatashin <[email protected]>

This looks like an improvement to me.
It also makes the code much easier to follow and to understand.

Reviewed-by: Oscar Salvador <[email protected]>

--
Oscar Salvador
SUSE L3

2018-07-02 16:22:24

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v3 0/2] sparse_init rewrite

On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
> include/linux/mm.h | 9 +-
> mm/sparse-vmemmap.c | 44 ++++---
> mm/sparse.c | 279 +++++++++++++++-----------------------------
> 3 files changed, 125 insertions(+), 207 deletions(-)

FWIW, I'm not a fan of rewrites, but this is an awful lot of code to remove.

I assume from all the back-and-forth, you have another version
forthcoming. I'll take a close look through that one.

2018-07-02 17:48:41

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 0/2] sparse_init rewrite

On Mon, Jul 2, 2018 at 12:20 PM Dave Hansen <[email protected]> wrote:
>
> On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
> > include/linux/mm.h | 9 +-
> > mm/sparse-vmemmap.c | 44 ++++---
> > mm/sparse.c | 279 +++++++++++++++-----------------------------
> > 3 files changed, 125 insertions(+), 207 deletions(-)
>
> FWIW, I'm not a fan of rewrites, but this is an awful lot of code to remove.
>
> I assume from all the back-and-forth, you have another version
> forthcoming. I'll take a close look through that one.

The removed code is a benefit, but once you review it, you will see
that it was necessary to re-write in order to get rid of the temporary
buffers. Please review the current version. The only change that is
going to be in the next version is added "nid" to pr_err() in
sparse_init_nid() for more detailed error.

Thank you,
Pavel

2018-07-02 19:49:25

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code

On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
> + for_each_present_section_nr(pnum_begin + 1, pnum_end) {
> + int nid = sparse_early_nid(__nr_to_section(pnum_end));
>
> + if (nid == nid_begin) {
> + map_count++;
> continue;
> }

> + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
> + nid_begin = nid;
> + pnum_begin = pnum_end;
> + map_count = 1;
> }

Ugh, this is really hard to read. Especially because the pnum "counter"
is called "pnum_end".

So, this is basically a loop that collects all of the adjacent sections
in a given single nid and then calls sparse_init_nid(). pnum_end in
this case is non-inclusive, so the sparse_init_nid() call is actually
for the *previous* nid that pnum_end is pointing _past_.

This *really* needs commenting.

2018-07-02 19:55:53

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code



On 07/02/2018 03:47 PM, Dave Hansen wrote:
> On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
>> + for_each_present_section_nr(pnum_begin + 1, pnum_end) {
>> + int nid = sparse_early_nid(__nr_to_section(pnum_end));
>>
>> + if (nid == nid_begin) {
>> + map_count++;
>> continue;
>> }
>
>> + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
>> + nid_begin = nid;
>> + pnum_begin = pnum_end;
>> + map_count = 1;
>> }
>
> Ugh, this is really hard to read. Especially because the pnum "counter"
> is called "pnum_end".

I called it pnum_end, because that is what is passed to sparse_init_nid(), but I see your point, and I can rename pnum_end to simply pnum if that will make things look better.

>
> So, this is basically a loop that collects all of the adjacent sections
> in a given single nid and then calls sparse_init_nid(). pnum_end in
> this case is non-inclusive, so the sparse_init_nid() call is actually
> for the *previous* nid that pnum_end is pointing _past_.
>
> This *really* needs commenting.

There is a comment before sparse_init_nid() about inclusiveness:

434 /*
435 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
436 * And number of present sections in this node is map_count.
437 */
438 static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
439 unsigned long pnum_end,
440 unsigned long map_count)


Thank you,
Pavel

2018-07-02 20:01:04

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

> @@ -2651,6 +2651,14 @@ void sparse_mem_maps_populate_node(struct page **map_map,
> unsigned long pnum_end,
> unsigned long map_count,
> int nodeid);
> +struct page * sparse_populate_node(unsigned long pnum_begin,

CodingStyle: put the "*" next to the function name, no space, please.

> + unsigned long pnum_end,
> + unsigned long map_count,
> + int nid);
> +struct page * sparse_populate_node_section(struct page *map_base,
> + unsigned long map_index,
> + unsigned long pnum,
> + int nid);

These two functions are named in very similar ways. Do they do similar
things?

> struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
> struct vmem_altmap *altmap);
> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index e1a54ba411ec..b3e325962306 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -311,3 +311,52 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
> vmemmap_buf_end = NULL;
> }
> }
> +
> +struct page * __init sparse_populate_node(unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count,
> + int nid)
> +{

Could you comment what the function is doing, please?

> + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
> + unsigned long pnum, map_index = 0;
> + void *vmemmap_buf_start;
> +
> + size = ALIGN(size, PMD_SIZE) * map_count;
> + vmemmap_buf_start = __earlyonly_bootmem_alloc(nid, size,
> + PMD_SIZE,
> + __pa(MAX_DMA_ADDRESS));

Let's not repeat the mistakes of the previous version of the code.
Please explain why we are aligning this. Also,
__earlyonly_bootmem_alloc()->memblock_virt_alloc_try_nid_raw() claims to
be aligning the size. Do we also need to do it here?

Yes, I know the old code did this, but this is the cost of doing a
rewrite. :)

> + if (vmemmap_buf_start) {
> + vmemmap_buf = vmemmap_buf_start;
> + vmemmap_buf_end = vmemmap_buf_start + size;
> + }

It would be nice to call out that these are globals that other code
picks up.

> + for (pnum = pnum_begin; map_index < map_count; pnum++) {
> + if (!present_section_nr(pnum))
> + continue;
> + if (!sparse_mem_map_populate(pnum, nid, NULL))
> + break;

^ This consumes "vmemmap_buf", right? That seems like a really nice
thing to point out here if so.

> + map_index++;
> + BUG_ON(pnum >= pnum_end);
> + }
> +
> + if (vmemmap_buf_start) {
> + /* need to free left buf */
> + memblock_free_early(__pa(vmemmap_buf),
> + vmemmap_buf_end - vmemmap_buf);
> + vmemmap_buf = NULL;
> + vmemmap_buf_end = NULL;
> + }
> + return pfn_to_page(section_nr_to_pfn(pnum_begin));
> +}
> +
> +/*
> + * Return map for pnum section. sparse_populate_node() has populated memory map
> + * in this node, we simply do pnum to struct page conversion.
> + */
> +struct page * __init sparse_populate_node_section(struct page *map_base,
> + unsigned long map_index,
> + unsigned long pnum,
> + int nid)
> +{
> + return pfn_to_page(section_nr_to_pfn(pnum));
> +}

What is up with all of the unused arguments to this function?

> diff --git a/mm/sparse.c b/mm/sparse.c
> index d18e2697a781..c18d92b8ab9b 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -456,6 +456,43 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
> __func__);
> }
> }
> +
> +static unsigned long section_map_size(void)
> +{
> + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
> +}

Seems like if we have this, we should use it wherever possible, like
sparse_populate_node().


> +/*
> + * Try to allocate all struct pages for this node, if this fails, we will
> + * be allocating one section at a time in sparse_populate_node_section().
> + */
> +struct page * __init sparse_populate_node(unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count,
> + int nid)
> +{
> + return memblock_virt_alloc_try_nid_raw(section_map_size() * map_count,
> + PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> + BOOTMEM_ALLOC_ACCESSIBLE, nid);
> +}
> +
> +/*
> + * Return map for pnum section. map_base is not NULL if we could allocate map
> + * for this node together. Otherwise we allocate one section at a time.
> + * map_index is the index of pnum in this node counting only present sections.
> + */
> +struct page * __init sparse_populate_node_section(struct page *map_base,
> + unsigned long map_index,
> + unsigned long pnum,
> + int nid)
> +{
> + if (map_base) {
> + unsigned long offset = section_map_size() * map_index;
> +
> + return (struct page *)((char *)map_base + offset);
> + }
> + return sparse_mem_map_populate(pnum, nid, NULL);

Oh, you have a vmemmap and non-vmemmap version.

BTW, can't the whole map base calculation just be replaced with:

return &map_base[PAGES_PER_SECTION * map_index];

?

2018-07-02 20:01:48

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code

On 07/02/2018 12:54 PM, Pavel Tatashin wrote:
>
>
> On 07/02/2018 03:47 PM, Dave Hansen wrote:
>> On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
>>> + for_each_present_section_nr(pnum_begin + 1, pnum_end) {
>>> + int nid = sparse_early_nid(__nr_to_section(pnum_end));
>>>
>>> + if (nid == nid_begin) {
>>> + map_count++;
>>> continue;
>>> }
>>
>>> + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
>>> + nid_begin = nid;
>>> + pnum_begin = pnum_end;
>>> + map_count = 1;
>>> }
>>
>> Ugh, this is really hard to read. Especially because the pnum "counter"
>> is called "pnum_end".
>
> I called it pnum_end, because that is what is passed to
> sparse_init_nid(), but I see your point, and I can rename pnum_end to
> simply pnum if that will make things look better.

Could you just make it a helper that takes a beginning pnum and returns
the number of consecutive sections?

>> So, this is basically a loop that collects all of the adjacent sections
>> in a given single nid and then calls sparse_init_nid(). pnum_end in
>> this case is non-inclusive, so the sparse_init_nid() call is actually
>> for the *previous* nid that pnum_end is pointing _past_.
>>
>> This *really* needs commenting.
>
> There is a comment before sparse_init_nid() about inclusiveness:
>
> 434 /*
> 435 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
> 436 * And number of present sections in this node is map_count.
> 437 */
> 438 static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> 439 unsigned long pnum_end,
> 440 unsigned long map_count)

Which I totally missed. Could you comment the code, please?


2018-07-02 20:14:06

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] mm/sparse: start using sparse_init_nid(), and remove old code

On Mon, Jul 2, 2018 at 4:00 PM Dave Hansen <[email protected]> wrote:
>
> On 07/02/2018 12:54 PM, Pavel Tatashin wrote:
> >
> >
> > On 07/02/2018 03:47 PM, Dave Hansen wrote:
> >> On 07/01/2018 07:04 PM, Pavel Tatashin wrote:
> >>> + for_each_present_section_nr(pnum_begin + 1, pnum_end) {
> >>> + int nid = sparse_early_nid(__nr_to_section(pnum_end));
> >>>
> >>> + if (nid == nid_begin) {
> >>> + map_count++;
> >>> continue;
> >>> }
> >>
> >>> + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
> >>> + nid_begin = nid;
> >>> + pnum_begin = pnum_end;
> >>> + map_count = 1;
> >>> }
> >>
> >> Ugh, this is really hard to read. Especially because the pnum "counter"
> >> is called "pnum_end".
> >
> > I called it pnum_end, because that is what is passed to
> > sparse_init_nid(), but I see your point, and I can rename pnum_end to
> > simply pnum if that will make things look better.
>
> Could you just make it a helper that takes a beginning pnum and returns
> the number of consecutive sections?

But sections do not have to be consecutive. Some nodes may have
sections that are not present. So we are looking for two values:
map_count -> which is number of present sections and node_end for the
current node i.e. the first section of the next node. So the helper
would need to return two things, and would basically repeat the same
code that is done in this function.

>
> >> So, this is basically a loop that collects all of the adjacent sections
> >> in a given single nid and then calls sparse_init_nid(). pnum_end in
> >> this case is non-inclusive, so the sparse_init_nid() call is actually
> >> for the *previous* nid that pnum_end is pointing _past_.
> >>
> >> This *really* needs commenting.
> >
> > There is a comment before sparse_init_nid() about inclusiveness:
> >
> > 434 /*
> > 435 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
> > 436 * And number of present sections in this node is map_count.
> > 437 */
> > 438 static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
> > 439 unsigned long pnum_end,
> > 440 unsigned long map_count)
>
> Which I totally missed. Could you comment the code, please?

Sure, I will add a comment into sparse_init() as well.

2018-07-02 20:31:12

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On Mon, Jul 2, 2018 at 4:00 PM Dave Hansen <[email protected]> wrote:
>
> > @@ -2651,6 +2651,14 @@ void sparse_mem_maps_populate_node(struct page **map_map,
> > unsigned long pnum_end,
> > unsigned long map_count,
> > int nodeid);
> > +struct page * sparse_populate_node(unsigned long pnum_begin,
>
> CodingStyle: put the "*" next to the function name, no space, please.

OK

>
> > + unsigned long pnum_end,
> > + unsigned long map_count,
> > + int nid);
> > +struct page * sparse_populate_node_section(struct page *map_base,
> > + unsigned long map_index,
> > + unsigned long pnum,
> > + int nid);
>
> These two functions are named in very similar ways. Do they do similar
> things?

Yes, they do in non-vmemmap:

sparse_populate_node() -> populates the whole node if we can using a
single allocation
sparse_populate_node_section() -> populate only one section in the
given node if the whole node is not already populated.

However, the vmemmap variant is a little different: sparse_populate_node()
populates in a single allocation if can, and if not it still populates
the whole node but in smaller chunks, so
sparse_populate_node_section() has nothing left to do.

>
> > struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
> > struct vmem_altmap *altmap);
> > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > index e1a54ba411ec..b3e325962306 100644
> > --- a/mm/sparse-vmemmap.c
> > +++ b/mm/sparse-vmemmap.c
> > @@ -311,3 +311,52 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
> > vmemmap_buf_end = NULL;
> > }
> > }
> > +
> > +struct page * __init sparse_populate_node(unsigned long pnum_begin,
> > + unsigned long pnum_end,
> > + unsigned long map_count,
> > + int nid)
> > +{
>
> Could you comment what the function is doing, please?

Sure, I will add comments.

>
> > + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
> > + unsigned long pnum, map_index = 0;
> > + void *vmemmap_buf_start;
> > +
> > + size = ALIGN(size, PMD_SIZE) * map_count;
> > + vmemmap_buf_start = __earlyonly_bootmem_alloc(nid, size,
> > + PMD_SIZE,
> > + __pa(MAX_DMA_ADDRESS));
>
> Let's not repeat the mistakes of the previous version of the code.
> Please explain why we are aligning this. Also,
> __earlyonly_bootmem_alloc()->memblock_virt_alloc_try_nid_raw() claims to
> be aligning the size. Do we also need to do it here?
>
> Yes, I know the old code did this, but this is the cost of doing a
> rewrite. :)

Actually, I was thinking about this particular case when I was
rewriting this code. Here we align size before multiplying by
map_count aligns after memblock_virt_alloc_try_nid_raw(). So, we must
have both as they are different.

>
> > + if (vmemmap_buf_start) {
> > + vmemmap_buf = vmemmap_buf_start;
> > + vmemmap_buf_end = vmemmap_buf_start + size;
> > + }
>
> It would be nice to call out that these are globals that other code
> picks up.

I do not like these globals, they should have specific functions that
access them only, something:
static struct {
buffer;
buffer_end;
} vmemmap_buffer;
vmemmap_buffer_init() allocate buffer
vmemmap_buffer_alloc() return NULL if buffer is empty
vmemmap_buffer_fini()

Call vmemmap_buffer_init() and vmemmap_buffer_fini() from
sparse_populate_node() and
vmemmap_buffer_alloc() from vmemmap_alloc_block_buf().

But, it should be a separate patch. If you would like I can add it to
this series, or submit separately.

>
> > + for (pnum = pnum_begin; map_index < map_count; pnum++) {
> > + if (!present_section_nr(pnum))
> > + continue;
> > + if (!sparse_mem_map_populate(pnum, nid, NULL))
> > + break;
>
> ^ This consumes "vmemmap_buf", right? That seems like a really nice
> thing to point out here if so.

It consumes vmemmap_buf if __earlyonly_bootmem_alloc() was successful,
otherwise it allocates struct pages a section at a time.

>
> > + map_index++;
> > + BUG_ON(pnum >= pnum_end);
> > + }
> > +
> > + if (vmemmap_buf_start) {
> > + /* need to free left buf */
> > + memblock_free_early(__pa(vmemmap_buf),
> > + vmemmap_buf_end - vmemmap_buf);
> > + vmemmap_buf = NULL;
> > + vmemmap_buf_end = NULL;
> > + }
> > + return pfn_to_page(section_nr_to_pfn(pnum_begin));
> > +}
> > +
> > +/*
> > + * Return map for pnum section. sparse_populate_node() has populated memory map
> > + * in this node, we simply do pnum to struct page conversion.
> > + */
> > +struct page * __init sparse_populate_node_section(struct page *map_base,
> > + unsigned long map_index,
> > + unsigned long pnum,
> > + int nid)
> > +{
> > + return pfn_to_page(section_nr_to_pfn(pnum));
> > +}
>
> What is up with all of the unused arguments to this function?

Because the same function is called from non-vmemmap sparse code.

>
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index d18e2697a781..c18d92b8ab9b 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -456,6 +456,43 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
> > __func__);
> > }
> > }
> > +
> > +static unsigned long section_map_size(void)
> > +{
> > + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
> > +}
>
> Seems like if we have this, we should use it wherever possible, like
> sparse_populate_node().

It is used in sparse_populate_node():

401 struct page * __init sparse_populate_node(unsigned long pnum_begin,
406 return memblock_virt_alloc_try_nid_raw(section_map_size()
* map_count,
407 PAGE_SIZE,
__pa(MAX_DMA_ADDRESS),
408
BOOTMEM_ALLOC_ACCESSIBLE, nid);


>
>
> > +/*
> > + * Try to allocate all struct pages for this node, if this fails, we will
> > + * be allocating one section at a time in sparse_populate_node_section().
> > + */
> > +struct page * __init sparse_populate_node(unsigned long pnum_begin,
> > + unsigned long pnum_end,
> > + unsigned long map_count,
> > + int nid)
> > +{
> > + return memblock_virt_alloc_try_nid_raw(section_map_size() * map_count,
> > + PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> > + BOOTMEM_ALLOC_ACCESSIBLE, nid);
> > +}
> > +
> > +/*
> > + * Return map for pnum section. map_base is not NULL if we could allocate map
> > + * for this node together. Otherwise we allocate one section at a time.
> > + * map_index is the index of pnum in this node counting only present sections.
> > + */
> > +struct page * __init sparse_populate_node_section(struct page *map_base,
> > + unsigned long map_index,
> > + unsigned long pnum,
> > + int nid)
> > +{
> > + if (map_base) {
> > + unsigned long offset = section_map_size() * map_index;
> > +
> > + return (struct page *)((char *)map_base + offset);
> > + }
> > + return sparse_mem_map_populate(pnum, nid, NULL);
>
> Oh, you have a vmemmap and non-vmemmap version.
>
> BTW, can't the whole map base calculation just be replaced with:
>
> return &map_base[PAGES_PER_SECTION * map_index];

Unfortunately no. Because map_base might be allocated in chunks
larger than PAGES_PER_SECTION * sizeof(struct page). See: PAGE_ALIGN()
in section_map_size

Thank you,
Pavel

2018-07-05 13:40:39

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On 07/02/2018 01:29 PM, Pavel Tatashin wrote:
> On Mon, Jul 2, 2018 at 4:00 PM Dave Hansen <[email protected]> wrote:
>>> + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
>>> + unsigned long pnum, map_index = 0;
>>> + void *vmemmap_buf_start;
>>> +
>>> + size = ALIGN(size, PMD_SIZE) * map_count;
>>> + vmemmap_buf_start = __earlyonly_bootmem_alloc(nid, size,
>>> + PMD_SIZE,
>>> + __pa(MAX_DMA_ADDRESS));
>>
>> Let's not repeat the mistakes of the previous version of the code.
>> Please explain why we are aligning this. Also,
>> __earlyonly_bootmem_alloc()->memblock_virt_alloc_try_nid_raw() claims to
>> be aligning the size. Do we also need to do it here?
>>
>> Yes, I know the old code did this, but this is the cost of doing a
>> rewrite. :)
>
> Actually, I was thinking about this particular case when I was
> rewriting this code. Here we align size before multiplying by
> map_count aligns after memblock_virt_alloc_try_nid_raw(). So, we must
> have both as they are different.

That's a good point that they do different things.

But, which behavior of the two different things is the one we _want_?

>>> + if (vmemmap_buf_start) {
>>> + vmemmap_buf = vmemmap_buf_start;
>>> + vmemmap_buf_end = vmemmap_buf_start + size;
>>> + }
>>
>> It would be nice to call out that these are globals that other code
>> picks up.
>
> I do not like these globals, they should have specific functions that
> access them only, something:
> static struct {
> buffer;
> buffer_end;
> } vmemmap_buffer;
> vmemmap_buffer_init() allocate buffer
> vmemmap_buffer_alloc() return NULL if buffer is empty
> vmemmap_buffer_fini()
>
> Call vmemmap_buffer_init() and vmemmap_buffer_fini() from
> sparse_populate_node() and
> vmemmap_buffer_alloc() from vmemmap_alloc_block_buf().
>
> But, it should be a separate patch. If you would like I can add it to
> this series, or submit separately.

Seems like a nice cleanup, but I don't think it needs to be done here.

>>> + * Return map for pnum section. sparse_populate_node() has populated memory map
>>> + * in this node, we simply do pnum to struct page conversion.
>>> + */
>>> +struct page * __init sparse_populate_node_section(struct page *map_base,
>>> + unsigned long map_index,
>>> + unsigned long pnum,
>>> + int nid)
>>> +{
>>> + return pfn_to_page(section_nr_to_pfn(pnum));
>>> +}
>>
>> What is up with all of the unused arguments to this function?
>
> Because the same function is called from non-vmemmap sparse code.

That's probably good to call out in the patch description if not there
already.

>>> diff --git a/mm/sparse.c b/mm/sparse.c
>>> index d18e2697a781..c18d92b8ab9b 100644
>>> --- a/mm/sparse.c
>>> +++ b/mm/sparse.c
>>> @@ -456,6 +456,43 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
>>> __func__);
>>> }
>>> }
>>> +
>>> +static unsigned long section_map_size(void)
>>> +{
>>> + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
>>> +}
>>
>> Seems like if we have this, we should use it wherever possible, like
>> sparse_populate_node().
>
> It is used in sparse_populate_node():
>
> 401 struct page * __init sparse_populate_node(unsigned long pnum_begin,
> 406 return memblock_virt_alloc_try_nid_raw(section_map_size()
> * map_count,
> 407 PAGE_SIZE,
> __pa(MAX_DMA_ADDRESS),
> 408
> BOOTMEM_ALLOC_ACCESSIBLE, nid);

I missed the PAGE_ALIGN() until now. That really needs a comment
calling out how it's not really the map size but the *allocation* size
of a single section's map.

It probably also needs a name like section_memmap_allocation_size() or
something to differentiate it from the *used* size.

>>> +/*
>>> + * Try to allocate all struct pages for this node, if this fails, we will
>>> + * be allocating one section at a time in sparse_populate_node_section().
>>> + */
>>> +struct page * __init sparse_populate_node(unsigned long pnum_begin,
>>> + unsigned long pnum_end,
>>> + unsigned long map_count,
>>> + int nid)
>>> +{
>>> + return memblock_virt_alloc_try_nid_raw(section_map_size() * map_count,
>>> + PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
>>> + BOOTMEM_ALLOC_ACCESSIBLE, nid);
>>> +}
>>> +
>>> +/*
>>> + * Return map for pnum section. map_base is not NULL if we could allocate map
>>> + * for this node together. Otherwise we allocate one section at a time.
>>> + * map_index is the index of pnum in this node counting only present sections.
>>> + */
>>> +struct page * __init sparse_populate_node_section(struct page *map_base,
>>> + unsigned long map_index,
>>> + unsigned long pnum,
>>> + int nid)
>>> +{
>>> + if (map_base) {
>>> + unsigned long offset = section_map_size() * map_index;
>>> +
>>> + return (struct page *)((char *)map_base + offset);
>>> + }
>>> + return sparse_mem_map_populate(pnum, nid, NULL);
>>
>> Oh, you have a vmemmap and non-vmemmap version.
>>
>> BTW, can't the whole map base calculation just be replaced with:
>>
>> return &map_base[PAGES_PER_SECTION * map_index];
>
> Unfortunately no. Because map_base might be allocated in chunks
> larger than PAGES_PER_SECTION * sizeof(struct page). See: PAGE_ALIGN()
> in section_map_size

Good point.

Oh, well, can you at least get rid of the superfluous "(char *)" cast?
That should make the whole thing a bit less onerous.

2018-07-09 14:33:27

by Pavel Tatashin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] mm/sparse: add sparse_init_nid()

On Thu, Jul 5, 2018 at 9:39 AM Dave Hansen <[email protected]> wrote:
>
> On 07/02/2018 01:29 PM, Pavel Tatashin wrote:
> > On Mon, Jul 2, 2018 at 4:00 PM Dave Hansen <[email protected]> wrote:
> >>> + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
> >>> + unsigned long pnum, map_index = 0;
> >>> + void *vmemmap_buf_start;
> >>> +
> >>> + size = ALIGN(size, PMD_SIZE) * map_count;
> >>> + vmemmap_buf_start = __earlyonly_bootmem_alloc(nid, size,
> >>> + PMD_SIZE,
> >>> + __pa(MAX_DMA_ADDRESS));
> >>
> >> Let's not repeat the mistakes of the previous version of the code.
> >> Please explain why we are aligning this. Also,
> >> __earlyonly_bootmem_alloc()->memblock_virt_alloc_try_nid_raw() claims to
> >> be aligning the size. Do we also need to do it here?
> >>
> >> Yes, I know the old code did this, but this is the cost of doing a
> >> rewrite. :)
> >
> > Actually, I was thinking about this particular case when I was
> > rewriting this code. Here we align size before multiplying by
> > map_count aligns after memblock_virt_alloc_try_nid_raw(). So, we must
> > have both as they are different.
>
> That's a good point that they do different things.
>
> But, which behavior of the two different things is the one we _want_?

We definitely want the first one:
size = ALIGN(size, PMD_SIZE) * map_count;

The alignment in memblock is not strictly needed for this case, but it
already comes with memblock allocator.

>
> >>> + if (vmemmap_buf_start) {
> >>> + vmemmap_buf = vmemmap_buf_start;
> >>> + vmemmap_buf_end = vmemmap_buf_start + size;
> >>> + }
> >>
> >> It would be nice to call out that these are globals that other code
> >> picks up.
> >
> > I do not like these globals, they should have specific functions that
> > access them only, something:
> > static struct {
> > buffer;
> > buffer_end;
> > } vmemmap_buffer;
> > vmemmap_buffer_init() allocate buffer
> > vmemmap_buffer_alloc() return NULL if buffer is empty
> > vmemmap_buffer_fini()
> >
> > Call vmemmap_buffer_init() and vmemmap_buffer_fini() from
> > sparse_populate_node() and
> > vmemmap_buffer_alloc() from vmemmap_alloc_block_buf().
> >
> > But, it should be a separate patch. If you would like I can add it to
> > this series, or submit separately.
>
> Seems like a nice cleanup, but I don't think it needs to be done here.
>
> >>> + * Return map for pnum section. sparse_populate_node() has populated memory map
> >>> + * in this node, we simply do pnum to struct page conversion.
> >>> + */
> >>> +struct page * __init sparse_populate_node_section(struct page *map_base,
> >>> + unsigned long map_index,
> >>> + unsigned long pnum,
> >>> + int nid)
> >>> +{
> >>> + return pfn_to_page(section_nr_to_pfn(pnum));
> >>> +}
> >>
> >> What is up with all of the unused arguments to this function?
> >
> > Because the same function is called from non-vmemmap sparse code.
>
> That's probably good to call out in the patch description if not there
> already.
>
> >>> diff --git a/mm/sparse.c b/mm/sparse.c
> >>> index d18e2697a781..c18d92b8ab9b 100644
> >>> --- a/mm/sparse.c
> >>> +++ b/mm/sparse.c
> >>> @@ -456,6 +456,43 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
> >>> __func__);
> >>> }
> >>> }
> >>> +
> >>> +static unsigned long section_map_size(void)
> >>> +{
> >>> + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
> >>> +}
> >>
> >> Seems like if we have this, we should use it wherever possible, like
> >> sparse_populate_node().
> >
> > It is used in sparse_populate_node():
> >
> > 401 struct page * __init sparse_populate_node(unsigned long pnum_begin,
> > 406 return memblock_virt_alloc_try_nid_raw(section_map_size()
> > * map_count,
> > 407 PAGE_SIZE,
> > __pa(MAX_DMA_ADDRESS),
> > 408
> > BOOTMEM_ALLOC_ACCESSIBLE, nid);
>
> I missed the PAGE_ALIGN() until now. That really needs a comment
> calling out how it's not really the map size but the *allocation* size
> of a single section's map.
>
> It probably also needs a name like section_memmap_allocation_size() or
> something to differentiate it from the *used* size.
>
> >>> +/*
> >>> + * Try to allocate all struct pages for this node, if this fails, we will
> >>> + * be allocating one section at a time in sparse_populate_node_section().
> >>> + */
> >>> +struct page * __init sparse_populate_node(unsigned long pnum_begin,
> >>> + unsigned long pnum_end,
> >>> + unsigned long map_count,
> >>> + int nid)
> >>> +{
> >>> + return memblock_virt_alloc_try_nid_raw(section_map_size() * map_count,
> >>> + PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> >>> + BOOTMEM_ALLOC_ACCESSIBLE, nid);
> >>> +}
> >>> +
> >>> +/*
> >>> + * Return map for pnum section. map_base is not NULL if we could allocate map
> >>> + * for this node together. Otherwise we allocate one section at a time.
> >>> + * map_index is the index of pnum in this node counting only present sections.
> >>> + */
> >>> +struct page * __init sparse_populate_node_section(struct page *map_base,
> >>> + unsigned long map_index,
> >>> + unsigned long pnum,
> >>> + int nid)
> >>> +{
> >>> + if (map_base) {
> >>> + unsigned long offset = section_map_size() * map_index;
> >>> +
> >>> + return (struct page *)((char *)map_base + offset);
> >>> + }
> >>> + return sparse_mem_map_populate(pnum, nid, NULL);
> >>
> >> Oh, you have a vmemmap and non-vmemmap version.
> >>
> >> BTW, can't the whole map base calculation just be replaced with:
> >>
> >> return &map_base[PAGES_PER_SECTION * map_index];
> >
> > Unfortunately no. Because map_base might be allocated in chunks
> > larger than PAGES_PER_SECTION * sizeof(struct page). See: PAGE_ALIGN()
> > in section_map_size
>
> Good point.
>
> Oh, well, can you at least get rid of the superfluous "(char *)" cast?
> That should make the whole thing a bit less onerous.

I will see what can be done; if it is not going to be cleaner, I will
keep the cast.

Thank you,
Pavel