The following patch-set from Yinghai allocates pagetables to local nodes.
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829
Since pagetable pages are used by the kernel, they cannot be offlined.
As a result, they cannot be hot-removed.
This patch-set fixes this problem with the following solution:
1. Introduce a new bootmem type LOCAL_NODE_DATA, and register local
pagetable pages as LOCAL_NODE_DATA by setting page->lru.next to
LOCAL_NODE_DATA, just like we register SECTION_INFO pages.
2. Skip LOCAL_NODE_DATA pages in offline/online procedures. When the
whole memory block they reside in is offlined, the kernel can
still access the pagetables.
(This changes the semantics of offline/online a little bit.)
3. Do not free LOCAL_NODE_DATA pages to buddy system because they
were skipped in offline/online procedures. The memory block
they reside in could have been offlined.
Anyway, this problem should be fixed. Any better idea is welcome.
Tang Chen (4):
bootmem, mem-hotplug: Register local pagetable pages with
LOCAL_NODE_DATA when freeing bootmem.
mem-hotplug: Skip LOCAL_NODE_DATA pages in memory offline procedure.
mem-hotplug: Skip LOCAL_NODE_DATA pages in memory online procedure.
mem-hotplug: Do not free LOCAL_NODE_DATA pages to buddy system in
hot-remove procedure.
arch/x86/mm/init_64.c | 2 +
include/linux/memblock.h | 22 +++++++++++++++++
include/linux/memory_hotplug.h | 13 ++++++++-
mm/memblock.c | 52 ++++++++++++++++++++++++++++++++++++++++
mm/memory_hotplug.c | 42 +++++++++++++++++++++++++++++++-
mm/page_alloc.c | 18 ++++++++++++-
mm/page_isolation.c | 6 ++++
7 files changed, 150 insertions(+), 5 deletions(-)
As Yinghai suggested, even if a node is movable node, which has only
ZONE_MOVABLE, pagetables should be put in the local node.
In memory hot-remove logic, it offlines all pages first, and then
removes pagetables. But the local pagetable pages cannot be offlined
because they are used by kernel.
So we should skip this kind of pages in offline procedure. But first
of all, we need to mark them.
This patch marks local node data pages in the same way as we mark the
SECTION_INFO and MIX_SECTION_INFO data pages. We introduce a new type
of bootmem: LOCAL_NODE_DATA. And use page->lru.next to mark this type
of memory.
Signed-off-by: Tang Chen <[email protected]>
---
arch/x86/mm/init_64.c | 2 +
include/linux/memblock.h | 22 +++++++++++++++++
include/linux/memory_hotplug.h | 13 ++++++++-
mm/memblock.c | 52 ++++++++++++++++++++++++++++++++++++++++
mm/memory_hotplug.c | 26 ++++++++++++++++++++
5 files changed, 113 insertions(+), 2 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index dafdeb2..8be9c3b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1055,6 +1055,8 @@ static void __init register_page_bootmem_info(void)
for_each_online_node(i)
register_page_bootmem_info_node(NODE_DATA(i));
+
+ register_page_bootmem_local_node();
#endif
}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5528e8f..4dd43df 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -134,6 +134,28 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
i != (u64)ULLONG_MAX; \
__next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
+void __next_local_node_mem_range(int *idx, int nid, phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid);
+
+/**
+ * for_each_local_node_mem_range - iterate memblock areas storing local node
+ * data
+ * @i: int used as loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @p_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Walks over memblock areas storing local node data. Since all the local node
+ * areas will be reserved by memblock, this iterator will only iterate
+ * memblock.reserved. Available as soon as memblock is initialized.
+ */
+#define for_each_local_node_mem_range(i, nid, p_start, p_end, p_nid) \
+ for (i = -1, \
+ __next_local_node_mem_range(&i, nid, p_start, p_end, p_nid); \
+ i != -1; \
+ __next_local_node_mem_range(&i, nid, p_start, p_end, p_nid))
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 18fe2a3..a720fd1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,14 +16,19 @@ struct memory_block;
/*
* Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
+ * some random range in unsigned long space for debugging purposes except
+ * LOCAL_NODE_DATA.
+ *
+ * LOCAL_NODE_DATA is used to mark local node pages storing data to
+ * describe the memory of the node, such as local pagetable pages.
*/
enum {
MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
MIX_SECTION_INFO,
NODE_INFO,
- MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+ LOCAL_NODE_DATA,
+ MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = LOCAL_NODE_DATA,
};
/* Types for control the zone type of onlined memory */
@@ -179,10 +184,14 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+extern void register_page_bootmem_local_node(void);
#else
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
}
+static inline void register_page_bootmem_local_node()
+{
+}
#endif
extern void put_page_bootmem(struct page *page);
extern void get_page_bootmem(unsigned long ingo, struct page *page,
diff --git a/mm/memblock.c b/mm/memblock.c
index 8b9a13c..7f429f4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -628,6 +628,58 @@ bool __init_memblock memblock_is_hotpluggable(struct memblock_region *region)
return region->flags & (1 << MEMBLK_HOTPLUGGABLE);
}
+/*
+ * Common iterator to find next range with the same flags.
+ */
+static void __init_memblock __next_flag_mem_range(int *idx, int nid,
+ unsigned long flags,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
+{
+ struct memblock_type *rsv = &memblock.reserved;
+ struct memblock_region *r;
+
+ while (++*idx < rsv->cnt) {
+ r = &rsv->regions[*idx];
+
+ if (nid != MAX_NUMNODES &&
+ nid != memblock_get_region_node(r))
+ continue;
+
+ if (r->flags & flags)
+ break;
+ }
+
+ if (*idx >= rsv->cnt) {
+ *idx = -1;
+ return;
+ }
+
+ if (out_start)
+ *out_start = r->base;
+ if (out_end)
+ *out_end = r->base + r->size;
+ if (out_nid)
+ *out_nid = memblock_get_region_node(r);
+}
+
+/**
+ * __next_local_node_mem_range - next function for
+ * for_each_local_node_mem_range()
+ * @idx: pointer to int loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ */
+void __init_memblock __next_local_node_mem_range(int *idx, int nid,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
+{
+ __next_flag_mem_range(idx, nid, 1 << MEMBLK_LOCAL_NODE,
+ out_start, out_end, out_nid);
+}
+
/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b81a367..075d412 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -30,6 +30,7 @@
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
+#include <linux/memblock.h>
#include <asm/tlbflush.h>
@@ -191,6 +192,31 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+void __ref register_page_bootmem_local_node()
+{
+ int i, nid;
+ phys_addr_t start, end;
+ unsigned long start_pfn, end_pfn;
+ struct page *page;
+
+ for_each_local_node_mem_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ start_pfn = PFN_DOWN(start);
+ end_pfn = PFN_UP(end);
+ page = pfn_to_page(start_pfn);
+
+ for ( ; start_pfn <= end_pfn; start_pfn++, page++) {
+ /*
+ * We need to set the whole page as LOCAL_NODE_DATA,
+ * so we get the upper end_pfn. But this upper end_pfn
+ * may not exist. So we have to check if the page
+ * present before we access its struct page.
+ */
+ if (pfn_present(start_pfn))
+ get_page_bootmem(nid, page, LOCAL_NODE_DATA);
+ }
+ }
+}
+
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
unsigned long i, pfn, end_pfn, nr_pages;
--
1.7.1
In memory hot-remove procedure, we free pagetable pages to buddy system.
But for local pagetable pages, do not free them to buddy system because
they were skipped in offline procedure. The memory block they reside in
could have been offlined, and we won't offline it again.
Signed-off-by: Tang Chen <[email protected]>
---
mm/memory_hotplug.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21d6fcb..c30e819 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -119,6 +119,14 @@ void __ref put_page_bootmem(struct page *page)
INIT_LIST_HEAD(&page->lru);
/*
+ * Do not free pages with local node kernel data (for now, just
+ * local pagetables) to the buddy system because we skipped
+ * these pages when offlining the corresponding block.
+ */
+ if (type == LOCAL_NODE_DATA)
+ return;
+
+ /*
* Please refer to comment for __free_pages_bootmem()
* for why we serialize here.
*/
--
1.7.1
In memory offline procedure, skip pages marked as LOCAL_NODE_DATA.
For now, this kind of pages are used to store local node pagetables.
The minimum unit of memory online/offline is a memory block. In a
block, the movable pages will be offlined as usual (unmapped and
isolated), and the pagetable pages will be skipped. After the iteration
of all pages, the block will be set as offline, but the kernel can
still access the pagetable pages. This is user transparent.
Signed-off-by: Tang Chen <[email protected]>
---
mm/page_alloc.c | 18 ++++++++++++++++--
mm/page_isolation.c | 6 ++++++
2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 557b21b..73b8f0b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5701,11 +5701,18 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
pfn = page_to_pfn(page);
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
+ unsigned long magic;
if (!pfn_valid_within(check))
continue;
page = pfn_to_page(check);
+
+ /* Skip pages storing local node kernel data. */
+ magic = (unsigned long)page->lru.next;
+ if (magic == LOCAL_NODE_DATA)
+ continue;
+
/*
* We can't use page_count without pin a page
* because another CPU can free compound page.
@@ -6029,8 +6036,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
struct page *page;
struct zone *zone;
int order, i;
- unsigned long pfn;
- unsigned long flags;
+ unsigned long pfn, flags, magic;
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
@@ -6046,6 +6052,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
+
+ /* Skip pages storing local node kernel data. */
+ magic = (unsigned long)page->lru.next;
+ if (magic == LOCAL_NODE_DATA) {
+ pfn++;
+ continue;
+ }
+
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 383bdbb..fb60a27 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -174,6 +174,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
struct page *page;
+ unsigned long magic;
while (pfn < end_pfn) {
if (!pfn_valid_within(pfn)) {
@@ -181,6 +182,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
continue;
}
page = pfn_to_page(pfn);
+ magic = (unsigned long)page->lru.next;
+
if (PageBuddy(page)) {
/*
* If race between isolatation and allocation happens,
@@ -208,6 +211,9 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
*/
pfn++;
continue;
+ } else if (magic == LOCAL_NODE_DATA) {
+ pfn++;
+ continue;
}
else
break;
--
1.7.1
Pages marked as LOCAL_NODE_DATA are skipped when we do memory offline.
So we have to skip them again when we do memory online.
Signed-off-by: Tang Chen <[email protected]>
---
mm/memory_hotplug.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 075d412..21d6fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -825,12 +825,18 @@ static void generic_online_page(struct page *page)
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
+ unsigned long i, magic;
unsigned long onlined_pages = *(unsigned long *)arg;
struct page *page;
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
+ magic = (unsigned long)page->lru.next;
+
+ /* Skip pages storing local node kernel data. */
+ if (magic == LOCAL_NODE_DATA)
+ continue;
+
(*online_page_callback)(page);
onlined_pages++;
}
--
1.7.1
On 2013/5/24 17:30, Tang Chen wrote:
> In memory offline procedure, skip pages marked as LOCAL_NODE_DATA.
> For now, this kind of pages are used to store local node pagetables.
>
> The minimum unit of memory online/offline is a memory block. In a
> block, the movable pages will be offlined as usual (unmapped and
> isolated), and the pagetable pages will be skipped. After the iteration
> of all page, the block will be set as offline, but the kernel can
> still access the pagetable pages. This is user transparent.
>
> Signed-off-by: Tang Chen <[email protected]>
> ---
> mm/page_alloc.c | 18 ++++++++++++++++--
> mm/page_isolation.c | 6 ++++++
> 2 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 557b21b..73b8f0b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5701,11 +5701,18 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
> pfn = page_to_pfn(page);
> for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
> unsigned long check = pfn + iter;
> + unsigned long magic;
>
> if (!pfn_valid_within(check))
> continue;
>
> page = pfn_to_page(check);
> +
> + /* Skip pages storing local node kernel data. */
> + magic = (unsigned long)page->lru.next;
> + if (magic == LOCAL_NODE_DATA)
Hi Tang,
I think can define this as a macro, and can be reused in the other places.
Thanks,
Jianguo Wu.
> + continue;
> +
> /*
> * We can't use page_count without pin a page
> * because another CPU can free compound page.
> @@ -6029,8 +6036,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> struct page *page;
> struct zone *zone;
> int order, i;
> - unsigned long pfn;
> - unsigned long flags;
> + unsigned long pfn, flags, magic;
> /* find the first valid pfn */
> for (pfn = start_pfn; pfn < end_pfn; pfn++)
> if (pfn_valid(pfn))
> @@ -6046,6 +6052,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
> continue;
> }
> page = pfn_to_page(pfn);
> +
> + /* Skip pages storing local node kernel data. */
> + magic = (unsigned long)page->lru.next;
> + if (magic == LOCAL_NODE_DATA) {
> + pfn++;
> + continue;
> + }
> +
> /*
> * The HWPoisoned page may be not in buddy system, and
> * page_count() is not 0.
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 383bdbb..fb60a27 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -174,6 +174,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
> bool skip_hwpoisoned_pages)
> {
> struct page *page;
> + unsigned long magic;
>
> while (pfn < end_pfn) {
> if (!pfn_valid_within(pfn)) {
> @@ -181,6 +182,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
> continue;
> }
> page = pfn_to_page(pfn);
> + magic = (unsigned long)page->lru.next;
> +
> if (PageBuddy(page)) {
> /*
> * If race between isolatation and allocation happens,
> @@ -208,6 +211,9 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
> */
> pfn++;
> continue;
> + } else if (magic == LOCAL_NODE_DATA) {
> + pfn++;
> + continue;
> }
> else
> break;
On 2013/5/24 17:30, Tang Chen wrote:
> In memory hot-remove procedure, we free pagetable pages to buddy system.
> But for local pagetable pages, do not free them to buddy system because
> they were skipped in offline procedure. The memory block they reside in
> could have been offlined, and we won't offline it again.
>
> Signed-off-by: Tang Chen <[email protected]>
> ---
> mm/memory_hotplug.c | 8 ++++++++
> 1 files changed, 8 insertions(+), 0 deletions(-)
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 21d6fcb..c30e819 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -119,6 +119,14 @@ void __ref put_page_bootmem(struct page *page)
> INIT_LIST_HEAD(&page->lru);
>
> /*
> + * Do not free pages with local node kernel data (for now, just
> + * local pagetables) to the buddy system because we skipped
> + * these pages when offlining the corresponding block.
> + */
> + if (type == LOCAL_NODE_DATA)
> + return;
Hi Tang,
I think this should be check in free_pagetable(), like:
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 474e28f..08fe80e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -725,7 +725,7 @@ static void __meminit free_pagetable(struct page *page, int order)
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
- } else
+ } else if (magic != LOCAL_NODE_DATA)
__free_pages_bootmem(page, order);
} else
free_pages((unsigned long)page_address(page), order);
Thanks,
Jianguo Wu.
> +
> + /*
> * Please refer to comment for __free_pages_bootmem()
> * for why we serialize here.
> */