LinuxLists.cc - [RFC] memory hotremove patch take 2 [00/10]

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [01/10] (counter of removable page)

Show #of Movable pages and vmstat.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

arch/ia64/mm/init.c | 2 ++
drivers/base/node.c | 4 ++++
fs/proc/proc_misc.c | 4 ++++
include/linux/kernel.h | 2 ++
include/linux/swap.h | 1 +
mm/page_alloc.c | 22 ++++++++++++++++++++++
6 files changed, 35 insertions(+)

Index: current_test/mm/page_alloc.c
===================================================================
--- current_test.orig/mm/page_alloc.c 2007-05-08 15:06:50.000000000 +0900
+++ current_test/mm/page_alloc.c 2007-05-08 15:08:36.000000000 +0900
@@ -58,6 +58,7 @@ unsigned long totalram_pages __read_most
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;
+unsigned long total_movable_pages __read_mostly;

static void __free_pages_ok(struct page *page, unsigned int order);

@@ -1827,6 +1828,18 @@ static unsigned int nr_free_zone_pages(i
return sum;
}

+unsigned int nr_free_movable_pages(void)
+{
+ unsigned long nr_pages = 0;
+ struct zone *zone;
+ int nid;
+
+ for_each_online_node(nid) {
+ zone = &(NODE_DATA(nid)->node_zones[ZONE_MOVABLE]);
+ nr_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ return nr_pages;
+}
/*
* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
*/
@@ -1889,6 +1902,8 @@ void si_meminfo(struct sysinfo *val)
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
+ val->movable = total_movable_pages;
+ val->free_movable = nr_free_movable_pages();
}

EXPORT_SYMBOL(si_meminfo);
@@ -1908,6 +1923,11 @@ void si_meminfo_node(struct sysinfo *val
val->totalhigh = 0;
val->freehigh = 0;
#endif
+
+ val->movable = pgdat->node_zones[ZONE_MOVABLE].present_pages;
+ val->free_movable = zone_page_state(&pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+
val->mem_unit = PAGE_SIZE;
}
#endif
@@ -3216,6 +3236,8 @@ static void __meminit free_area_init_cor

zone->spanned_pages = size;
zone->present_pages = realsize;
+ if (j == ZONE_MOVABLE)
+ total_movable_pages += realsize;
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
Index: current_test/include/linux/kernel.h
===================================================================
--- current_test.orig/include/linux/kernel.h 2007-05-08 15:06:49.000000000 +0900
+++ current_test/include/linux/kernel.h 2007-05-08 15:07:20.000000000 +0900
@@ -352,6 +352,8 @@ struct sysinfo {
unsigned short pad; /* explicit padding for m68k */
unsigned long totalhigh; /* Total high memory size */
unsigned long freehigh; /* Available high memory size */
+ unsigned long movable; /* pages used only for data */
+ unsigned long free_movable; /* Avaiable pages in movable */
unsigned int mem_unit; /* Memory unit size in bytes */
char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
};
Index: current_test/fs/proc/proc_misc.c
===================================================================
--- current_test.orig/fs/proc/proc_misc.c 2007-05-08 15:06:48.000000000 +0900
+++ current_test/fs/proc/proc_misc.c 2007-05-08 15:07:20.000000000 +0900
@@ -161,6 +161,8 @@ static int meminfo_read_proc(char *page,
"LowTotal: %8lu kB\n"
"LowFree: %8lu kB\n"
#endif
+ "MovableTotal: %8lu kB\n"
+ "MovableFree: %8lu kB\n"
"SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n"
"Dirty: %8lu kB\n"
@@ -191,6 +193,8 @@ static int meminfo_read_proc(char *page,
K(i.totalram-i.totalhigh),
K(i.freeram-i.freehigh),
#endif
+ K(i.movable),
+ K(i.free_movable),
K(i.totalswap),
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
Index: current_test/drivers/base/node.c
===================================================================
--- current_test.orig/drivers/base/node.c 2007-05-08 15:06:10.000000000 +0900
+++ current_test/drivers/base/node.c 2007-05-08 15:07:20.000000000 +0900
@@ -55,6 +55,8 @@ static ssize_t node_read_meminfo(struct
"Node %d LowTotal: %8lu kB\n"
"Node %d LowFree: %8lu kB\n"
#endif
+ "Node %d MovableTotal: %8lu kB\n"
+ "Node %d MovableFree: %8lu kB\n"
"Node %d Dirty: %8lu kB\n"
"Node %d Writeback: %8lu kB\n"
"Node %d FilePages: %8lu kB\n"
@@ -77,6 +79,8 @@ static ssize_t node_read_meminfo(struct
nid, K(i.totalram - i.totalhigh),
nid, K(i.freeram - i.freehigh),
#endif
+ nid, K(i.movable),
+ nid, K(i.free_movable),
nid, K(node_page_state(nid, NR_FILE_DIRTY)),
nid, K(node_page_state(nid, NR_WRITEBACK)),
nid, K(node_page_state(nid, NR_FILE_PAGES)),
Index: current_test/arch/ia64/mm/init.c
===================================================================
--- current_test.orig/arch/ia64/mm/init.c 2007-05-08 15:06:38.000000000 +0900
+++ current_test/arch/ia64/mm/init.c 2007-05-08 15:08:29.000000000 +0900
@@ -700,6 +700,8 @@ void online_page(struct page *page)
__free_page(page);
totalram_pages++;
num_physpages++;
+ if (page_zonenum(page) == ZONE_MOVABLE)
+ total_movable_pages++;
}

int arch_add_memory(int nid, u64 start, u64 size)
Index: current_test/include/linux/swap.h
===================================================================
--- current_test.orig/include/linux/swap.h 2007-05-08 15:06:49.000000000 +0900
+++ current_test/include/linux/swap.h 2007-05-08 15:07:20.000000000 +0900
@@ -169,6 +169,7 @@ extern void swapin_readahead(swp_entry_t
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
+extern unsigned long total_movable_pages;
extern long nr_swap_pages;
extern unsigned int nr_free_buffer_pages(void);
extern unsigned int nr_free_pagecache_pages(void);

--
Yasunori Goto

2007-05-09 03:11:38

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [02/10] (make page unused)

This patch is for supporting making page unused.

Isolate pages by capturing freed pages before inserting free_area[],
buddy allocator.
If you have an idea for avoiding spin_lock(), please advise me.

Isolating pages in free_area[] is implemented in other patch.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

include/linux/mmzone.h | 8 +
include/linux/page_isolation.h | 52 +++++++++++
mm/Kconfig | 7 +
mm/page_alloc.c | 187 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 254 insertions(+)

Index: current_test/include/linux/mmzone.h
===================================================================
--- current_test.orig/include/linux/mmzone.h 2007-05-08 15:06:49.000000000 +0900
+++ current_test/include/linux/mmzone.h 2007-05-08 15:08:03.000000000 +0900
@@ -314,6 +314,14 @@ struct zone {
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;

+#ifdef CONFIG_PAGE_ISOLATION
+ /*
+ * For pages which are not used but not free.
+ * See include/linux/page_isolation.h
+ */
+ spinlock_t isolation_lock;
+ struct list_head isolation_list;
+#endif
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
Index: current_test/mm/page_alloc.c
===================================================================
--- current_test.orig/mm/page_alloc.c 2007-05-08 15:07:20.000000000 +0900
+++ current_test/mm/page_alloc.c 2007-05-08 15:08:34.000000000 +0900
@@ -41,6 +41,7 @@
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
+#include <linux/page_isolation.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -448,6 +449,9 @@ static inline void __free_one_page(struc
if (unlikely(PageCompound(page)))
destroy_compound_page(page, order);

+ if (page_under_isolation(zone, page, order))
+ return;
+
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

VM_BUG_ON(page_idx & (order_size - 1));
@@ -3259,6 +3263,10 @@ static void __meminit free_area_init_cor
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
atomic_set(&zone->reclaim_in_progress, 0);
+#ifdef CONFIG_PAGE_ISOLATION
+ spin_lock_init(&zone->isolation_lock);
+ INIT_LIST_HEAD(&zone->isolation_list);
+#endif
if (!size)
continue;

@@ -4214,3 +4222,182 @@ void set_pageblock_flags_group(struct pa
else
__clear_bit(bitidx + start_bitidx, bitmap);
}
+
+#ifdef CONFIG_PAGE_ISOLATION
+/*
+ * Page Isolation.
+ *
+ * If a page is removed from usual free_list and will never be used,
+ * It is linked to "struct isolation_info" and set Reserved, Private
+ * bit. page->mapping points to isolation_info in it.
+ * and page_count(page) is 0.
+ *
+ * This can be used for creating a chunk of contiguous *unused* memory.
+ *
+ * current user is Memory-Hot-Remove.
+ * maybe move to some other file is better.
+ */
+static void
+isolate_page_nolock(struct isolation_info *info, struct page *page, int order)
+{
+ int pagenum;
+ pagenum = 1 << order;
+ while (pagenum > 0) {
+ SetPageReserved(page);
+ SetPagePrivate(page);
+ page->private = (unsigned long)info;
+ list_add(&page->lru, &info->pages);
+ page++;
+ pagenum--;
+ }
+}
+
+/*
+ * This function is called from page_under_isolation()l
+ */
+
+int __page_under_isolation(struct zone *zone, struct page *page, int order)
+{
+ struct isolation_info *info;
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&zone->isolation_lock,flags);
+ list_for_each_entry(info, &zone->isolation_list, list) {
+ if (info->start_pfn <= pfn && pfn < info->end_pfn) {
+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ isolate_page_nolock(info, page, order);
+ }
+ spin_unlock_irqrestore(&zone->isolation_lock, flags);
+ return found;
+}
+
+/*
+ * start and end must be in the same zone.
+ *
+ */
+struct isolation_info *
+register_isolation(unsigned long start, unsigned long end)
+{
+ struct zone *zone;
+ struct isolation_info *info = NULL, *tmp;
+ unsigned long flags;
+ unsigned long last_pfn = end - 1;
+
+ if (!pfn_valid(start) || !pfn_valid(last_pfn) || (start >= end))
+ return ERR_PTR(-EINVAL);
+ /* check start and end is in the same zone */
+ zone = page_zone(pfn_to_page(start));
+
+ if (zone != page_zone(pfn_to_page(last_pfn)))
+ return ERR_PTR(-EINVAL);
+ /* target range has to match MAX_ORDER alignmet */
+ if ((start & (MAX_ORDER_NR_PAGES - 1)) ||
+ (end & (MAX_ORDER_NR_PAGES - 1)))
+ return ERR_PTR(-EINVAL);
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+ spin_lock_irqsave(&zone->isolation_lock, flags);
+ /* we don't allow overlap among isolation areas */
+ if (!list_empty(&zone->isolation_list)) {
+ list_for_each_entry(tmp, &zone->isolation_list, list) {
+ if (start < tmp->end_pfn && end > tmp->start_pfn) {
+ goto out_free;
+ }
+ }
+ }
+ info->start_pfn = start;
+ info->end_pfn = end;
+ info->zone = zone;
+ INIT_LIST_HEAD(&info->list);
+ INIT_LIST_HEAD(&info->pages);
+ list_add(&info->list, &zone->isolation_list);
+out_unlock:
+ spin_unlock_irqrestore(&zone->isolation_lock, flags);
+ return info;
+out_free:
+ kfree(info);
+ info = ERR_PTR(-EBUSY);
+ goto out_unlock;
+}
+/*
+ * Remove IsolationInfo from zone.
+ * After this, we can unuse memory in info or
+ * free back to freelist.
+ */
+
+void
+detach_isolation_info_zone(struct isolation_info *info)
+{
+ unsigned long flags;
+ struct zone *zone = info->zone;
+ spin_lock_irqsave(&zone->isolation_lock,flags);
+ list_del(&info->list);
+ info->zone = NULL;
+ spin_unlock_irqrestore(&zone->isolation_lock,flags);
+}
+
+/*
+ * All pages in info->pages should be remvoed before calling this.
+ * And info should be detached from zone.
+ */
+void
+free_isolation_info(struct isolation_info *info)
+{
+ BUG_ON(!list_empty(&info->pages));
+ BUG_ON(info->zone);
+ kfree(info);
+ return;
+}
+
+/*
+ * Mark All pages in the isolation_info to be Reserved.
+ * When onlining these pages again, a user must check
+ * which page is usable by IORESOURCE_RAM
+ * please see memory_hotplug.c/online_pages() if unclear.
+ *
+ * info should be detached from zone before calling this.
+ */
+void
+unuse_all_isolated_pages(struct isolation_info *info)
+{
+ struct page *page, *n;
+ BUG_ON(info->zone);
+ list_for_each_entry_safe(page, n, &info->pages, lru) {
+ SetPageReserved(page);
+ page->private = 0;
+ ClearPagePrivate(page);
+ list_del(&page->lru);
+ }
+}
+
+/*
+ * Free all pages connected in isolation list.
+ * pages are moved back to free_list.
+ */
+void
+free_all_isolated_pages(struct isolation_info *info)
+{
+ struct page *page, *n;
+ BUG_ON(info->zone);
+ list_for_each_entry_safe(page, n ,&info->pages, lru) {
+ ClearPagePrivate(page);
+ ClearPageReserved(page);
+ page->private = 0;
+ list_del(&page->lru);
+ set_page_count(page, 0);
+ set_page_refcounted(page);
+ /* This is sage because info is detached from zone */
+ __free_page(page);
+ }
+}
+
+#endif /* CONFIG_PAGE_ISOLATION */
+
+
Index: current_test/mm/Kconfig
===================================================================
--- current_test.orig/mm/Kconfig 2007-05-08 15:06:50.000000000 +0900
+++ current_test/mm/Kconfig 2007-05-08 15:08:31.000000000 +0900
@@ -225,3 +225,10 @@ config DEBUG_READAHEAD

Say N for production servers.

+config PAGE_ISOLATION
+ bool "Page Isolation Framework"
+ help
+ This option adds page isolation framework to mm.
+ This is used for isolate amount of contiguous pages from linux
+ memory management.
+ Say N if unsure.
Index: current_test/include/linux/page_isolation.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:34.000000000 +0900
@@ -0,0 +1,52 @@
+#ifndef __LINIX_PAGE_ISOLATION_H
+#define __LINUX_PAGE_ISOLATION_H
+
+#ifdef CONFIG_PAGE_ISOLATION
+
+struct isolation_info {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ struct zone *zone;
+ struct list_head pages;
+};
+
+extern int
+__page_under_isolation(struct zone *zone, struct page *page, int order);
+
+static inline int
+page_under_isolation(struct zone *zone, struct page *page, int order)
+{
+ if (likely(list_empty(&zone->isolation_list)))
+ return 0;
+ return __page_under_isolation(zone, page, order);
+}
+
+static inline int
+is_page_isolated(struct isolation_info *info, struct page *page)
+{
+ if (PageReserved(page) && PagePrivate(page) &&
+ page_count(page) == 0 &&
+ page->private == (unsigned long)info)
+ return 1;
+ return 0;
+}
+
+extern struct isolation_info *
+register_isolation(unsigned long start, unsigned long end);
+
+extern void detach_isolation_info_zone(struct isolation_info *info);
+extern void free_isolation_info(struct isolation_info *info);
+extern void unuse_all_isolated_pages(struct isolation_info *info);
+extern void free_all_isolated_pages(struct isolation_info *info);
+
+#else
+
+static inline int
+page_under_isolation(struct zone *zone, struct page *page, int order)
+{
+ return 0;
+}
+
+#endif
+#endif

--
Yasunori Goto

2007-05-09 03:12:14

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [04/10] (isolate all free pages)

Isolate all freed pages (means in buddy_list) in the range.
See page_buddy() and free_one_page() function if unsure.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

include/linux/page_isolation.h | 1
mm/page_alloc.c | 45 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 46 insertions(+)

Index: current_test/mm/page_alloc.c
===================================================================
--- current_test.orig/mm/page_alloc.c 2007-05-08 15:08:04.000000000 +0900
+++ current_test/mm/page_alloc.c 2007-05-08 15:08:26.000000000 +0900
@@ -4411,6 +4411,51 @@ free_all_isolated_pages(struct isolation
}
}

+/*
+ * Isolate already freed pages.
+ */
+int
+capture_isolate_freed_pages(struct isolation_info *info)
+{
+ struct zone *zone;
+ unsigned long pfn;
+ struct page *page;
+ int order, order_size;
+ int nr_pages = 0;
+ unsigned long last_pfn = info->end_pfn - 1;
+ pfn = info->start_pfn;
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+ zone = info->zone;
+ if ((zone != page_zone(pfn_to_page(pfn))) ||
+ (zone != page_zone(pfn_to_page(last_pfn))))
+ return -EINVAL;
+ drain_all_pages();
+ spin_lock(&zone->lock);
+ while (pfn < info->end_pfn) {
+ if (!pfn_valid(pfn)) {
+ pfn++;
+ continue;
+ }
+ page = pfn_to_page(pfn);
+ /* See page_is_buddy() */
+ if (page_count(page) == 0 && PageBuddy(page)) {
+ order = page_order(page);
+ order_size = 1 << order;
+ zone->free_area[order].nr_free--;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -order_size);
+ list_del(&page->lru);
+ rmv_page_order(page);
+ isolate_page_nolock(info, page, order);
+ nr_pages += order_size;
+ pfn += order_size;
+ } else {
+ pfn++;
+ }
+ }
+ spin_unlock(&zone->lock);
+ return nr_pages;
+}
#endif /* CONFIG_PAGE_ISOLATION */

Index: current_test/include/linux/page_isolation.h
===================================================================
--- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:04.000000000 +0900
+++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:27.000000000 +0900
@@ -40,6 +40,7 @@ extern void free_isolation_info(struct i
extern void unuse_all_isolated_pages(struct isolation_info *info);
extern void free_all_isolated_pages(struct isolation_info *info);
extern void drain_all_pages(void);
+extern int capture_isolate_freed_pages(struct isolation_info *info);

#else

--
Yasunori Goto

2007-05-09 03:12:42

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [03/10] (drain all pages)

This patch add function drain_all_pages(void) to drain all
pages on per-cpu-freelist.
Page isolation will catch them in free_one_page.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

include/linux/page_isolation.h | 1 +
mm/page_alloc.c | 13 +++++++++++++
2 files changed, 14 insertions(+)

Index: current_test/mm/page_alloc.c
===================================================================
--- current_test.orig/mm/page_alloc.c 2007-05-08 15:08:03.000000000 +0900
+++ current_test/mm/page_alloc.c 2007-05-08 15:08:33.000000000 +0900
@@ -1070,6 +1070,19 @@ void drain_all_local_pages(void)
smp_call_function(smp_drain_local_pages, NULL, 0, 1);
}

+#ifdef CONFIG_PAGE_ISOLATION
+static void drain_local_zone_pages(struct work_struct *work)
+{
+ drain_local_pages();
+}
+
+void drain_all_pages(void)
+{
+ schedule_on_each_cpu(drain_local_zone_pages);
+}
+
+#endif /* CONFIG_PAGE_ISOLATION */
+
/*
* Free a 0-order page
*/
Index: current_test/include/linux/page_isolation.h
===================================================================
--- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:03.000000000 +0900
+++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:33.000000000 +0900
@@ -39,6 +39,7 @@ extern void detach_isolation_info_zone(s
extern void free_isolation_info(struct isolation_info *info);
extern void unuse_all_isolated_pages(struct isolation_info *info);
extern void free_all_isolated_pages(struct isolation_info *info);
+extern void drain_all_pages(void);

#else

--
Yasunori Goto

2007-05-09 03:13:18

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [05/10] (make basic remove code)

Add MEMORY_HOTREMOVE config and implements basic algorythm.

This config selects ZONE_MOVABLE and PAGE_ISOLATION

how work:
1. register Isololation area of specified section
2. search mem_map and migrate pages.
3. detach isolation and make pages unused.

This works on my easy test, but I think I need more work on loop algorythm
and policy.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

include/linux/memory_hotplug.h | 1
mm/Kconfig | 8 +
mm/memory_hotplug.c | 221 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 229 insertions(+), 1 deletion(-)

Index: current_test/mm/Kconfig
===================================================================
--- current_test.orig/mm/Kconfig 2007-05-08 15:08:03.000000000 +0900
+++ current_test/mm/Kconfig 2007-05-08 15:08:27.000000000 +0900
@@ -126,6 +126,12 @@ config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG

+config MEMORY_HOTREMOVE
+ bool "Allow for memory hot-remove"
+ depends on MEMORY_HOTPLUG_SPARSE
+ select MIGRATION
+ select PAGE_ISOLATION
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -145,7 +151,7 @@ config SPLIT_PTLOCK_CPUS
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA
+ depends on NUMA || MEMORY_HOTREMOVE
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful for
Index: current_test/mm/memory_hotplug.c
===================================================================
--- current_test.orig/mm/memory_hotplug.c 2007-05-08 15:02:48.000000000 +0900
+++ current_test/mm/memory_hotplug.c 2007-05-08 15:08:27.000000000 +0900
@@ -23,6 +23,9 @@
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/cpuset.h>
+#include <linux/page_isolation.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>

#include <asm/tlbflush.h>

@@ -308,3 +311,221 @@ error:
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
+
+
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+
+/*
+ * Just an easy implementation.
+ */
+static struct page *
+hotremove_migrate_alloc(struct page *page,
+ unsigned long private,
+ int **x)
+{
+ return alloc_page(GFP_HIGH_MOVABLE);
+}
+
+/* scans # of pages per itelation */
+#define HOTREMOVE_UNIT (1024)
+
+static int do_migrate_and_isolate_pages(struct isolation_info *info,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int move_pages = HOTREMOVE_UNIT;
+ int ret, managed, not_managed;
+ unsigned long pfn;
+ struct page *page;
+ LIST_HEAD(source);
+
+ not_managed = 0;
+ for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ if (!pfn_valid(pfn)) /* never happens in sparsemem */
+ continue;
+ page = pfn_to_page(pfn);
+ if (is_page_isolated(info,page))
+ continue;
+ ret = isolate_lru_page(page, &source);
+
+ if (ret == 0) {
+ move_pages--;
+ managed++;
+ } else {
+ if (page_count(page))
+ not_managed++; /* someone uses this */
+ }
+ }
+ ret = -EBUSY;
+ if (not_managed) {
+ if (!list_empty(&source))
+ putback_lru_pages(&source);
+ goto out;
+ }
+ ret = 0;
+ if (list_empty(&source))
+ goto out;
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc,
+ (unsigned long)info);
+out:
+ return ret;
+}
+
+
+/*
+ * Check All pages registered as IORESOURCE_RAM are isolated or not.
+ */
+static int check_removal_success(struct isolation_info *info)
+{
+ struct resource res;
+ unsigned long section_end;
+ unsigned long start_pfn, i, nr_pages;
+ struct page *page;
+ int removed = 0;
+ res.start = info->start_pfn << PAGE_SHIFT;
+ res.end = (info->end_pfn - 1) << PAGE_SHIFT;
+ res.flags = IORESOURCE_MEM;
+ section_end = res.end;
+ while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
+ start_pfn =(res.start >> PAGE_SHIFT);
+ nr_pages = (res.end + 1UL - res.start) >> PAGE_SHIFT;
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(start_pfn + i);
+ if (!is_page_isolated(info,page))
+ return -EBUSY;
+ removed++;
+ }
+ res.start = res.end + 1;
+ res.end = section_end;
+ }
+ return removed;
+}
+/*
+ * start_pfn and end_pfn myst be aligned to SECTION_SIZE.
+ * start_pfn and end_pfn must be in the same zone.
+ * target page range must be in ZONE_MOVABLE.
+ *
+ * Under this, [start_pfn, end_pfn) pages are isolated.
+ * All freed pages in the range is captured info isolation_info.
+ *
+ * If all pages in the range are isolated, offline_pages() returns 0.
+ *
+ * Note: memory holes in section are marked as Reserved Memory.
+ * So we igonre Reserved pages in the first check.
+ * But bootmem is aslo makred as Reserved.
+ * We check memory resouce information and confirm we freed
+ * All necessary pages.
+ */
+
+int offline_pages(unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long timeout)
+{
+ struct isolation_info *info;
+ struct page *page;
+ LIST_HEAD(pagelist);
+ int ret, nr_pages;
+ unsigned long expire = jiffies + timeout;
+ struct zone *zone;
+ unsigned long pfn, offlined_pages;
+
+ if (start_pfn & (PAGES_PER_SECTION - 1))
+ return -EINVAL;
+ if (end_pfn & (PAGES_PER_SECTION - 1))
+ return -EINVAL;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+
+ if (zone != zone->zone_pgdat->node_zones + ZONE_MOVABLE)
+ return -EBUSY;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageSlab(page) ||
+ PageUncached(page) ||
+ PageCompound(page))
+ break;
+ }
+ if (pfn < end_pfn)
+ return -EBUSY;
+
+ info = register_isolation(start_pfn, end_pfn);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ /* start memory hot removal */
+
+ ret = capture_isolate_freed_pages(info);
+ if(ret < 0)
+ goto failed_removal;
+
+ nr_pages = end_pfn - start_pfn;
+ pfn = start_pfn;
+repeat:
+ ret = -EAGAIN;
+ if (time_after(jiffies, expire))
+ goto failed_removal;
+ ret = -EINTR;
+ if (signal_pending(current))
+ goto failed_removal;
+
+ lru_add_drain_all();
+
+ for(;pfn < end_pfn;pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageReserved(page)) /* ignore Resrved page for now */
+ continue;
+ if (!is_page_isolated(info,page))
+ break;
+ }
+
+ if (pfn != end_pfn) {
+ ret = do_migrate_and_isolate_pages(info, pfn, end_pfn);
+ if (!ret) {
+ cond_resched();
+ goto repeat;
+ } else if (ret < 0) {
+ ret = -EBUSY;
+ goto failed_removal;
+ } else if (ret > 0) {
+ /* some congestion found. sleep a bit */
+ msleep(10);
+ goto repeat;
+ }
+ }
+ /* check memory holes and bootmem */
+ ret = check_removal_success(info);
+ if (ret < 0) {
+ goto failed_removal;
+ }
+ offlined_pages = ret;
+ /* all pages are isolated */
+ detach_isolation_info_zone(info);
+ unuse_all_isolated_pages(info);
+ free_isolation_info(info);
+ zone->present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= offlined_pages;
+ totalram_pages -= offlined_pages;
+ num_physpages -= offlined_pages;
+ vm_total_pages = nr_free_pagecache_pages();
+ writeback_set_ratelimit();
+ return 0;
+
+failed_removal:
+ if (ret == -EBUSY) {
+ printk("some unremovable pages are included in %lx to %lx\n",
+ info->start_pfn, info->end_pfn);
+ }
+ /* push back to free_list */
+ detach_isolation_info_zone(info);
+ free_all_isolated_pages(info);
+ free_isolation_info(info);
+ return ret;
+}
+
+#endif
Index: current_test/include/linux/memory_hotplug.h
===================================================================
--- current_test.orig/include/linux/memory_hotplug.h 2007-05-08 15:02:48.000000000 +0900
+++ current_test/include/linux/memory_hotplug.h 2007-05-08 15:08:06.000000000 +0900
@@ -59,6 +59,7 @@ extern int add_one_highpage(struct page
extern void online_page(struct page *page);
/* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long, unsigned long);
+extern int offline_pages(unsigned long, unsigned long, unsigned long);

/* reasonably generic interface to expand the physical pages in a zone */
extern int __add_pages(struct zone *zone, unsigned long start_pfn,

--
Yasunori Goto

2007-05-09 03:13:41

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [06/10] (ia64's remove_memory code)

Call offline pages from remove_memory().
Signed-off-by: Yasunori Goto <[email protected]>

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
arch/ia64/mm/init.c | 13 ++++++++++++-
1 files changed, 12 insertions(+), 1 deletion(-)

Index: current_test/arch/ia64/mm/init.c
===================================================================
--- current_test.orig/arch/ia64/mm/init.c 2007-05-08 15:07:20.000000000 +0900
+++ current_test/arch/ia64/mm/init.c 2007-05-08 15:08:07.000000000 +0900
@@ -726,7 +726,18 @@ int arch_add_memory(int nid, u64 start,

int remove_memory(u64 start, u64 size)
{
- return -EINVAL;
+ unsigned long start_pfn, end_pfn;
+ unsigned long timeout = 120 * HZ;
+ int ret;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = start_pfn + (size >> PAGE_SHIFT);
+ ret = offline_pages(start_pfn, end_pfn, timeout);
+ if (ret)
+ goto out;
+ /* we can free mem_map at this point */
+out:
+ return ret;
}
+
EXPORT_SYMBOL_GPL(remove_memory);
#endif

--
Yasunori Goto

2007-05-09 03:14:44

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [10/10] (retry swap-in page)

There is a race condition between swap-in and unmap_and_move().
When swap-in occur, page_mapped might be not set yet.
So, unmap_and_move() gives up at once, and tries later.

Signed-off-by: Yasunori Goto <[email protected]>

mm/migrate.c | 5 +++++
1 files changed, 5 insertions(+)

Index: current_test/mm/migrate.c
===================================================================
--- current_test.orig/mm/migrate.c 2007-05-08 15:08:09.000000000 +0900
+++ current_test/mm/migrate.c 2007-05-08 15:08:09.000000000 +0900
@@ -670,6 +670,11 @@ static int unmap_and_move(new_page_t get
/* hold this anon_vma until remove_migration_ptes() finishes */
anon_vma_hold(page);
}
+
+ if (PageSwapCache(page) && !page_mapped(page))
+ /* swap in now. try lator*/
+ goto unlock;
+
/*
* Establish migration ptes or remove ptes
*/

--
Yasunori Goto

2007-05-09 03:15:21

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [09/10] (direct isolation for remove)

This patch is to isolate source page of migration ASAP in unmap_and_move(),
when memory-hotremove.

In old code, it uses just put_page(),
and we expected that migrated source page is catched in __free_one_page()
as isolated page. But, it is spooled in per_cpu_page and used soon
for next destination page of migration. This was cause of eternal loop in
offline_pages().

Signed-off-by: Yasunori Goto <[email protected]>

include/linux/page_isolation.h | 14 ++++++++++++
mm/Kconfig | 1
mm/migrate.c | 46 +++++++++++++++++++++++++++++++++++++++--
3 files changed, 59 insertions(+), 2 deletions(-)

Index: current_test/mm/migrate.c
===================================================================
--- current_test.orig/mm/migrate.c 2007-05-08 15:08:07.000000000 +0900
+++ current_test/mm/migrate.c 2007-05-08 15:08:21.000000000 +0900
@@ -249,6 +249,32 @@ static void remove_migration_ptes(struct
remove_file_migration_ptes(old, new);
}

+
+static int
+is_page_isolated_noinfo(struct page *page)
+{
+ int ret = 0;
+ struct zone *zone;
+ unsigned long flags;
+ struct isolation_info *info;
+
+ if (unlikely(PageReserved(page) && PagePrivate(page) &&
+ page_count(page) == 1)){
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->isolation_lock, flags);
+ list_for_each_entry(info, &zone->isolation_list, list) {
+ if (PageReserved(page) && PagePrivate(page) &&
+ page_count(page) == 1 &&
+ page->private == (unsigned long)info){
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&zone->isolation_lock, flags);
+
+ }
+ return ret;
+}
/*
* Something used the pte of a page under migration. We need to
* get to the page and wait until migration is finished.
@@ -278,7 +304,14 @@ void migration_entry_wait(struct mm_stru
get_page(page);
pte_unmap_unlock(ptep, ptl);
wait_on_page_locked(page);
- put_page(page);
+
+ /*
+ * The page might be migrated and directly isolated.
+ * If not, then release page.
+ */
+ if (!is_page_isolated_noinfo(page))
+ put_page(page);
+
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -653,6 +686,15 @@ static int unmap_and_move(new_page_t get
anon_vma_release(page);
}

+ if (rc != -EAGAIN && is_migrate_isolation(flag)) {
+ /* page must be removed sooner. */
+ list_del(&page->lru);
+ page_under_isolation(page_zone(page), page, 0);
+ __put_page(page);
+ unlock_page(page);
+ goto move_newpage;
+ }
+
unlock:
unlock_page(page);

@@ -758,7 +800,7 @@ int migrate_pages_and_remove(struct list
new_page_t get_new_page, unsigned long private)
{
return __migrate_pages(from, get_new_page, private,
- MIGRATE_NOCONTEXT);
+ MIGRATE_NOCONTEXT | MIGRATE_ISOLATION);
}
#endif

Index: current_test/include/linux/page_isolation.h
===================================================================
--- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:07.000000000 +0900
+++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:09.000000000 +0900
@@ -33,12 +33,20 @@ is_page_isolated(struct isolation_info *
}

#define MIGRATE_NOCONTEXT 0x1
+#define MIGRATE_ISOLATION 0x2
+
static inline int
is_migrate_nocontext(int flag)
{
return (flag & MIGRATE_NOCONTEXT) == MIGRATE_NOCONTEXT;
}

+static inline int
+is_migrate_isolation(int flag)
+{
+ return (flag & MIGRATE_ISOLATION) == MIGRATE_ISOLATION;
+}
+
extern struct isolation_info *
register_isolation(unsigned long start, unsigned long end);

@@ -64,5 +72,11 @@ is_migrate_nocontext(int flag)
return 0;
}

+static inline int
+is_migrate_isolation(int flag)
+{
+ return 0;
+}
+
#endif
#endif
Index: current_test/mm/Kconfig
===================================================================
--- current_test.orig/mm/Kconfig 2007-05-08 15:08:07.000000000 +0900
+++ current_test/mm/Kconfig 2007-05-08 15:08:09.000000000 +0900
@@ -169,6 +169,7 @@ config MIGRATION_REMOVE
migration target pages. This has a small race condition.
If this config is selected, some workaround for fix them is enabled.
This may be add slight performance influence.
+ In addition, page must be isolated sooner for remove.

config RESOURCES_64BIT
bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)

--
Yasunori Goto

2007-05-09 03:15:53

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [07/10] (delay freeing anon_vma)

Delaying freeing anon_vma until migration finishes.

We cannot trust page->mapping (of ANON) when page_mapcount(page) ==0.

page migration puts page_mocount(page) to be 0. So we have to
guarantee anon_vma pointed by page->mapping is valid by some hook.

Usual page migration guarantees this by mm->sem. but we can't do it.
So, just delaying freeing anon_vma.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

include/linux/migrate.h | 2 ++
include/linux/page_isolation.h | 14 ++++++++++++++
include/linux/rmap.h | 22 ++++++++++++++++++++++
mm/Kconfig | 12 ++++++++++++
mm/memory_hotplug.c | 4 ++--
mm/migrate.c | 37 +++++++++++++++++++++++++++++++------
mm/rmap.c | 36 +++++++++++++++++++++++++++++++++++-
7 files changed, 118 insertions(+), 9 deletions(-)

Index: current_test/mm/migrate.c
===================================================================
--- current_test.orig/mm/migrate.c 2007-05-08 15:06:50.000000000 +0900
+++ current_test/mm/migrate.c 2007-05-08 15:08:24.000000000 +0900
@@ -28,6 +28,7 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/page_isolation.h>

#include "internal.h"

@@ -607,7 +608,7 @@ static int move_to_new_page(struct page
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force)
+ struct page *page, int force, int flag)
{
int rc = 0;
int *result = NULL;
@@ -632,7 +633,10 @@ static int unmap_and_move(new_page_t get
goto unlock;
wait_on_page_writeback(page);
}
-
+ if (PageAnon(page) && is_migrate_nocontext(flag)) {
+ /* hold this anon_vma until remove_migration_ptes() finishes */
+ anon_vma_hold(page);
+ }
/*
* Establish migration ptes or remove ptes
*/
@@ -640,8 +644,14 @@ static int unmap_and_move(new_page_t get
if (!page_mapped(page))
rc = move_to_new_page(newpage, page);

- if (rc)
+ if (rc) {
remove_migration_ptes(page, page);
+ if (PageAnon(page) && is_migrate_nocontext(flag))
+ anon_vma_release(page);
+ } else {
+ if (PageAnon(newpage) && is_migrate_nocontext(flag))
+ anon_vma_release(page);
+ }

unlock:
unlock_page(page);
@@ -686,8 +696,8 @@ move_newpage:
*
* Return: Number of pages not migrated or error code.
*/
-int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private)
+static int __migrate_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private, int flag)
{
int retry = 1;
int nr_failed = 0;
@@ -707,7 +717,7 @@ int migrate_pages(struct list_head *from
cond_resched();

rc = unmap_and_move(get_new_page, private,
- page, pass > 2);
+ page, pass > 2, flag);

switch(rc) {
case -ENOMEM:
@@ -737,6 +747,21 @@ out:
return nr_failed + retry;
}

+int migrate_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private)
+{
+ return __migrate_pages(from, get_new_page, private, 0);
+}
+
+#ifdef CONFIG_MIGRATION_REMOVE
+int migrate_pages_and_remove(struct list_head *from,
+ new_page_t get_new_page, unsigned long private)
+{
+ return __migrate_pages(from, get_new_page, private,
+ MIGRATE_NOCONTEXT);
+}
+#endif
+
#ifdef CONFIG_NUMA
/*
* Move a list of individual pages
Index: current_test/include/linux/rmap.h
===================================================================
--- current_test.orig/include/linux/rmap.h 2007-05-08 15:06:49.000000000 +0900
+++ current_test/include/linux/rmap.h 2007-05-08 15:08:07.000000000 +0900
@@ -26,6 +26,9 @@
struct anon_vma {
spinlock_t lock; /* Serialize access to vma list */
struct list_head head; /* List of private "related" vmas */
+#ifdef CONFIG_MIGRATION_REMOVE
+ atomic_t hold; /* == 0 if we can free this immediately */
+#endif
};

#ifdef CONFIG_MMU
@@ -37,10 +40,14 @@ static inline struct anon_vma *anon_vma_
return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}

+#ifndef CONFIG_MIGRATION_REMOVE
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
kmem_cache_free(anon_vma_cachep, anon_vma);
}
+#else
+extern void anon_vma_free(struct anon_vma *anon_vma);
+#endif

static inline void anon_vma_lock(struct vm_area_struct *vma)
{
@@ -75,6 +82,21 @@ void page_add_file_rmap(struct page *);
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
void page_remove_rmap(struct page *, struct vm_area_struct *);

+#ifdef CONFIG_MIGRATION_REMOVE
+/*
+ * While Page migration without any process context, we doesn't have
+ * mm->sem. Because page->mapcount goes down to 0 while migration,
+ * we cannot trust page->mapping value.
+ * These two functions prevents anon_vma from being freed while
+ * migration.
+ */
+void anon_vma_hold(struct page *page);
+void anon_vma_release(struct page *page);
+#else /* !CONFIG_MIGRATION_REMOVE */
+#define anon_vma_hold(page) do{}while(0)
+#define anon_vma_release(page) do{}while(0)
+#endif /* CONFIG_MIGRATION_REMOVE */
+
/*
* Called from mm/vmscan.c to handle paging out
*/
Index: current_test/mm/rmap.c
===================================================================
--- current_test.orig/mm/rmap.c 2007-05-08 15:06:50.000000000 +0900
+++ current_test/mm/rmap.c 2007-05-08 15:08:07.000000000 +0900
@@ -155,8 +155,9 @@ void anon_vma_unlink(struct vm_area_stru
empty = list_empty(&anon_vma->head);
spin_unlock(&anon_vma->lock);

- if (empty)
+ if (empty) {
anon_vma_free(anon_vma);
+ }
}

static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
@@ -1003,3 +1004,36 @@ int try_to_unmap(struct page *page, int
return ret;
}

+#ifdef CONFIG_MIGRATION_REMOVE
+
+void anon_vma_free(struct anon_vma *anon)
+{
+ if (atomic_read(&anon->hold) == 0) {
+ kmem_cache_free(anon_vma_cachep, anon);
+ }
+}
+
+void anon_vma_hold(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ return;
+ atomic_set(&anon_vma->hold, 1);
+ spin_unlock(&anon_vma->lock);
+}
+
+void anon_vma_release(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ int empty;
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ return;
+ atomic_set(&anon_vma->hold, 0);
+ empty = list_empty(&anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+ if (empty)
+ anon_vma_free(anon_vma);
+}
+#endif
Index: current_test/mm/Kconfig
===================================================================
--- current_test.orig/mm/Kconfig 2007-05-08 15:08:06.000000000 +0900
+++ current_test/mm/Kconfig 2007-05-08 15:08:24.000000000 +0900
@@ -131,6 +131,7 @@ config MEMORY_HOTREMOVE
depends on MEMORY_HOTPLUG_SPARSE
select MIGRATION
select PAGE_ISOLATION
+ select MIGRATION_REMOVE

# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
@@ -158,6 +159,17 @@ config MIGRATION
example on NUMA systems to put pages nearer to the processors accessing
the page.

+config MIGRATION_REMOVE
+ bool "Page migration for memory remove"
+ def_bool y
+ depends on MEMORY_HOTREMOVE
+ help
+ When Memory-Hotremove is executed, page migraion runs.
+ But a process which does page migraion doesn't have context of
+ migration target pages. This has a small race condition.
+ If this config is selected, some workaround for fix them is enabled.
+ This may be add slight performance influence.
+
config RESOURCES_64BIT
bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
default 64BIT
Index: current_test/include/linux/migrate.h
===================================================================
--- current_test.orig/include/linux/migrate.h 2007-05-08 15:06:49.000000000 +0900
+++ current_test/include/linux/migrate.h 2007-05-08 15:08:07.000000000 +0900
@@ -30,6 +30,8 @@ extern int putback_lru_pages(struct list
extern int migrate_page(struct address_space *,
struct page *, struct page *);
extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long);
+extern int migrate_pages_and_remove(struct list_head *l,
+ new_page_t x, unsigned long);

extern int fail_migrate_page(struct address_space *,
struct page *, struct page *);
Index: current_test/mm/memory_hotplug.c
===================================================================
--- current_test.orig/mm/memory_hotplug.c 2007-05-08 15:08:06.000000000 +0900
+++ current_test/mm/memory_hotplug.c 2007-05-08 15:08:07.000000000 +0900
@@ -345,7 +345,7 @@ static int do_migrate_and_isolate_pages(
if (!pfn_valid(pfn)) /* never happens in sparsemem */
continue;
page = pfn_to_page(pfn);
- if (is_page_isolated(info,page))
+ if (PageReserved(page))
continue;
ret = isolate_lru_page(page, &source);

@@ -367,7 +367,7 @@ static int do_migrate_and_isolate_pages(
if (list_empty(&source))
goto out;
/* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc,
+ ret = migrate_pages_and_remove(&source, hotremove_migrate_alloc,
(unsigned long)info);
out:
return ret;
Index: current_test/include/linux/page_isolation.h
===================================================================
--- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:05.000000000 +0900
+++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:24.000000000 +0900
@@ -32,6 +32,13 @@ is_page_isolated(struct isolation_info *
return 0;
}

+#define MIGRATE_NOCONTEXT 0x1
+static inline int
+is_migrate_nocontext(int flag)
+{
+ return (flag & MIGRATE_NOCONTEXT) == MIGRATE_NOCONTEXT;
+}
+
extern struct isolation_info *
register_isolation(unsigned long start, unsigned long end);

@@ -50,5 +57,12 @@ page_under_isolation(struct zone *zone,
return 0;
}

+
+static inline int
+is_migrate_nocontext(int flag)
+{
+ return 0;
+}
+
#endif
#endif

--
Yasunori Goto

2007-05-09 03:16:27

[permalink] [raw]

Subject: [RFC] memory hotremove patch take 2 [08/10] (memap init alignment)

If there is small hole at end of a section, there are not initialized pages.
To find it, messy check is necessary at many place of memory remove code.
But, reserved bit by initialization is enough for most case of them.

Signed-off-by: Yasunori Goto <[email protected]>

mm/page_alloc.c | 5 +++++
1 files changed, 5 insertions(+)

Index: current_test/mm/page_alloc.c
===================================================================
--- current_test.orig/mm/page_alloc.c 2007-05-08 15:08:05.000000000 +0900
+++ current_test/mm/page_alloc.c 2007-05-08 15:08:08.000000000 +0900
@@ -2434,6 +2434,11 @@ void __meminit memmap_init_zone(unsigned
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;

+
+#ifdef CONFIG_SPARSEMEM
+ end_pfn = roundup(end_pfn, PAGES_PER_SECTION);
+#endif
+
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s

--
Yasunori Goto

2007-05-09 03:26:25

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [10/10] (retry swap-in page)

On Wed, 09 May 2007 12:12:32 +0900
Yasunori Goto <[email protected]> wrote:

> There is a race condition between swap-in and unmap_and_move().
> When swap-in occur, page_mapped might be not set yet.
> So, unmap_and_move() gives up at once, and tries later.
>
>
Note: this will not happen in sys_migratepage(), it holds mm->sem and
gathers migration target page from page table.

-Kame

2007-05-10 12:46:58

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [01/10] (counter of removable page)

On Wed, 9 May 2007, Yasunori Goto wrote:

> Show #of Movable pages and vmstat.
>
> Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Yasunori Goto <[email protected]>
>
> arch/ia64/mm/init.c | 2 ++
> drivers/base/node.c | 4 ++++
> fs/proc/proc_misc.c | 4 ++++
> include/linux/kernel.h | 2 ++
> include/linux/swap.h | 1 +
> mm/page_alloc.c | 22 ++++++++++++++++++++++
> 6 files changed, 35 insertions(+)
>
> Index: current_test/mm/page_alloc.c
> ===================================================================
> --- current_test.orig/mm/page_alloc.c 2007-05-08 15:06:50.000000000 +0900
> +++ current_test/mm/page_alloc.c 2007-05-08 15:08:36.000000000 +0900
> @@ -58,6 +58,7 @@ unsigned long totalram_pages __read_most
> unsigned long totalreserve_pages __read_mostly;
> long nr_swap_pages;
> int percpu_pagelist_fraction;
> +unsigned long total_movable_pages __read_mostly;
>

Is it really necessary to have this as a separate value? It could be
calculated at the same time as nr_free_movable_pages() seeing as that is
called for meminfo anyway if the read was protected with
zone_span_seqbegin()+zone_span_seqretry().

> static void __free_pages_ok(struct page *page, unsigned int order);
>
> @@ -1827,6 +1828,18 @@ static unsigned int nr_free_zone_pages(i
> return sum;
> }
>
> +unsigned int nr_free_movable_pages(void)
> +{
> + unsigned long nr_pages = 0;
> + struct zone *zone;
> + int nid;
> +
> + for_each_online_node(nid) {
> + zone = &(NODE_DATA(nid)->node_zones[ZONE_MOVABLE]);
> + nr_pages += zone_page_state(zone, NR_FREE_PAGES);
> + }
> + return nr_pages;
> +}
> /*
> * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
> */
> @@ -1889,6 +1902,8 @@ void si_meminfo(struct sysinfo *val)
> val->totalhigh = totalhigh_pages;
> val->freehigh = nr_free_highpages();
> val->mem_unit = PAGE_SIZE;
> + val->movable = total_movable_pages;
> + val->free_movable = nr_free_movable_pages();
> }
>
> EXPORT_SYMBOL(si_meminfo);
> @@ -1908,6 +1923,11 @@ void si_meminfo_node(struct sysinfo *val
> val->totalhigh = 0;
> val->freehigh = 0;
> #endif
> +
> + val->movable = pgdat->node_zones[ZONE_MOVABLE].present_pages;

I think this has to be protected with zone_span_seqbegin()

> + val->free_movable = zone_page_state(&pgdat->node_zones[ZONE_MOVABLE],
> + NR_FREE_PAGES);
> +
> val->mem_unit = PAGE_SIZE;
> }
> #endif
> @@ -3216,6 +3236,8 @@ static void __meminit free_area_init_cor
>
> zone->spanned_pages = size;
> zone->present_pages = realsize;
> + if (j == ZONE_MOVABLE)
> + total_movable_pages += realsize;

If total_movable_pages is calculated at the same time as free pages,
this could go away. Similar for online_page() later

> #ifdef CONFIG_NUMA
> zone->node = nid;
> zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> Index: current_test/include/linux/kernel.h
> ===================================================================
> --- current_test.orig/include/linux/kernel.h 2007-05-08 15:06:49.000000000 +0900
> +++ current_test/include/linux/kernel.h 2007-05-08 15:07:20.000000000 +0900
> @@ -352,6 +352,8 @@ struct sysinfo {
> unsigned short pad; /* explicit padding for m68k */
> unsigned long totalhigh; /* Total high memory size */
> unsigned long freehigh; /* Available high memory size */
> + unsigned long movable; /* pages used only for data */
> + unsigned long free_movable; /* Avaiable pages in movable */
> unsigned int mem_unit; /* Memory unit size in bytes */
> char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
> };
> Index: current_test/fs/proc/proc_misc.c
> ===================================================================
> --- current_test.orig/fs/proc/proc_misc.c 2007-05-08 15:06:48.000000000 +0900
> +++ current_test/fs/proc/proc_misc.c 2007-05-08 15:07:20.000000000 +0900
> @@ -161,6 +161,8 @@ static int meminfo_read_proc(char *page,
> "LowTotal: %8lu kB\n"
> "LowFree: %8lu kB\n"
> #endif
> + "MovableTotal: %8lu kB\n"
> + "MovableFree: %8lu kB\n"
> "SwapTotal: %8lu kB\n"
> "SwapFree: %8lu kB\n"
> "Dirty: %8lu kB\n"
> @@ -191,6 +193,8 @@ static int meminfo_read_proc(char *page,
> K(i.totalram-i.totalhigh),
> K(i.freeram-i.freehigh),
> #endif
> + K(i.movable),
> + K(i.free_movable),
> K(i.totalswap),
> K(i.freeswap),
> K(global_page_state(NR_FILE_DIRTY)),
> Index: current_test/drivers/base/node.c
> ===================================================================
> --- current_test.orig/drivers/base/node.c 2007-05-08 15:06:10.000000000 +0900
> +++ current_test/drivers/base/node.c 2007-05-08 15:07:20.000000000 +0900
> @@ -55,6 +55,8 @@ static ssize_t node_read_meminfo(struct
> "Node %d LowTotal: %8lu kB\n"
> "Node %d LowFree: %8lu kB\n"
> #endif
> + "Node %d MovableTotal: %8lu kB\n"
> + "Node %d MovableFree: %8lu kB\n"
> "Node %d Dirty: %8lu kB\n"
> "Node %d Writeback: %8lu kB\n"
> "Node %d FilePages: %8lu kB\n"
> @@ -77,6 +79,8 @@ static ssize_t node_read_meminfo(struct
> nid, K(i.totalram - i.totalhigh),
> nid, K(i.freeram - i.freehigh),
> #endif
> + nid, K(i.movable),
> + nid, K(i.free_movable),
> nid, K(node_page_state(nid, NR_FILE_DIRTY)),
> nid, K(node_page_state(nid, NR_WRITEBACK)),
> nid, K(node_page_state(nid, NR_FILE_PAGES)),
> Index: current_test/arch/ia64/mm/init.c
> ===================================================================
> --- current_test.orig/arch/ia64/mm/init.c 2007-05-08 15:06:38.000000000 +0900
> +++ current_test/arch/ia64/mm/init.c 2007-05-08 15:08:29.000000000 +0900
> @@ -700,6 +700,8 @@ void online_page(struct page *page)
> __free_page(page);
> totalram_pages++;
> num_physpages++;
> + if (page_zonenum(page) == ZONE_MOVABLE)
> + total_movable_pages++;
> }
>
> int arch_add_memory(int nid, u64 start, u64 size)
> Index: current_test/include/linux/swap.h
> ===================================================================
> --- current_test.orig/include/linux/swap.h 2007-05-08 15:06:49.000000000 +0900
> +++ current_test/include/linux/swap.h 2007-05-08 15:07:20.000000000 +0900
> @@ -169,6 +169,7 @@ extern void swapin_readahead(swp_entry_t
> /* linux/mm/page_alloc.c */
> extern unsigned long totalram_pages;
> extern unsigned long totalreserve_pages;
> +extern unsigned long total_movable_pages;

If total_movable_pages was calculated on the fly, this extern could also
go away because online_page() would not need it any more.

> extern long nr_swap_pages;
> extern unsigned int nr_free_buffer_pages(void);
> extern unsigned int nr_free_pagecache_pages(void);
>
> --
> Yasunori Goto
>
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2007-05-10 12:47:19

by Andi Kleen

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [01/10] (counter of removable page)

Yasunori Goto <[email protected]> writes:

(not a full review, just something I noticed)
> @@ -352,6 +352,8 @@ struct sysinfo {
> unsigned short pad; /* explicit padding for m68k */
> unsigned long totalhigh; /* Total high memory size */
> unsigned long freehigh; /* Available high memory size */
> + unsigned long movable; /* pages used only for data */
> + unsigned long free_movable; /* Avaiable pages in movable */

You can't just change that structure, it is exported to user space.

-Andi

2007-05-10 15:34:22

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [02/10] (make page unused)

On Wed, 9 May 2007, Yasunori Goto wrote:

> This patch is for supporting making page unused.
>

Without reading the patch, this could also be interesting when trying to
free a block of pages for a contiguous allocation without racing against
other allocators.

> Isolate pages by capturing freed pages before inserting free_area[],
> buddy allocator.
> If you have an idea for avoiding spin_lock(), please advise me.
>

Again, commenting on this before I read the patch. Grouping pages by
mobility uses a bitmap to track flags affecting a block of pages. If you
used a bit there and added a MIGRATE_ISOLATING type, the pages on free
would get placed in those freelists. As long as MIGRATE_ISOLATING is not
in fallbacks[] in page_alloc.c, the pages would not get allocated. This
should avoid the need for a separate spinlock.

That said, it increases the size of struct zone more than yours do and
ties these patches to a part of grouping pages by mobility which you don't
do currently.

> Isolating pages in free_area[] is implemented in other patch.
>

I haven't seen that part yet but it sounds like it does something similar
to move_freepages() so there may be code to be shared there.

> Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Yasunori Goto <[email protected]>
>
>
> include/linux/mmzone.h | 8 +
> include/linux/page_isolation.h | 52 +++++++++++
> mm/Kconfig | 7 +
> mm/page_alloc.c | 187 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 254 insertions(+)
>
> Index: current_test/include/linux/mmzone.h
> ===================================================================
> --- current_test.orig/include/linux/mmzone.h 2007-05-08 15:06:49.000000000 +0900
> +++ current_test/include/linux/mmzone.h 2007-05-08 15:08:03.000000000 +0900
> @@ -314,6 +314,14 @@ struct zone {
> /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
> unsigned long zone_start_pfn;
>
> +#ifdef CONFIG_PAGE_ISOLATION
> + /*
> + * For pages which are not used but not free.
> + * See include/linux/page_isolation.h
> + */
> + spinlock_t isolation_lock;
> + struct list_head isolation_list;
> +#endif

Using MIGRATE_ISOLATING instead of this approach does mean that there will
be MAX_ORDER additional struct free_area added to the zone. That is more
lists than this approach.

I am somewhat suprised that CONFIG_PAGE_ISOLATION exists as a separate
option. If it was a compile-time option at all, I would expect it to
depend on memory hot-remove being selected.

> /*
> * zone_start_pfn, spanned_pages and present_pages are all
> * protected by span_seqlock. It is a seqlock because it has
> Index: current_test/mm/page_alloc.c
> ===================================================================
> --- current_test.orig/mm/page_alloc.c 2007-05-08 15:07:20.000000000 +0900
> +++ current_test/mm/page_alloc.c 2007-05-08 15:08:34.000000000 +0900
> @@ -41,6 +41,7 @@
> #include <linux/pfn.h>
> #include <linux/backing-dev.h>
> #include <linux/fault-inject.h>
> +#include <linux/page_isolation.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -448,6 +449,9 @@ static inline void __free_one_page(struc
> if (unlikely(PageCompound(page)))
> destroy_compound_page(page, order);
>
> + if (page_under_isolation(zone, page, order))
> + return;
> +

Using MIGRATE_ISOLATING would avoid a potential list search here.

> page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
>
> VM_BUG_ON(page_idx & (order_size - 1));
> @@ -3259,6 +3263,10 @@ static void __meminit free_area_init_cor
> zone->nr_scan_inactive = 0;
> zap_zone_vm_stats(zone);
> atomic_set(&zone->reclaim_in_progress, 0);
> +#ifdef CONFIG_PAGE_ISOLATION
> + spin_lock_init(&zone->isolation_lock);
> + INIT_LIST_HEAD(&zone->isolation_list);
i> +#endif
> if (!size)
> continue;
>
> @@ -4214,3 +4222,182 @@ void set_pageblock_flags_group(struct pa
> else
> __clear_bit(bitidx + start_bitidx, bitmap);
> }
> +
> +#ifdef CONFIG_PAGE_ISOLATION
> +/*
> + * Page Isolation.
> + *
> + * If a page is removed from usual free_list and will never be used,
> + * It is linked to "struct isolation_info" and set Reserved, Private
> + * bit. page->mapping points to isolation_info in it.
> + * and page_count(page) is 0.
> + *
> + * This can be used for creating a chunk of contiguous *unused* memory.
> + *
> + * current user is Memory-Hot-Remove.
> + * maybe move to some other file is better.

page_isolation.c to match the header filename seems reasonable.
page_alloc.c has a lot of multi-function stuff like memory initialisation
in it.

> + */
> +static void
> +isolate_page_nolock(struct isolation_info *info, struct page *page, int order)
> +{
> + int pagenum;
> + pagenum = 1 << order;
> + while (pagenum > 0) {
> + SetPageReserved(page);
> + SetPagePrivate(page);
> + page->private = (unsigned long)info;
> + list_add(&page->lru, &info->pages);
> + page++;
> + pagenum--;
> + }
> +}

It's worth commenting somewhere that pages on the list in isolation_info
are always order-0.

> +
> +/*
> + * This function is called from page_under_isolation()
> + */
> +
> +int __page_under_isolation(struct zone *zone, struct page *page, int order)
> +{
> + struct isolation_info *info;
> + unsigned long pfn = page_to_pfn(page);
> + unsigned long flags;
> + int found = 0;
> +
> + spin_lock_irqsave(&zone->isolation_lock,flags);

An unwritten convention seems to be that __ versions of same-named
functions are the nolock version. i.e. I would expect
page_under_isolation() to acquire and release the spinlock and
__page_under_isolation() to do no additional locking.

Locking outside of here might make the flow a little clearer as well if
you had two returns and avoided the use of "found".

> + list_for_each_entry(info, &zone->isolation_list, list) {
> + if (info->start_pfn <= pfn && pfn < info->end_pfn) {
> + found = 1;
> + break;
> + }
> + }
> + if (found) {
> + isolate_page_nolock(info, page, order);
> + }
> + spin_unlock_irqrestore(&zone->isolation_lock, flags);
> + return found;
> +}
> +
> +/*
> + * start and end must be in the same zone.
> + *
> + */
> +struct isolation_info *
> +register_isolation(unsigned long start, unsigned long end)
> +{
> + struct zone *zone;
> + struct isolation_info *info = NULL, *tmp;
> + unsigned long flags;
> + unsigned long last_pfn = end - 1;
> +
> + if (!pfn_valid(start) || !pfn_valid(last_pfn) || (start >= end))
> + return ERR_PTR(-EINVAL);
> + /* check start and end is in the same zone */
> + zone = page_zone(pfn_to_page(start));
> +
> + if (zone != page_zone(pfn_to_page(last_pfn)))
> + return ERR_PTR(-EINVAL);
> + /* target range has to match MAX_ORDER alignmet */
> + if ((start & (MAX_ORDER_NR_PAGES - 1)) ||
> + (end & (MAX_ORDER_NR_PAGES - 1)))
> + return ERR_PTR(-EINVAL);

Why does the range have to be MAX_ORDER alighned?

> + info = kmalloc(sizeof(*info), GFP_KERNEL);
> + if (!info)
> + return ERR_PTR(-ENOMEM);
> + spin_lock_irqsave(&zone->isolation_lock, flags);
> + /* we don't allow overlap among isolation areas */
> + if (!list_empty(&zone->isolation_list)) {
> + list_for_each_entry(tmp, &zone->isolation_list, list) {
> + if (start < tmp->end_pfn && end > tmp->start_pfn) {
> + goto out_free;
> + }
> + }
> + }

Why not merge requests for overlapping isolations?

> + info->start_pfn = start;
> + info->end_pfn = end;
> + info->zone = zone;
> + INIT_LIST_HEAD(&info->list);
> + INIT_LIST_HEAD(&info->pages);
> + list_add(&info->list, &zone->isolation_list);
> +out_unlock:
> + spin_unlock_irqrestore(&zone->isolation_lock, flags);
> + return info;
> +out_free:
> + kfree(info);
> + info = ERR_PTR(-EBUSY);
> + goto out_unlock;
> +}
> +/*
> + * Remove IsolationInfo from zone.
> + * After this, we can unuse memory in info or
> + * free back to freelist.
> + */
> +
> +void
> +detach_isolation_info_zone(struct isolation_info *info)
> +{
> + unsigned long flags;
> + struct zone *zone = info->zone;
> + spin_lock_irqsave(&zone->isolation_lock,flags);
> + list_del(&info->list);
> + info->zone = NULL;
> + spin_unlock_irqrestore(&zone->isolation_lock,flags);
> +}
> +
> +/*
> + * All pages in info->pages should be remvoed before calling this.
> + * And info should be detached from zone.
> + */
> +void
> +free_isolation_info(struct isolation_info *info)
> +{
> + BUG_ON(!list_empty(&info->pages));
> + BUG_ON(info->zone);
> + kfree(info);
> + return;
> +}
> +
> +/*
> + * Mark All pages in the isolation_info to be Reserved.
> + * When onlining these pages again, a user must check
> + * which page is usable by IORESOURCE_RAM
> + * please see memory_hotplug.c/online_pages() if unclear.
> + *
> + * info should be detached from zone before calling this.
> + */
> +void
> +unuse_all_isolated_pages(struct isolation_info *info)
> +{
> + struct page *page, *n;
> + BUG_ON(info->zone);
> + list_for_each_entry_safe(page, n, &info->pages, lru) {
> + SetPageReserved(page);
> + page->private = 0;
> + ClearPagePrivate(page);
> + list_del(&page->lru);
> + }
> +}
> +
> +/*
> + * Free all pages connected in isolation list.
> + * pages are moved back to free_list.
> + */
> +void
> +free_all_isolated_pages(struct isolation_info *info)
> +{
> + struct page *page, *n;
> + BUG_ON(info->zone);
> + list_for_each_entry_safe(page, n ,&info->pages, lru) {
> + ClearPagePrivate(page);
> + ClearPageReserved(page);
> + page->private = 0;
> + list_del(&page->lru);
> + set_page_count(page, 0);
> + set_page_refcounted(page);
> + /* This is sage because info is detached from zone */

s/sage/safe/

> + __free_page(page);
> + }
> +}
> +
> +#endif /* CONFIG_PAGE_ISOLATION */
> +
> +
> Index: current_test/mm/Kconfig
> ===================================================================
> --- current_test.orig/mm/Kconfig 2007-05-08 15:06:50.000000000 +0900
> +++ current_test/mm/Kconfig 2007-05-08 15:08:31.000000000 +0900
> @@ -225,3 +225,10 @@ config DEBUG_READAHEAD
>
> Say N for production servers.
>
> +config PAGE_ISOLATION
> + bool "Page Isolation Framework"
> + help
> + This option adds page isolation framework to mm.
> + This is used for isolate amount of contiguous pages from linux
> + memory management.
> + Say N if unsure.
> Index: current_test/include/linux/page_isolation.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:34.000000000 +0900
> @@ -0,0 +1,52 @@
> +#ifndef __LINIX_PAGE_ISOLATION_H
> +#define __LINUX_PAGE_ISOLATION_H
> +
> +#ifdef CONFIG_PAGE_ISOLATION
> +
> +struct isolation_info {
> + struct list_head list;
> + unsigned long start_pfn;
> + unsigned long end_pfn;
> + struct zone *zone;
> + struct list_head pages;
> +};
> +
> +extern int
> +__page_under_isolation(struct zone *zone, struct page *page, int order);
> +
> +static inline int
> +page_under_isolation(struct zone *zone, struct page *page, int order)
> +{
> + if (likely(list_empty(&zone->isolation_list)))
> + return 0;
> + return __page_under_isolation(zone, page, order);
> +}
> +
> +static inline int
> +is_page_isolated(struct isolation_info *info, struct page *page)
> +{
> + if (PageReserved(page) && PagePrivate(page) &&
> + page_count(page) == 0 &&
> + page->private == (unsigned long)info)
> + return 1;
> + return 0;
> +}
> +
> +extern struct isolation_info *
> +register_isolation(unsigned long start, unsigned long end);
> +
> +extern void detach_isolation_info_zone(struct isolation_info *info);
> +extern void free_isolation_info(struct isolation_info *info);
> +extern void unuse_all_isolated_pages(struct isolation_info *info);
> +extern void free_all_isolated_pages(struct isolation_info *info);
> +
> +#else
> +
> +static inline int
> +page_under_isolation(struct zone *zone, struct page *page, int order)
> +{
> + return 0;
> +}
> +
> +#endif
> +#endif
>
> --
> Yasunori Goto
>
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2007-05-10 15:35:45

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [03/10] (drain all pages)

On Wed, 9 May 2007, Yasunori Goto wrote:

> This patch add function drain_all_pages(void) to drain all
> pages on per-cpu-freelist.
> Page isolation will catch them in free_one_page.
>

Is this significantly different to what drain_all_local_pages() currently
does?

> Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Yasunori Goto <[email protected]>
>
> include/linux/page_isolation.h | 1 +
> mm/page_alloc.c | 13 +++++++++++++
> 2 files changed, 14 insertions(+)
>
> Index: current_test/mm/page_alloc.c
> ===================================================================
> --- current_test.orig/mm/page_alloc.c 2007-05-08 15:08:03.000000000 +0900
> +++ current_test/mm/page_alloc.c 2007-05-08 15:08:33.000000000 +0900
> @@ -1070,6 +1070,19 @@ void drain_all_local_pages(void)
> smp_call_function(smp_drain_local_pages, NULL, 0, 1);
> }
>
> +#ifdef CONFIG_PAGE_ISOLATION
> +static void drain_local_zone_pages(struct work_struct *work)
> +{
> + drain_local_pages();
> +}
> +
> +void drain_all_pages(void)
> +{
> + schedule_on_each_cpu(drain_local_zone_pages);
> +}
> +
> +#endif /* CONFIG_PAGE_ISOLATION */
> +
> /*
> * Free a 0-order page
> */
> Index: current_test/include/linux/page_isolation.h
> ===================================================================
> --- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:03.000000000 +0900
> +++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:33.000000000 +0900
> @@ -39,6 +39,7 @@ extern void detach_isolation_info_zone(s
> extern void free_isolation_info(struct isolation_info *info);
> extern void unuse_all_isolated_pages(struct isolation_info *info);
> extern void free_all_isolated_pages(struct isolation_info *info);
> +extern void drain_all_pages(void);
>
> #else
>
>
> --
> Yasunori Goto
>
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2007-05-10 16:43:08

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [04/10] (isolate all free pages)

On Wed, 9 May 2007, Yasunori Goto wrote:

> Isolate all freed pages (means in buddy_list) in the range.
> See page_buddy() and free_one_page() function if unsure.
>
> Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>
> Signed-off-by: Yasunori Goto <[email protected]>
>
> include/linux/page_isolation.h | 1
> mm/page_alloc.c | 45 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 46 insertions(+)
>
> Index: current_test/mm/page_alloc.c
> ===================================================================
> --- current_test.orig/mm/page_alloc.c 2007-05-08 15:08:04.000000000 +0900
> +++ current_test/mm/page_alloc.c 2007-05-08 15:08:26.000000000 +0900
> @@ -4411,6 +4411,51 @@ free_all_isolated_pages(struct isolation
> }
> }
>
> +/*
> + * Isolate already freed pages.
> + */
> +int
> +capture_isolate_freed_pages(struct isolation_info *info)
> +{
> + struct zone *zone;
> + unsigned long pfn;
> + struct page *page;
> + int order, order_size;
> + int nr_pages = 0;
> + unsigned long last_pfn = info->end_pfn - 1;
> + pfn = info->start_pfn;
> + if (!pfn_valid(pfn))
> + return -EINVAL;

This may lead to boundary cases where pages cannot be captured at the
start and end of non-aligned zones due to memory holes.

> + zone = info->zone;
> + if ((zone != page_zone(pfn_to_page(pfn))) ||
> + (zone != page_zone(pfn_to_page(last_pfn))))
> + return -EINVAL;

Is this check really necessary? Surely a caller to
capture_isolate_freed_pages() will have already made all the necessary
checks when adding the struct insolation_info ?

> + drain_all_pages();
> + spin_lock(&zone->lock);
> + while (pfn < info->end_pfn) {
> + if (!pfn_valid(pfn)) {
> + pfn++;
> + continue;
> + }
> + page = pfn_to_page(pfn);
> + /* See page_is_buddy() */
> + if (page_count(page) == 0 && PageBuddy(page)) {

If PageBuddy is set it's free, you shouldn't have to check the page_count.

> + order = page_order(page);
> + order_size = 1 << order;
> + zone->free_area[order].nr_free--;
> + __mod_zone_page_state(zone, NR_FREE_PAGES, -order_size);
> + list_del(&page->lru);
> + rmv_page_order(page);
> + isolate_page_nolock(info, page, order);
> + nr_pages += order_size;
> + pfn += order_size;
> + } else {
> + pfn++;
> + }
> + }
> + spin_unlock(&zone->lock);
> + return nr_pages;
> +}
> #endif /* CONFIG_PAGE_ISOLATION */
>

This is all similar to move_freepages() other than the locking part. It
would be worth checking if there is code that could be shared or at least
have similar styles.

>
> Index: current_test/include/linux/page_isolation.h
> ===================================================================
> --- current_test.orig/include/linux/page_isolation.h 2007-05-08 15:08:04.000000000 +0900
> +++ current_test/include/linux/page_isolation.h 2007-05-08 15:08:27.000000000 +0900
> @@ -40,6 +40,7 @@ extern void free_isolation_info(struct i
> extern void unuse_all_isolated_pages(struct isolation_info *info);
> extern void free_all_isolated_pages(struct isolation_info *info);
> extern void drain_all_pages(void);
> +extern int capture_isolate_freed_pages(struct isolation_info *info);
>
> #else
>
>
> --
> Yasunori Goto
>
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2007-05-10 18:00:41

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [01/10] (counter of removable page)

On Wed, 9 May 2007, Yasunori Goto wrote:

>
> +unsigned int nr_free_movable_pages(void)
> +{
> + unsigned long nr_pages = 0;
> + struct zone *zone;
> + int nid;
> +
> + for_each_online_node(nid) {
> + zone = &(NODE_DATA(nid)->node_zones[ZONE_MOVABLE]);
> + nr_pages += zone_page_state(zone, NR_FREE_PAGES);
> + }
> + return nr_pages;
> +}

Hmmmm... This is redoing what the vm counters already provide

Could you add

NR_MOVABLE_PAGES etc.

instead and then let the ZVC counter logic take care of the rest?

With a ZVC you will have the numbers in each zone and also in
/proc/vmstat.

(Additional ulterior motive: If we ever get away from ZONE_MOVABLE and
make movable a portion of each zone then this will still work)

2007-05-10 18:04:45

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [02/10] (make page unused)

On Wed, 9 May 2007, Yasunori Goto wrote:

> This patch is for supporting making page unused.
>
> Isolate pages by capturing freed pages before inserting free_area[],
> buddy allocator.
> If you have an idea for avoiding spin_lock(), please advise me.

Using the zone lock instead may avoid to introduce another lock? Or is the
new lock here for performance reasons?

Isnt it possible to just add another flavor of pages like what Mel has
been doing with reclaimable and movable? I.e. add another category of free
pages to Mel's scheme called isolated and use Mel's function to move stuff
over there?

2007-05-10 18:07:41

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [03/10] (drain all pages)

On Wed, 9 May 2007, Yasunori Goto wrote:

> This patch add function drain_all_pages(void) to drain all
> pages on per-cpu-freelist.
> Page isolation will catch them in free_one_page.

This is only draining the pcps of the local processor. I would think
that you need to drain all other processors pcps of this zone as well. And
there is no need to drain this processors pcps of other zones.

2007-05-10 18:07:52

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [04/10] (isolate all free pages)

I think would also be taken care of by Mel's existing framework.

2007-05-10 18:09:38

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [05/10] (make basic remove code)

On Wed, 9 May 2007, Yasunori Goto wrote:

> +/*
> + * Just an easy implementation.
> + */
> +static struct page *
> +hotremove_migrate_alloc(struct page *page,
> + unsigned long private,
> + int **x)
> +{
> + return alloc_page(GFP_HIGH_MOVABLE);
> +}

This would need to reflect the zone in which you are performing hot
remove. Or is hot remove only possible in the higest zone?

2007-05-11 00:47:58

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [02/10] (make page unused)

On Thu, 10 May 2007 16:34:01 +0100 (IST)
Mel Gorman <[email protected]> wrote:

> > +#ifdef CONFIG_PAGE_ISOLATION
> > + /*
> > + * For pages which are not used but not free.
> > + * See include/linux/page_isolation.h
> > + */
> > + spinlock_t isolation_lock;
> > + struct list_head isolation_list;
> > +#endif
>
> Using MIGRATE_ISOLATING instead of this approach does mean that there will
> be MAX_ORDER additional struct free_area added to the zone. That is more
> lists than this approach.
>
Thank you!, its an interesting idea. I think it will make our code much
simpler. I'll look into.

> I am somewhat suprised that CONFIG_PAGE_ISOLATION exists as a separate
> option. If it was a compile-time option at all, I would expect it to
> depend on memory hot-remove being selected.
>
I myself think CONFIG_PAGE_ISOLATION can be used by some code which need to
isolate some amount of contiguous pages. So config is divided for now.
Now, CONFIG_MEMORY_HOTREMOVE selects this.
CONFIG_PAGE_ISOLATION and CONFIG_MEMORY_HOTREMOVE will be merged later
if there are no one who use this except for hot-removal.

> > /*
> > * zone_start_pfn, spanned_pages and present_pages are all
> > * protected by span_seqlock. It is a seqlock because it has
> > Index: current_test/mm/page_alloc.c
> > ===================================================================
> > --- current_test.orig/mm/page_alloc.c 2007-05-08 15:07:20.000000000 +0900
> > +++ current_test/mm/page_alloc.c 2007-05-08 15:08:34.000000000 +0900
> > @@ -41,6 +41,7 @@
> > #include <linux/pfn.h>
> > #include <linux/backing-dev.h>
> > #include <linux/fault-inject.h>
> > +#include <linux/page_isolation.h>
> >
> > #include <asm/tlbflush.h>
> > #include <asm/div64.h>
> > @@ -448,6 +449,9 @@ static inline void __free_one_page(struc
> > if (unlikely(PageCompound(page)))
> > destroy_compound_page(page, order);
> >
> > + if (page_under_isolation(zone, page, order))
> > + return;
> > +
>
> Using MIGRATE_ISOLATING would avoid a potential list search here.
>
yes. thank you.

> > page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
> >
> > VM_BUG_ON(page_idx & (order_size - 1));
> > @@ -3259,6 +3263,10 @@ static void __meminit free_area_init_cor
> > zone->nr_scan_inactive = 0;
> > zap_zone_vm_stats(zone);
> > atomic_set(&zone->reclaim_in_progress, 0);
> > +#ifdef CONFIG_PAGE_ISOLATION
> > + spin_lock_init(&zone->isolation_lock);
> > + INIT_LIST_HEAD(&zone->isolation_list);
> i> +#endif
> > if (!size)
> > continue;
> >
> > @@ -4214,3 +4222,182 @@ void set_pageblock_flags_group(struct pa
> > else
> > __clear_bit(bitidx + start_bitidx, bitmap);
> > }
> > +
> > +#ifdef CONFIG_PAGE_ISOLATION
> > +/*
> > + * Page Isolation.
> > + *
> > + * If a page is removed from usual free_list and will never be used,
> > + * It is linked to "struct isolation_info" and set Reserved, Private
> > + * bit. page->mapping points to isolation_info in it.
> > + * and page_count(page) is 0.
> > + *
> > + * This can be used for creating a chunk of contiguous *unused* memory.
> > + *
> > + * current user is Memory-Hot-Remove.
> > + * maybe move to some other file is better.
>
> page_isolation.c to match the header filename seems reasonable.
> page_alloc.c has a lot of multi-function stuff like memory initialisation
> in it.

Hmm.

>
> > + */
> > +static void
> > +isolate_page_nolock(struct isolation_info *info, struct page *page, int order)
> > +{
> > + int pagenum;
> > + pagenum = 1 << order;
> > + while (pagenum > 0) {
> > + SetPageReserved(page);
> > + SetPagePrivate(page);
> > + page->private = (unsigned long)info;
> > + list_add(&page->lru, &info->pages);
> > + page++;
> > + pagenum--;
> > + }
> > +}
>
> It's worth commenting somewhere that pages on the list in isolation_info
> are always order-0.
>
okay.

> > +
> > +/*
> > + * This function is called from page_under_isolation()
> > + */
> > +
> > +int __page_under_isolation(struct zone *zone, struct page *page, int order)
> > +{
> > + struct isolation_info *info;
> > + unsigned long pfn = page_to_pfn(page);
> > + unsigned long flags;
> > + int found = 0;
> > +
> > + spin_lock_irqsave(&zone->isolation_lock,flags);
>
> An unwritten convention seems to be that __ versions of same-named
> functions are the nolock version. i.e. I would expect
> page_under_isolation() to acquire and release the spinlock and
> __page_under_isolation() to do no additional locking.
>
> Locking outside of here might make the flow a little clearer as well if
> you had two returns and avoided the use of "found".
>
Maybe MOVABLE_ISOLATING will simplify these code.

> > + list_for_each_entry(info, &zone->isolation_list, list) {
> > + if (info->start_pfn <= pfn && pfn < info->end_pfn) {
> > + found = 1;
> > + break;
> > + }
> > + }
> > + if (found) {
> > + isolate_page_nolock(info, page, order);
> > + }
> > + spin_unlock_irqrestore(&zone->isolation_lock, flags);
> > + return found;
> > +}
> > +
> > +/*
> > + * start and end must be in the same zone.
> > + *
> > + */
> > +struct isolation_info *
> > +register_isolation(unsigned long start, unsigned long end)
> > +{
> > + struct zone *zone;
> > + struct isolation_info *info = NULL, *tmp;
> > + unsigned long flags;
> > + unsigned long last_pfn = end - 1;
> > +
> > + if (!pfn_valid(start) || !pfn_valid(last_pfn) || (start >= end))
> > + return ERR_PTR(-EINVAL);
> > + /* check start and end is in the same zone */
> > + zone = page_zone(pfn_to_page(start));
> > +
> > + if (zone != page_zone(pfn_to_page(last_pfn)))
> > + return ERR_PTR(-EINVAL);
> > + /* target range has to match MAX_ORDER alignmet */
> > + if ((start & (MAX_ORDER_NR_PAGES - 1)) ||
> > + (end & (MAX_ORDER_NR_PAGES - 1)))
> > + return ERR_PTR(-EINVAL);
>
> Why does the range have to be MAX_ORDER alighned?
>
> > + info = kmalloc(sizeof(*info), GFP_KERNEL);
> > + if (!info)
> > + return ERR_PTR(-ENOMEM);
> > + spin_lock_irqsave(&zone->isolation_lock, flags);
> > + /* we don't allow overlap among isolation areas */
> > + if (!list_empty(&zone->isolation_list)) {
> > + list_for_each_entry(tmp, &zone->isolation_list, list) {
> > + if (start < tmp->end_pfn && end > tmp->start_pfn) {
> > + goto out_free;
> > + }
> > + }
> > + }
>
> Why not merge requests for overlapping isolations?

This is related to memory-unplug interface. It doesn't allow overlaping.
So this is not expected to happen. just sanity check.
but this code will be removed by MIGRATE_ISOLATING.

Thank you for your good idea.

-Kame

2007-05-11 00:48:36

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [02/10] (make page unused)

On Thu, 10 May 2007 11:04:37 -0700 (PDT)
Christoph Lameter <[email protected]> wrote:

> On Wed, 9 May 2007, Yasunori Goto wrote:
>
> > This patch is for supporting making page unused.
> >
> > Isolate pages by capturing freed pages before inserting free_area[],
> > buddy allocator.
> > If you have an idea for avoiding spin_lock(), please advise me.
>
> Using the zone lock instead may avoid to introduce another lock? Or is the
> new lock here for performance reasons?
>
> Isnt it possible to just add another flavor of pages like what Mel has
> been doing with reclaimable and movable? I.e. add another category of free
> pages to Mel's scheme called isolated and use Mel's function to move stuff
> over there?
>
Mel-san's idea seems good. So we'll rewrite the whole this patch.

Thank you.
-Kame

2007-05-11 00:53:19

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [03/10] (drain all pages)

On Thu, 10 May 2007 16:35:37 +0100 (IST)
Mel Gorman <[email protected]> wrote:

> On Wed, 9 May 2007, Yasunori Goto wrote:
>
> > This patch add function drain_all_pages(void) to drain all
> > pages on per-cpu-freelist.
> > Page isolation will catch them in free_one_page.
> >
>
> Is this significantly different to what drain_all_local_pages() currently
> does?
>

no difference. this duplicating it..... thank you for pointing out.
Maybe I missed this because this func only exists in -mm.

Regards,
-Kame

2007-05-11 00:54:28

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [03/10] (drain all pages)

On Thu, 10 May 2007 11:07:08 -0700 (PDT)
Christoph Lameter <[email protected]> wrote:

> On Wed, 9 May 2007, Yasunori Goto wrote:
>
> > This patch add function drain_all_pages(void) to drain all
> > pages on per-cpu-freelist.
> > Page isolation will catch them in free_one_page.
>
> This is only draining the pcps of the local processor. I would think
> that you need to drain all other processors pcps of this zone as well. And
> there is no need to drain this processors pcps of other zones.
>

As Mel-san pointed, -mm has drain_all_local_pages(). We'll use it.

Thanks,
-Kame

2007-05-11 00:57:44

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [04/10] (isolate all free pages)

On Thu, 10 May 2007 17:42:54 +0100 (IST)
Mel Gorman <[email protected]> wrote:

> > + if (!pfn_valid(pfn))
> > + return -EINVAL;
>
> This may lead to boundary cases where pages cannot be captured at the
> start and end of non-aligned zones due to memory holes.
>
Hm, ok. maybe we can remove this.

> > + zone = info->zone;
> > + if ((zone != page_zone(pfn_to_page(pfn))) ||
> > + (zone != page_zone(pfn_to_page(last_pfn))))
> > + return -EINVAL;
>
> Is this check really necessary? Surely a caller to
> capture_isolate_freed_pages() will have already made all the necessary
> checks when adding the struct insolation_info ?
>
just because isolation_info is treated per zone.
Maybe MIGRATE_ISOLATING can allow us more flexible approach.

> > + drain_all_pages();
> > + spin_lock(&zone->lock);
> > + while (pfn < info->end_pfn) {
> > + if (!pfn_valid(pfn)) {
> > + pfn++;
> > + continue;
> > + }
> > + page = pfn_to_page(pfn);
> > + /* See page_is_buddy() */
> > + if (page_count(page) == 0 && PageBuddy(page)) {
>
> If PageBuddy is set it's free, you shouldn't have to check the page_count.
>
ok.

> > + order = page_order(page);
> > + order_size = 1 << order;
> > + zone->free_area[order].nr_free--;
> > + __mod_zone_page_state(zone, NR_FREE_PAGES, -order_size);
> > + list_del(&page->lru);
> > + rmv_page_order(page);
> > + isolate_page_nolock(info, page, order);
> > + nr_pages += order_size;
> > + pfn += order_size;
> > + } else {
> > + pfn++;
> > + }
> > + }
> > + spin_unlock(&zone->lock);
> > + return nr_pages;
> > +}
> > #endif /* CONFIG_PAGE_ISOLATION */
> >
>
> This is all similar to move_freepages() other than the locking part. It
> would be worth checking if there is code that could be shared or at least
> have similar styles.

Thank you, I'll look into move_freepages().

-Kame

2007-05-11 01:05:13

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [05/10] (make basic remove code)

On Thu, 10 May 2007 11:09:29 -0700 (PDT)
Christoph Lameter <[email protected]> wrote:

> On Wed, 9 May 2007, Yasunori Goto wrote:
>
> > +/*
> > + * Just an easy implementation.
> > + */
> > +static struct page *
> > +hotremove_migrate_alloc(struct page *page,
> > + unsigned long private,
> > + int **x)
> > +{
> > + return alloc_page(GFP_HIGH_MOVABLE);
> > +}
>
> This would need to reflect the zone in which you are performing hot
> remove. Or is hot remove only possible in the higest zone?
>
No. We'll allow hot remove in any zone-type.
My old patchest didn't include Mel-san's page grouping and just had
ZONE_MOVABLE, so I wrote this. Reflecting migration target's zone here
is reasobanle.

Anyway, I think we'll need more complicated function here.

Thanks,
-Kame

2007-05-11 01:08:34

[permalink] [raw]

Subject: Re: [RFC] memory hotremove patch take 2 [01/10] (counter of removable page)

On 10 May 2007 15:44:08 +0200
Andi Kleen <[email protected]> wrote:

> Yasunori Goto <[email protected]> writes:
>
>
> (not a full review, just something I noticed)
> > @@ -352,6 +352,8 @@ struct sysinfo {
> > unsigned short pad; /* explicit padding for m68k */
> > unsigned long totalhigh; /* Total high memory size */
> > unsigned long freehigh; /* Available high memory size */
> > + unsigned long movable; /* pages used only for data */
> > + unsigned long free_movable; /* Avaiable pages in movable */
>
> You can't just change that structure, it is exported to user space.
>
Okay. We'll drop this.

Thanks,
-Kame

2007-05-11 01:09:22