2004-01-08 07:37:29

by IWAMOTO Toshihiro

[permalink] [raw]
Subject: a new version of memory hotremove patch

Hi,

This is an update of the memory hot removal patch.
As I'll merge this patch over Goto-san's hotplug patch
(http://marc.theaimsgroup.com/?l=linux-kernel&m=107214532922184&w=2),
this will be the final version in this standalone form.
When that is done, there will be no more zone_active checks in page
allocation code.

Changes from the previous version (dated 20031126) are:
- Implemented remapping of mlock()'d pages.
This is done by adding an argument to try_to_unmap() function to be
able to ignore VM_LOCKED bit.

- Hacks to make kswapd more aggressive on disabled zones were
removed. Remapping and swapping out should be used together to make
system performance impact caused by memory hot removal low. I think
switching remapping and swapping out based on whether pages are on
active lists or inactive lists is an easy and acceptable solution.
There may be better threshold, and the above idea may be
inappropriate for systems where the cost of memcpy is very high.

- Bugfixes. truncate detection, page dirty bit handling, more
PageAgain handling.

Known problems:
- If a page is in mapping->io_pages when remap happens, it will be
moved to dirty_pages. Tracking page->list to find out the list
which page is connected to would be too expensive, and I have no other
idea.

- It seems there's a very small possibility of race between remap and
move_from_swap_cache. I've added some code for this, but it is
essentially untested.


I guess many of you think this patch has no use for yourselves, but it
was at least useful for finding some kind of kernel memory leaks. :)

http://people.valinux.co.jp/~iwamoto/mh.html
contains some patch explaination and usage info. This page hasn't
changed much since the last post.

$Id: memoryhotplug.patch,v 1.42 2004/01/08 03:23:07 iwamoto Exp $

diff -dpur linux-2.6.0/arch/i386/Kconfig linux-2.6.0-mh/arch/i386/Kconfig
--- linux-2.6.0/arch/i386/Kconfig Thu Dec 18 11:58:16 2003
+++ linux-2.6.0-mh/arch/i386/Kconfig Thu Dec 25 11:02:27 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)

+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HIGHPTE
diff -dpur linux-2.6.0/arch/i386/mm/discontig.c linux-2.6.0-mh/arch/i386/mm/discontig.c
--- linux-2.6.0/arch/i386/mm/discontig.c Thu Dec 18 11:58:57 2003
+++ linux-2.6.0-mh/arch/i386/mm/discontig.c Thu Dec 25 11:02:27 2003
@@ -28,6 +28,7 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
return 1;
}

+int __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ if (max_pfn & (PTRS_PER_PTE - 1)) {
+ pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+ printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+ max_pfn = pfn;
+ }
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(1 << 30) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(1 << 30);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map with our simplistic memory model,
+ * all memory is in node 0.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT)
+ {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+ }
+
+ /* Indicate there is one node available. */
+ node_set_online(0);
+ numnodes = i;
+
+ return 1;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
}
}

+static struct kcore_list numa_kc;
+
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ memset(node_remap_start_vaddr[node], 0,
+ node_remap_size[node] * PAGE_SIZE);
}
+ kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);
}

static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0/include/asm-i386/mmzone.h linux-2.6.0-mh/include/asm-i386/mmzone.h
--- linux-2.6.0/include/asm-i386/mmzone.h Thu Dec 18 11:58:49 2003
+++ linux-2.6.0-mh/include/asm-i386/mmzone.h Thu Dec 25 11:02:27 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
#endif /* CONFIG_X86_NUMAQ */

extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
/*
* This allows any one NUMA architecture to be compiled
* for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
return;
#endif

+#ifdef CONFIG_MEMHOTPLUGTEST
+ get_memcfg_numa_blks();
+ return;
+#endif
get_memcfg_numa_flat();
}

diff -dpur linux-2.6.0/include/asm-i386/numnodes.h linux-2.6.0-mh/include/asm-i386/numnodes.h
--- linux-2.6.0/include/asm-i386/numnodes.h Thu Dec 18 11:58:16 2003
+++ linux-2.6.0-mh/include/asm-i386/numnodes.h Thu Dec 25 11:02:27 2003
@@ -13,6 +13,10 @@
/* Max 8 Nodes */
#define NODES_SHIFT 3

+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT 3
+
#endif /* CONFIG_X86_NUMAQ */

#endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0/include/linux/mm.h linux-2.6.0-mh/include/linux/mm.h
--- linux-2.6.0/include/linux/mm.h Thu Dec 18 11:58:05 2003
+++ linux-2.6.0-mh/include/linux/mm.h Thu Dec 25 11:02:27 2003
@@ -219,7 +219,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})

diff -dpur linux-2.6.0/include/linux/mmzone.h linux-2.6.0-mh/include/linux/mmzone.h
--- linux-2.6.0/include/linux/mmzone.h Thu Dec 18 11:58:57 2003
+++ linux-2.6.0-mh/include/linux/mmzone.h Thu Dec 25 11:02:27 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
return num;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */

#define node_online(node) \
diff -dpur linux-2.6.0/include/linux/page-flags.h linux-2.6.0-mh/include/linux/page-flags.h
--- linux-2.6.0/include/linux/page-flags.h Thu Dec 18 11:59:06 2003
+++ linux-2.6.0-mh/include/linux/page-flags.h Thu Dec 25 11:02:27 2003
@@ -76,6 +76,8 @@
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */

+#define PG_again 20
+

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -268,6 +270,10 @@ extern void get_full_page_state(struct p
#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+
+#define PageAgain(page) test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)

/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
diff -dpur linux-2.6.0/include/linux/swap.h linux-2.6.0-mh/include/linux/swap.h
--- linux-2.6.0/include/linux/swap.h Thu Dec 18 11:58:04 2003
+++ linux-2.6.0-mh/include/linux/swap.h Thu Dec 25 11:02:27 2003
@@ -183,13 +183,13 @@ int FASTCALL(page_referenced(struct page
struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *,
struct pte_chain *));
void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-int FASTCALL(try_to_unmap(struct page *));
+int FASTCALL(try_to_unmap(struct page *, int));

/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
#else
#define page_referenced(page) TestClearPageReferenced(page)
-#define try_to_unmap(page) SWAP_FAIL
+#define try_to_unmap(page, force) SWAP_FAIL
#endif /* CONFIG_MMU */

/* return values of try_to_unmap */
diff -dpur linux-2.6.0/mm/filemap.c linux-2.6.0-mh/mm/filemap.c
--- linux-2.6.0/mm/filemap.c Thu Dec 18 11:58:40 2003
+++ linux-2.6.0-mh/mm/filemap.c Thu Dec 25 11:02:27 2003
@@ -448,7 +448,8 @@ repeat:
spin_lock(&mapping->page_lock);

/* Has the page been truncated while we slept? */
- if (page->mapping != mapping || page->index != offset) {
+ if (page->mapping != mapping || page->index != offset ||
+ PageAgain(page)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
@@ -677,6 +678,12 @@ page_not_up_to_date:
goto page_ok;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+
readpage:
/* ... and start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1120,6 +1127,12 @@ page_not_uptodate:
goto success;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry_find;
+ }
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1228,6 +1241,12 @@ page_not_uptodate:
goto success;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry_find;
+ }
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1436,6 +1455,11 @@ retry:
if (PageUptodate(page)) {
unlock_page(page);
goto out;
+ }
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry;
}
err = filler(data, page);
if (err < 0) {
diff -dpur linux-2.6.0/mm/memory.c linux-2.6.0-mh/mm/memory.c
--- linux-2.6.0/mm/memory.c Thu Dec 18 11:58:48 2003
+++ linux-2.6.0-mh/mm/memory.c Thu Dec 25 11:02:27 2003
@@ -1232,6 +1232,7 @@ static int do_swap_page(struct mm_struct

pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
+again:
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry);
@@ -1264,6 +1265,12 @@ static int do_swap_page(struct mm_struct
goto out;
}
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ pte_chain_free(pte_chain);
+ goto again;
+ }

/*
* Back out if somebody else faulted in this pte while we
diff -dpur linux-2.6.0/mm/page_alloc.c linux-2.6.0-mh/mm/page_alloc.c
--- linux-2.6.0/mm/page_alloc.c Thu Dec 18 11:58:08 2003
+++ linux-2.6.0-mh/mm/page_alloc.c Thu Dec 25 11:02:27 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>

#include <asm/tlbflush.h>

@@ -52,6 +53,9 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+#endif

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -221,6 +225,7 @@ static inline void free_pages_check(cons
1 << PG_active |
1 << PG_reclaim |
1 << PG_slab |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(function, page);
if (PageDirty(page))
@@ -326,12 +331,13 @@ static void prep_new_page(struct page *p
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);

page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_checked | 1 << PG_mappedtodisk);
+ 1 << PG_checked | 1 << PG_mappedtodisk | 1 << PG_again);
page->private = 0;
set_page_refs(page, order);
}
@@ -411,7 +417,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif

+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -512,9 +520,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (page != NULL && ! zone_active[page->flags >> ZONE_SHIFT])
+ printk("alloc_page from disabled zone: %p\n", page);
+#endif
return page;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -562,6 +589,10 @@ __alloc_pages(unsigned int gfp_mask, uns
struct zone *z = zones[i];
unsigned long local_low;

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
@@ -590,6 +621,10 @@ __alloc_pages(unsigned int gfp_mask, uns
unsigned long local_min;
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
@@ -613,6 +648,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -638,6 +677,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
min += z->pages_min;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -1076,6 +1119,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif

local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1137,7 @@ static void __init build_zonelists(pg_da
k = ZONE_HIGHMEM;
if (i & __GFP_DMA)
k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1107,6 +1153,23 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);

zonelist->zones[j++] = NULL;
+#else
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}

@@ -1252,6 +1315,9 @@ static void __init free_area_init_core(s
unsigned long batch;

zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1644,3 +1710,145 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+ z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+ struct list_head *l;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (idx > MAX_NR_ZONES*MAX_NUMNODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable %d\n", idx);
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = pcp->high = 0;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->low = pcp->high = 0;
+ }
+ zone_active[idx] = 0;
+ zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+ } else if (strcmp(buf, "purge") == 0) {
+ if (zone_active[idx])
+ printk("Zone %d still active (proceeding anyway)\n",
+ idx);
+ printk("purge %d\n", idx);
+ wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+ /* XXX overkill, but who cares? */
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable %d\n", idx);
+ zone_active[idx] = 1;
+ zone_table[idx]->pages_high =
+ zone_table[idx]->pages_min * 3;
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = 2 * pcp->batch;
+ pcp->high = 6 * pcp->batch;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->high = 2 * pcp->batch;
+ }
+ } else if (strcmp(buf, "remap") == 0) {
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+ } else if (strcmp(buf, "active") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ } else if (strcmp(buf, "inuse") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+#endif
diff -dpur linux-2.6.0/mm/rmap.c linux-2.6.0-mh/mm/rmap.c
--- linux-2.6.0/mm/rmap.c Thu Dec 18 11:59:39 2003
+++ linux-2.6.0-mh/mm/rmap.c Thu Dec 25 11:02:27 2003
@@ -291,8 +291,8 @@ out_unlock:
* pte_chain_lock shrink_list()
* mm->page_table_lock try_to_unmap_one(), trylock
*/
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
-static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
+static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t, int));
+static int try_to_unmap_one(struct page * page, pte_addr_t paddr, int force)
{
pte_t *ptep = rmap_ptep_map(paddr);
unsigned long address = ptep_to_address(ptep);
@@ -322,7 +322,7 @@ static int try_to_unmap_one(struct page
}

/* The page is mlock()d, we cannot swap it out. */
- if (vma->vm_flags & VM_LOCKED) {
+ if (! force && (vma->vm_flags & VM_LOCKED)) {
ret = SWAP_FAIL;
goto out_unlock;
}
@@ -382,7 +382,7 @@ out_unlock:
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
*/
-int try_to_unmap(struct page * page)
+int try_to_unmap(struct page * page, int force)
{
struct pte_chain *pc, *next_pc, *start;
int ret = SWAP_SUCCESS;
@@ -398,7 +398,7 @@ int try_to_unmap(struct page * page)
BUG();

if (PageDirect(page)) {
- ret = try_to_unmap_one(page, page->pte.direct);
+ ret = try_to_unmap_one(page, page->pte.direct, force);
if (ret == SWAP_SUCCESS) {
page->pte.direct = 0;
ClearPageDirect(page);
@@ -421,7 +421,7 @@ int try_to_unmap(struct page * page)
if (victim_i == -1)
victim_i = i;

- switch (try_to_unmap_one(page, pte_paddr)) {
+ switch (try_to_unmap_one(page, pte_paddr, force)) {
case SWAP_SUCCESS:
/*
* Release a slot. If we're releasing the
diff -dpur linux-2.6.0/mm/shmem.c linux-2.6.0-mh/mm/shmem.c
--- linux-2.6.0/mm/shmem.c Thu Dec 18 11:58:48 2003
+++ linux-2.6.0-mh/mm/shmem.c Thu Dec 25 11:02:27 2003
@@ -80,7 +80,12 @@ static inline struct page *shmem_dir_all
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
*/
+#ifdef CONFIG_MEMHOTPLUGTEST
+ return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#else
return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
}

static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0/mm/swap_state.c linux-2.6.0-mh/mm/swap_state.c
--- linux-2.6.0/mm/swap_state.c Thu Dec 18 11:58:48 2003
+++ linux-2.6.0/mm/swap_state.c Mon Jan 5 14:27:35 2004
@@ -234,12 +234,21 @@ int move_from_swap_cache(struct page *pa
spin_lock(&swapper_space.page_lock);
spin_lock(&mapping->page_lock);

+ if (radix_tree_lookup(&page->mapping->page_tree, page->index)
+ != page) {
+ /* remap in progress */
+ printk("move_from_swap_cache: under remap %p\n", page);
+ err = -EAGAIN;
+ goto out;
+ }
+
err = radix_tree_insert(&mapping->page_tree, index, page);
if (!err) {
__delete_from_swap_cache(page);
___add_to_page_cache(page, mapping, index);
}

+out:
spin_unlock(&mapping->page_lock);
spin_unlock(&swapper_space.page_lock);

diff -dpur linux-2.6.0/mm/swapfile.c linux-2.6.0-mh/mm/swapfile.c
--- linux-2.6.0/mm/swapfile.c Thu Dec 18 11:58:18 2003
+++ linux-2.6.0-mh/mm/swapfile.c Thu Dec 25 11:02:27 2003
@@ -607,6 +607,7 @@ static int try_to_unuse(unsigned int typ
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
+ again:
page = read_swap_cache_async(entry);
if (!page) {
/*
@@ -641,6 +642,11 @@ static int try_to_unuse(unsigned int typ
wait_on_page_locked(page);
wait_on_page_writeback(page);
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
wait_on_page_writeback(page);

/*
@@ -749,6 +755,7 @@ static int try_to_unuse(unsigned int typ

swap_writepage(page, &wbc);
lock_page(page);
+ BUG_ON(PageAgain(page));
wait_on_page_writeback(page);
}
if (PageSwapCache(page)) {
diff -dpur linux-2.6.0/mm/truncate.c linux-2.6.0-mh/mm/truncate.c
--- linux-2.6.0/mm/truncate.c Thu Dec 18 11:59:42 2003
+++ linux-2.6.0-mh/mm/truncate.c Thu Dec 25 11:02:27 2003
@@ -132,6 +132,10 @@ void truncate_inode_pages(struct address
next++;
if (TestSetPageLocked(page))
continue;
+ if (PageAgain(page)) {
+ unlock_page(page);
+ continue;
+ }
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -165,6 +169,14 @@ void truncate_inode_pages(struct address
struct page *page = pvec.pages[i];

lock_page(page);
+ if (PageAgain(page)) {
+ unsigned long index = page->index;
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ pvec.pages[i] = page;
+ }
wait_on_page_writeback(page);
if (page->index > next)
next = page->index;
@@ -255,6 +267,14 @@ void invalidate_inode_pages2(struct addr
struct page *page = pvec.pages[i];

lock_page(page);
+ if (PageAgain(page)) {
+ unsigned long index = page->index;
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ pvec.pages[i] = page;
+ }
if (page->mapping == mapping) { /* truncate race? */
wait_on_page_writeback(page);
next = page->index + 1;
diff -dpur linux-2.6.0/mm/vmalloc.c linux-2.6.0-mh/mm/vmalloc.c
--- linux-2.6.0/mm/vmalloc.c Thu Dec 18 11:58:57 2003
+++ linux-2.6.0-mh/mm/vmalloc.c Thu Dec 25 11:02:27 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
}

EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0/mm/vmscan.c linux-2.6.0-mh/mm/vmscan.c
--- linux-2.6.0/mm/vmscan.c Thu Dec 18 11:58:15 2003
+++ linux-2.6.0-mh/mm/vmscan.c Thu Dec 25 11:02:27 2003
@@ -36,6 +36,9 @@
#include <asm/div64.h>

#include <linux/swapops.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif

/*
* The "priority" of VM scanning is how much of the queues we will scan in one
@@ -315,7 +320,7 @@ shrink_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
pte_chain_unlock(page);
goto activate_locked;
@@ -1048,6 +1072,448 @@ int kswapd(void *p)
balance_pgdat(pgdat, 0, &ps);
}
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void
+print_buffer(struct page* page)
+{
+ struct address_space* mapping = page->mapping;
+ struct buffer_head *bh, *head;
+
+ spin_lock(&mapping->private_lock);
+ bh = head = page_buffers(page);
+ printk("buffers:");
+ do {
+ printk(" %lx %d\n", bh->b_state, atomic_read(&bh->b_count));
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ printk("\n");
+ spin_unlock(&mapping->private_lock);
+}
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+ struct page *newpage;
+ struct zone *zone;
+ struct address_space *mapping;
+ void *p;
+ int waitcnt, error = -1, truncated = 0;
+
+ newpage = alloc_page(GFP_HIGHUSER);
+ if (newpage == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+ mapping = page->mapping;
+
+ if (! PagePrivate(page) && PageWriteback(page)) {
+ BUG_ON(page->mapping != &swapper_space);
+ printk("remap_onepage: swap cache? %p\n", page);
+ }
+
+ waitcnt = 100;
+ while (PageWriteback(page)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ if (! --waitcnt)
+ goto radixfail;
+ }
+ if (PagePrivate(page)) {
+ /* XXX copied from shrink_list() */
+ if (PageDirty(page) &&
+ is_page_cache_freeable(page) &&
+ mapping != NULL &&
+ mapping->a_ops->writepage != NULL) {
+ spin_lock(&mapping->page_lock);
+ if (test_clear_page_dirty(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1,
+ };
+
+ list_move(&page->list, &mapping->locked_pages);
+ spin_unlock(&mapping->page_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ goto radixfail;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ lock_page(page);
+ mapping = page->mapping;
+ if (! PagePrivate(page))
+ goto bufferdone;
+ } else
+ spin_unlock(&mapping->page_lock);
+ }
+
+ waitcnt = 100;
+ while (1) {
+ if (try_to_release_page(page, GFP_KERNEL))
+ break;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ if (! --waitcnt) {
+ print_buffer(page);
+ goto radixfail;
+ }
+ }
+ }
+bufferdone:
+ if (mapping == NULL) {
+ if (! page_mapped(page)) {
+ if (page_count(page) > 1)
+ printk("page %p not mapped: count %d\n",
+ page, page_count(page));
+ goto radixfail;
+ }
+ /* The page is an anon page. Allocate swap entry. */
+ if (!add_to_swap(page))
+ goto radixfail;
+ mapping = page->mapping;
+ }
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto radixfail;
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ if (mapping != page->mapping)
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, page->mapping, page);
+ if (radix_tree_delete(&mapping->page_tree, page->index) == NULL) {
+ /* Page truncated. */
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+ goto radixfail;
+ }
+ /* don't __put_page(page) here. truncate may be in progress */
+ newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+ ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+ ~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);
+
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree, page->index, newpage);
+ page_cache_get(newpage);
+ newpage->mapping = mapping;
+ newpage->index = page->index;
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+
+ pte_chain_lock(page);
+ if (page_mapped(page)) {
+ while ((error = try_to_unmap(page, 1)) == SWAP_AGAIN) {
+ pte_chain_unlock(page);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ __set_current_state(TASK_RUNNING);
+ pte_chain_lock(page);
+ }
+ if (error == SWAP_FAIL) {
+ pte_chain_unlock(page); /* XXX */
+ /* either during mremap or mlocked */
+ goto unmapfail;
+ }
+ }
+ pte_chain_unlock(page);
+ if (PagePrivate(page))
+ printk("buffer reappeared\n");
+
+ unlock_page(page); /* no lock needed while waiting page count */
+
+ waitcnt = 1;
+wait_again:
+ while ((truncated + page_count(page)) > 2) {
+ waitcnt++;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((waitcnt % 5000) == 0) {
+ printk("remap_onepage: still waiting on %p %d\n", page, waitcnt);
+ break;
+ }
+ if (PagePrivate(page))
+ break; /* see below */
+ }
+
+ lock_page(page);
+ BUG_ON(page_count(page) == 0);
+ if (mapping != page->mapping && page->mapping != NULL)
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, page->mapping, page);
+ if (PagePrivate(page))
+ try_to_release_page(page, GFP_KERNEL);
+ if ((truncated + page_count(page)) > 2) {
+ if (waitcnt > 50000)
+ goto unmapfail;
+ unlock_page(page);
+ goto wait_again;
+ }
+ if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+ KDB_ENTER();
+#else
+ BUG();
+#endif
+ spin_lock(&mapping->page_lock);
+ if (page_count(page) == 1) {
+ /* page has been truncated. free both pages. */
+ p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+ if (p != NULL) {
+ /* new cache page appeared after truncation */
+ printk("page %p newpage %p radix %p\n",
+ page, newpage, p);
+ BUG_ON(p == newpage);
+ }
+ BUG_ON(page->mapping != NULL);
+ put_page(newpage);
+ if (page_count(newpage) != 1) {
+ printk("newpage count %d != 1, %p\n",
+ page_count(newpage), newpage);
+ BUG();
+ }
+ /* No need to do page->list. remove_from_page_cache did. */
+ newpage->mapping = page->mapping = NULL;
+ spin_unlock(&mapping->page_lock);
+ ClearPageActive(page);
+ ClearPageActive(newpage);
+ unlock_page(page);
+ unlock_page(newpage);
+ put_page(page);
+ put_page(newpage);
+ return 0;
+ }
+ p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+ spin_unlock(&mapping->page_lock);
+ if (p == NULL) {
+ truncated = 1;
+ BUG_ON(page->mapping != NULL);
+ unlock_page(page);
+ goto wait_again;
+ }
+ BUG_ON(mapping != page->mapping);
+
+ spin_lock(&mapping->page_lock);
+ list_del(&page->list); /* XXX */
+ if (PageDirty(page)) {
+ SetPageDirty(newpage);
+ list_add(&newpage->list, &mapping->dirty_pages);
+ } else
+ list_add(&newpage->list, &mapping->clean_pages);
+ page->mapping = NULL;
+ spin_unlock(&mapping->page_lock);
+ unlock_page(page);
+
+ copy_highpage(newpage, page);
+ ClearPageActive(page);
+ __put_page(page);
+ put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+ /* XXX locking order correct? */
+ zone = page_zone(newpage);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(newpage)) {
+ list_add(&newpage->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&newpage->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(newpage);
+ spin_unlock_irq(&zone->lru_lock);
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ /*
+ * Try to unwind by notifying waiters. If someone misbehaves,
+ * we die.
+ */
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ BUG();
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ /* list_del(&newpage->list); */
+ if (radix_tree_delete(&mapping->page_tree, page->index) == NULL)
+ /* Hold extra count to handle truncate */
+ page_cache_get(newpage);
+ radix_tree_insert(&mapping->page_tree, page->index, page);
+ /* no page_cache_get(page); needed */
+ radix_tree_preload_end();
+ spin_unlock(&mapping->page_lock);
+
+ SetPageAgain(newpage);
+ /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */
+ unlock_page(newpage);
+
+ waitcnt = 1;
+ for(; page_count(newpage) > 2; waitcnt++) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((waitcnt % 10000) == 0) {
+ printk("You are hosed.\n");
+ printk("newpage %p\n", newpage);
+ BUG();
+ }
+ }
+ BUG_ON(PageUptodate(newpage));
+ ClearPageDirty(newpage);
+ ClearPageActive(newpage);
+ spin_lock(&mapping->page_lock);
+ newpage->mapping = NULL;
+ if (page_count(newpage) == 1) {
+ printk("newpage %p truncated. page %p\n", newpage, page);
+ BUG();
+ }
+ spin_unlock(&mapping->page_lock);
+ unlock_page(page);
+ BUG_ON(page_count(newpage) != 2);
+ ClearPageAgain(newpage);
+ __put_page(newpage);
+ __free_page(newpage);
+ return 1;
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ __free_page(newpage);
+ return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+ int cpu = get_cpu();
+
+ schedule_work(&lru_drain_wq[cpu]);
+ put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page, *page1;
+ struct list_head *l;
+ int active, i, nr_failed = 0;
+ int fastmode = 100;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ if (atomic_read(&remapd_count) > 0) {
+ printk("remapd already running\n");
+ return 0;
+ }
+ atomic_inc(&remapd_count);
+ on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(active = 0; active < 2; active++) {
+ l = active ? &zone->active_list :
+ &zone->inactive_list;
+ for(i = 0; ! list_empty(l) && i < 10; i++) {
+ page = list_entry(l->prev, struct page, lru);
+ if (fastmode && PageLocked(page)) {
+ page1 = page;
+ while (fastmode && PageLocked(page)) {
+ page =
+ list_entry(page->lru.prev,
+ struct page, lru);
+ fastmode--;
+ if (&page->lru == l) {
+ /* scanned the whole
+ list */
+ page = page1;
+ break;
+ }
+ if (page == page1)
+ BUG();
+ }
+ if (! fastmode) {
+ printk("used up fastmode\n");
+ page = page1;
+ }
+ }
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, l);
+ continue;
+ }
+ if (active)
+ zone->nr_active--;
+ else
+ zone->nr_inactive--;
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page)) {
+ nr_failed++;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ goto out;
+
+ while (! list_empty(&failedp)) {
+ spin_lock_irq(&zone->lru_lock);
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ if (TestSetPageLRU(page))
+ BUG();
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(page);
+ }
+out:
+ atomic_dec(&remapd_count);
+ return 0;
+}
+
+static int __init remapd_init(void)
+{
+ int i;
+
+ for(i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL);
+ return 0;
+}
+
+module_init(remapd_init);
+#endif

/*
* A zone is low on free memory, so wake its kswapd task to service it.


2004-01-08 08:23:11

by Andrew Morton

[permalink] [raw]
Subject: Re: a new version of memory hotremove patch

IWAMOTO Toshihiro <[email protected]> wrote:
>
> - If a page is in mapping->io_pages when remap happens, it will be
> moved to dirty_pages. Tracking page->list to find out the list
> which page is connected to would be too expensive, and I have no other
> idea.

That sounds like a reasonable thing to do. The only impact would be that
an fsync() which is currently in progress could fail to write the page, so
the page is still dirty after the fsync() returns.

If this is the biggest problem, you've done well ;)

2004-01-08 11:39:29

by Hirokazu Takahashi

[permalink] [raw]
Subject: [PATCH] dynamic allocation of huge continuous pages

Hello,

I just implemented a patch which allow us to allocate huge
continuous pages easily. As We know it's very hard to allocate
them on demand, since free memory on system may be fragmented.
My approach is that I let annoying pages move to another place
so that we can make free coninuous space on memory. Iwamoto's
memory-hot-removal patch will help to do it.

I believe moving pages approach will be much better than random swaping
page out approach for this purpose.

iwamoto> This is an update of the memory hot removal patch.
:
iwamoto> http://people.valinux.co.jp/~iwamoto/mh.html

My patch needs the iwamoto's memory-hot-removeval patch.
You should apply both of them against linux-2.6.0.

Known problems:
- This patch doesn't work if CONFIG_HUGETLB_PAGE isn't set.
Does anybody have good idea to solve it,
since it's difficult to know whether a specified page
is free or a part of a large continuous page without
PG_compound flag.

ToDos:
- It's hard to allocate HugePages for hugetlbfs on a box
which dosen't have HighMem zones yet.
- We will make some continuous pages allocation work
at the same time.

Thank you,
Hirokazu Takahashi.


--- include/linux/page-flags.h.ORG Thu Jan 8 19:06:48 2032
+++ include/linux/page-flags.h Thu Jan 8 19:08:42 2032
@@ -77,6 +77,7 @@
#define PG_compound 19 /* Part of a compound page */

#define PG_again 20
+#define PG_booked 21


/*
@@ -274,6 +275,10 @@ extern void get_full_page_state(struct p
#define PageAgain(page) test_bit(PG_again, &(page)->flags)
#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)
+
+#define PageBooked(page) test_bit(PG_booked, &(page)->flags)
+#define SetPageBooked(page) set_bit(PG_booked, &(page)->flags)
+#define ClearPageBooked(page) clear_bit(PG_booked, &(page)->flags)

/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
--- include/linux/mmzone.h.ORG Thu Jan 8 19:06:56 2032
+++ include/linux/mmzone.h Thu Jan 8 19:12:07 2032
@@ -154,6 +154,9 @@ struct zone {
char *name;
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
+ unsigned long contig_pages_alloc_hint;
+ unsigned long booked_pages;
+ long scan_pages;
} ____cacheline_maxaligned_in_smp;

#define ZONE_DMA 0
--- mm/page_alloc.c.ORG Thu Jan 8 19:07:27 2032
+++ mm/page_alloc.c Thu Jan 8 19:51:24 2032
@@ -185,7 +185,11 @@ static inline void __free_pages_bulk (st
BUG();
index = page_idx >> (1 + order);

- zone->free_pages -= mask;
+ if (!PageBooked(page))
+ zone->free_pages -= mask;
+ else {
+ zone->booked_pages -= mask;
+ }
while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;

@@ -204,6 +208,9 @@ static inline void __free_pages_bulk (st
buddy2 = base + page_idx;
BUG_ON(bad_range(zone, buddy1));
BUG_ON(bad_range(zone, buddy2));
+ if (PageBooked(buddy1) != PageBooked(buddy2)) {
+ break;
+ }
list_del(&buddy1->list);
mask <<= 1;
area++;
@@ -352,13 +359,20 @@ static struct page *__rmqueue(struct zon
unsigned int current_order;
struct page *page;
unsigned int index;
+ struct list_head *p;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = zone->free_area + current_order;
if (list_empty(&area->free_list))
continue;
+ list_for_each(p, &area->free_list) {
+ page = list_entry(p, struct page, list);
+ if (!PageBooked(page))
+ goto gotit;
+ }
+ continue;

- page = list_entry(area->free_list.next, struct page, list);
+gotit:
list_del(&page->list);
index = page - zone->zone_mem_map;
if (current_order != MAX_ORDER-1)
@@ -456,6 +470,11 @@ static void free_hot_cold_page(struct pa
struct per_cpu_pages *pcp;
unsigned long flags;

+ if (PageBooked(page)) {
+ __free_pages_ok(page, 0);
+ return;
+ }
+
kernel_map_pages(page, 1, 0);
inc_page_state(pgfree);
free_pages_check(__FUNCTION__, page);
@@ -542,6 +561,242 @@ zone_activep(const struct zone *z)
}
#endif

+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_MEMHOTPLUGTEST)
+/*
+ * Check wheter the page is freeable or not.
+ * It might not be free even if this function says OK,
+ * when it is just being allocated.
+ * This check is almost sufficient but not perfect.
+ */
+static inline int is_page_freeable(struct page *page)
+{
+ return (page->mapping || page_mapped(page) || !page_count(page)) &&
+ !(page->flags & (1<<PG_reserved|1<<PG_compound|1<<PG_booked|1<<PG_slab));
+}
+
+static inline int is_free_page(struct page *page)
+{
+ return !(page_mapped(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_reserved|
+ 1 << PG_compound|
+ 1 << PG_booked |
+ 1 << PG_lru |
+ 1 << PG_private |
+ 1 << PG_locked |
+ 1 << PG_active |
+ 1 << PG_reclaim |
+ 1 << PG_dirty |
+ 1 << PG_slab |
+ 1 << PG_writeback )));
+}
+
+static int
+try_to_book_pages(struct zone *zone, struct page *page, unsigned int order)
+{
+ struct page *p;
+ int booked_count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ for (p = page; p < &page[1<<order]; p++) {
+ if (!is_page_freeable(p))
+ goto out;
+ if (is_free_page(p))
+ booked_count++;
+ SetPageBooked(p);
+ }
+
+ zone->booked_pages = booked_count;
+ zone->free_pages -= booked_count;
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 1;
+out:
+ for (p--; p >= page; p--) {
+ ClearPageBooked(p);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 0;
+}
+
+static struct page *
+book_pages(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+ unsigned long num = 1<<order;
+ unsigned long slot = zone->contig_pages_alloc_hint;
+ struct page *page;
+
+ slot = (slot + num - 1) & ~(num - 1); /* align */
+
+ for ( ; zone->scan_pages > 0; slot += num) {
+ zone->scan_pages -= num;
+ if (slot + num > zone->present_pages)
+ slot = 0;
+ page = &zone->zone_mem_map[slot];
+ if (try_to_book_pages(zone, page, order)) {
+ zone->contig_pages_alloc_hint = slot + num;
+ return page;
+ }
+ }
+ return NULL;
+}
+
+static void
+unbook_pages(struct zone *zone, struct page *page, unsigned int order)
+{
+ struct page *p;
+ for (p = page; p < &page[1<<order]; p++) {
+ ClearPageBooked(p);
+ }
+}
+
+extern int remap_onepage(struct page *);
+/*
+ * sweepout_pages() might not work well as the booked pages
+ * may include some unfreeable pages.
+ */
+static int
+sweepout_pages(struct zone *zone, struct page *page, int num)
+{
+ struct page *p;
+ int failed = 0;
+ int retry = 0;
+ int retry_save = 0;
+ int retry_count = 20;
+
+again:
+ on_each_cpu((void (*)(void*))drain_local_pages, NULL, 1, 1);
+ for (p = page; p <= &page[num - 1]; p++) {
+ if (!page_count(p))
+ continue;
+ if (!PageBooked(p)) {
+ printk(KERN_ERR "ERROR sweepout_pages: page:%p isn't booked. page(%p) num(%d)\n", p, page, num);
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ if (!PageLRU(p)) {
+ spin_unlock_irq(&zone->lru_lock);
+ retry++;
+ continue;
+ }
+ list_del(&p->lru);
+ if (!TestClearPageLRU(p))
+ BUG();
+ if (PageActive(p)) {
+ zone->nr_active--;
+ if (page_count(p) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(p);
+ list_add(&p->lru, &zone->active_list);
+ spin_unlock_irq(&zone->lru_lock);
+ continue;
+ }
+ } else {
+ zone->nr_inactive--;
+ if (page_count(p) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(p);
+ list_add(&p->lru, &zone->inactive_list);
+ spin_unlock_irq(&zone->lru_lock);
+ continue;
+ }
+ }
+ page_cache_get(p);
+ spin_unlock_irq(&zone->lru_lock);
+ if (remap_onepage(p)) {
+ failed++;
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(p)) {
+ list_add(&p->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&p->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(p);
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(p);
+ }
+ }
+ if (retry && (retry_count--)) {
+ retry_save = retry;
+ retry = 0;
+ schedule_timeout(HZ/4);
+ /* Actually we should wait on the pages */
+ goto again;
+ }
+ on_each_cpu((void (*)(void*))drain_local_pages, NULL, 1, 1);
+ return failed;
+}
+
+/*
+ * Allocate contiguous pages enen if pages are fragmented in zones.
+ * Migrating pages helps to make enough space in them.
+ */
+static struct page *
+force_alloc_pages(unsigned int gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ struct zone **zones = zonelist->zones;
+ struct zone *zone;
+ struct page *page = NULL;
+ unsigned long flags;
+ int i;
+ int ret;
+
+ static DECLARE_MUTEX(bookedpage_sem);
+
+ if (down_trylock(&bookedpage_sem)) {
+ down(&bookedpage_sem);
+ }
+
+ for (i = 0; zones[i] != NULL; i++) {
+ zone = zones[i];
+ zone->scan_pages = zone->present_pages;
+ while (zone->scan_pages > 0) {
+ page = book_pages(zone, gfp_mask, order);
+ if (!page)
+ break;
+ ret = sweepout_pages(zone, page, 1<<order);
+ if (ret) {
+ spin_lock_irqsave(&zone->lock, flags);
+ unbook_pages(zone, page, order);
+ page = NULL;
+
+ zone->free_pages += zone->booked_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ continue;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ unbook_pages(zone, page, order);
+ zone->free_pages += zone->booked_pages;
+ page = __rmqueue(zone, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (page) {
+ prep_compound_page(page, order);
+ up(&bookedpage_sem);
+ return page;
+ }
+ }
+ }
+ up(&bookedpage_sem);
+ return NULL;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static inline int
+enough_pages(struct zone *zone, unsigned long min, const int wait)
+{
+ return (long)zone->free_pages - (long)min >= 0 ||
+ (!wait && (long)zone->free_pages - (long)zone->pages_high >= 0);
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -602,8 +857,7 @@ __alloc_pages(unsigned int gfp_mask, uns
local_low >>= 1;
min += local_low;

- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -631,8 +885,7 @@ __alloc_pages(unsigned int gfp_mask, uns
if (rt_task(p))
local_min >>= 1;
min += local_min;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -682,14 +935,27 @@ rebalance:
continue;
#endif
min += z->pages_min;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
+ if (enough_pages(z, min, wait)) {
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
+
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_MEMHOTPLUGTEST)
+ /*
+ * Defrag pages to allocate large contiguous pages
+ *
+ * FIXME: The following code will work only if CONFIG_HUGETLB_PAGE
+ * flag is on.
+ */
+ if (order) {
+ page = force_alloc_pages(gfp_mask, order, zonelist);
+ if (page)
+ goto got_pg;
+ }
+#endif /* CONFIG_HUGETLB_PAGE */

/*
* Don't let big-order allocations loop unless the caller explicitly
--- mm/vmscan.c.ORG Thu Jan 8 19:07:40 2032
+++ mm/vmscan.c Thu Jan 8 19:08:42 2032
@@ -1183,7 +1183,7 @@ bufferdone:
}
/* don't __put_page(page) here. truncate may be in progress */
newpage->flags |= page->flags & ~(1 << PG_uptodate) &
- ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+ ~(1 << PG_highmem) & ~(1 << PG_chainlock) & ~(1 << PG_booked) &
~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);

/* list_del(&page->list); XXX */

2004-01-09 04:15:49

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] dynamic allocation of huge continuous pages

In message <[email protected]> you write:
> + list_for_each(p, &area->free_list) {
> + page = list_entry(p, struct page, list);

Just FYI, "list_for_each_entry(page, &area->free_list, list)" is
shorter and neater.

Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2004-01-09 09:43:49

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: [PATCH] dynamic allocation of huge continuous pages

Hi,

Thank you for your advice.

> > + list_for_each(p, &area->free_list) {
> > + page = list_entry(p, struct page, list);
>
> Just FYI, "list_for_each_entry(page, &area->free_list, list)" is
> shorter and neater.
>
> Cheers,
> Rusty.

Thank you,
Hirokazu Takahashi.