2004-06-30 11:18:30

by IWAMOTO Toshihiro

Subject: new memory hotremoval patch

Hi,

this is an updated version of my memory hotremoval patch.
I'll only include the main patch which contains page remapping code.
The other two files, which haven't changed much from April, can be
found at http://people.valinux.co.jp/~iwamoto/mh.html .

Page "remapping" is a mechanism to free a specified page by copying the
page content to a newly allocated replacement page and redirecting
references to the original page to the new page.
This was designed to reliably free specified pages, unlike the swapout
code.
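
In outline, remapping a single page works like this (a simplified
sketch of remap_onepage() in the patch below; the anon/swap cache
setup, error handling and the truncate races are omitted):

    newpage = ops->remap_alloc_page(nodeid); /* allocate the replacement */
    lock_page(newpage);
    lock_page(page);
    /* redirect page cache / swap cache lookups from page to newpage */
    radix_tree_replace_pages(page, newpage, mapping);
    /* remove all PTE mappings to the old page */
    unmap_page(page, &vlist);
    /* wait until we hold the only remaining reference to the old page */
    wait_on_page_freeable(page, mapping, &vlist, 0, nretry, ops);
    /* copy the content, propagate the dirty bit, mark newpage uptodate */
    ops->remap_copy_page(newpage, page);
    remap_exchange_pages(page, newpage, mapping);
    /* wake up everyone who blocked on the locked newpage */
    unlock_page(newpage);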

The patch is against linux-2.6.7 and fully supports objrmap.
Though there seems to be some lock-related lossage, the page remapping
code works fairly well.

Due to struct page changes, the page->mapping == NULL predicate can no
longer be used for detecting cancellation of an anonymous page
remapping operation, so the PG_again bit is being used again.
It may still be possible to kill the PG_again bit, but the priority is
rather low.
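
A racing lookup blocks in lock_page() until the remapping finishes;
if the operation was cancelled, it finds PG_again set on the
replacement page and simply retries, as in the do_swap_page() hunk
below:

    again:
        page = lookup_swap_cache(entry);
        ...
        lock_page(page);
        if (PageAgain(page)) {
            /* cancelled remap; the original page is back in the cache */
            unlock_page(page);
            page_cache_release(page);
            goto again;
        }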

I will be working on the following items.

1. Prototype implementation of memsection support.
It seems some people want to hotremove small regions of memory
rather than zones or nodes. A prototype implementation will
show how Takahashi's hugetlb page code can be used for such a
purpose.

2. Handling of pages with dirty buffers without writing them back.
This is file system specific. I plan to do this for ext2 and
ext3.


My patch supports remapping of normal pages; Takahashi's hugepage
remapping patch will be posted in a few days.


$Id: memoryhotplug.patch,v 1.95 2004/06/30 07:31:37 iwamoto Exp $

--- linux-2.6.7.ORG/arch/i386/Kconfig 2004-06-16 14:18:59.000000000 +0900
+++ linux-2.6.7/arch/i386/Kconfig 2004-06-17 16:31:41.000000000 +0900
@@ -734,9 +734,19 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI)

+config MEMHOTPLUG
+ bool "Memory hotplug test"
+ depends on !X86_PAE
+ default n
+
+config MEMHOTPLUG_BLKSIZE
+ int "Size of a memory hotplug unit (in MB, must be multiple of 256)."
+ range 256 1024
+ depends on MEMHOTPLUG
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUG
default y

config HAVE_ARCH_BOOTMEM_NODE
--- linux-2.6.7.ORG/include/linux/gfp.h 2004-06-16 14:19:02.000000000 +0900
+++ linux-2.6.7/include/linux/gfp.h 2004-06-17 11:39:11.000000000 +0900
@@ -11,9 +11,10 @@ struct vm_area_struct;
/*
* GFP bitmasks..
*/
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
-#define __GFP_DMA 0x01
-#define __GFP_HIGHMEM 0x02
+/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
+#define __GFP_DMA 0x01
+#define __GFP_HIGHMEM 0x02
+#define __GFP_HOTREMOVABLE 0x03

/*
* Action modifiers - doesn't change the zoning
@@ -51,7 +52,7 @@ struct vm_area_struct;
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_HOTREMOVABLE)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
--- linux-2.6.7.ORG/include/linux/mm.h 2004-06-16 14:18:56.000000000 +0900
+++ linux-2.6.7/include/linux/mm.h 2004-06-17 16:26:50.000000000 +0900
@@ -314,6 +314,11 @@ static inline void put_page(struct page

#endif /* CONFIG_HUGETLB_PAGE */

+static inline int is_page_cache_freeable(struct page *page)
+{
+ return page_count(page) - !!PagePrivate(page) == 2;
+}
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
--- linux-2.6.7.ORG/include/linux/mmzone.h 2004-06-16 14:19:36.000000000 +0900
+++ linux-2.6.7/include/linux/mmzone.h 2004-06-17 11:39:11.000000000 +0900
@@ -65,8 +65,10 @@ struct per_cpu_pageset {
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
+#define ZONE_HOTREMOVABLE 3 /* only for zonelists */

#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
+#define MAX_NR_ZONELISTS 4
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */

#define GFP_ZONEMASK 0x03
@@ -225,7 +227,7 @@ struct zonelist {
struct bootmem_data;
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
- struct zonelist node_zonelists[MAX_NR_ZONES];
+ struct zonelist node_zonelists[MAX_NR_ZONELISTS];
int nr_zones;
struct page *node_mem_map;
struct bootmem_data *bdata;
@@ -237,6 +239,7 @@ typedef struct pglist_data {
struct pglist_data *pgdat_next;
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
+ char removable, enabled;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
--- linux-2.6.7.ORG/include/linux/page-flags.h 2004-06-16 14:19:42.000000000 +0900
+++ linux-2.6.7/include/linux/page-flags.h 2004-06-17 11:39:11.000000000 +0900
@@ -78,6 +78,8 @@

#define PG_anon 20 /* Anonymous: anon_vma in mapping */

+#define PG_again 21
+

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -298,6 +300,10 @@ extern unsigned long __read_page_state(u
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)

+#define PageAgain(page) test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)
+
#define PageAnon(page) test_bit(PG_anon, &(page)->flags)
#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)
--- linux-2.6.7.ORG/include/linux/rmap.h 2004-06-16 14:18:57.000000000 +0900
+++ linux-2.6.7/include/linux/rmap.h 2004-06-17 11:39:11.000000000 +0900
@@ -96,7 +96,7 @@ static inline void page_dup_rmap(struct
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *);
-int try_to_unmap(struct page *);
+int try_to_unmap(struct page *, struct list_head *);

#else /* !CONFIG_MMU */

@@ -105,7 +105,7 @@ int try_to_unmap(struct page *);
#define anon_vma_link(vma) do {} while (0)

#define page_referenced(page) TestClearPageReferenced(page)
-#define try_to_unmap(page) SWAP_FAIL
+#define try_to_unmap(page, force) SWAP_FAIL

#endif /* CONFIG_MMU */

--- linux-2.6.7.ORG/mm/Makefile 2004-06-16 14:19:37.000000000 +0900
+++ linux-2.6.7/mm/Makefile 2004-06-17 11:39:11.000000000 +0900
@@ -15,3 +15,5 @@ obj-y := bootmem.o filemap.o mempool.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
+
+obj-$(CONFIG_MEMHOTPLUG) += memhotplug.o
--- linux-2.6.7.ORG/mm/filemap.c 2004-06-16 14:19:12.000000000 +0900
+++ linux-2.6.7/mm/filemap.c 2004-06-17 11:39:11.000000000 +0900
@@ -250,7 +250,8 @@ int filemap_write_and_wait(struct addres
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, int gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ int error = radix_tree_preload((gfp_mask & ~GFP_ZONEMASK) |
+ ((gfp_mask & GFP_ZONEMASK) == __GFP_DMA ? __GFP_DMA : 0));

if (error == 0) {
spin_lock_irq(&mapping->tree_lock);
@@ -495,6 +496,7 @@ repeat:
page_cache_release(page);
goto repeat;
}
+ BUG_ON(PageAgain(page));
}
}
spin_unlock_irq(&mapping->tree_lock);
@@ -738,6 +740,8 @@ page_not_up_to_date:
goto page_ok;
}

+ BUG_ON(PageAgain(page));
+
readpage:
/* ... and start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1206,6 +1210,8 @@ page_not_uptodate:
goto success;
}

+ BUG_ON(PageAgain(page));
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1314,6 +1320,8 @@ page_not_uptodate:
goto success;
}

+ BUG_ON(PageAgain(page));
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1518,6 +1526,8 @@ retry:
unlock_page(page);
goto out;
}
+ BUG_ON(PageAgain(page));
+
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
--- linux-2.6.7.ORG/mm/memory.c 2004-06-16 14:19:22.000000000 +0900
+++ linux-2.6.7/mm/memory.c 2004-06-17 16:26:50.000000000 +0900
@@ -1305,6 +1305,7 @@ static int do_swap_page(struct mm_struct

pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
+again:
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1332,6 +1333,12 @@ static int do_swap_page(struct mm_struct

mark_page_accessed(page);
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ BUG_ON(PageAgain(page));

/*
* Back out if somebody else faulted in this pte while we
--- linux-2.6.7.ORG/mm/page_alloc.c 2004-06-16 14:18:57.000000000 +0900
+++ linux-2.6.7/mm/page_alloc.c 2004-06-17 16:31:41.000000000 +0900
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
+#include <linux/memhotplug.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
@@ -231,6 +232,7 @@ static inline void free_pages_check(cons
1 << PG_maplock |
1 << PG_anon |
1 << PG_swapcache |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(function, page);
if (PageDirty(page))
@@ -341,12 +343,13 @@ static void prep_new_page(struct page *p
1 << PG_maplock |
1 << PG_anon |
1 << PG_swapcache |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);

page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_checked | 1 << PG_mappedtodisk);
+ 1 << PG_checked | 1 << PG_mappedtodisk | 1 << PG_again);
page->private = 0;
set_page_refs(page, order);
}
@@ -404,7 +407,7 @@ static int rmqueue_bulk(struct zone *zon
return allocated;
}

-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMHOTPLUG)
static void __drain_pages(unsigned int cpu)
{
struct zone *zone;
@@ -447,7 +450,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif

+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUG)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -847,7 +852,8 @@ unsigned int nr_free_pages(void)
struct zone *zone;

for_each_zone(zone)
- sum += zone->free_pages;
+ if (zone->zone_pgdat->enabled)
+ sum += zone->free_pages;

return sum;
}
@@ -860,7 +866,8 @@ unsigned int nr_used_zone_pages(void)
struct zone *zone;

for_each_zone(zone)
- pages += zone->nr_active + zone->nr_inactive;
+ if (zone->zone_pgdat->enabled)
+ pages += zone->nr_active + zone->nr_inactive;

return pages;
}
@@ -887,6 +894,8 @@ static unsigned int nr_free_zone_pages(i
struct zone **zonep = zonelist->zones;
struct zone *zone;

+ if (!pgdat->enabled)
+ continue;
for (zone = *zonep++; zone; zone = *zonep++) {
unsigned long size = zone->present_pages;
unsigned long high = zone->pages_high;
@@ -921,7 +930,8 @@ unsigned int nr_free_highpages (void)
unsigned int pages = 0;

for_each_pgdat(pgdat)
- pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ if (pgdat->enabled)
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

return pages;
}
@@ -1171,13 +1181,21 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
+
+ if (!pgdat->enabled)
+ return j;
+ if (k != ZONE_HOTREMOVABLE &&
+ pgdat->removable)
+ return j;
+
switch (k) {
struct zone *zone;
default:
BUG();
case ZONE_HIGHMEM:
+ case ZONE_HOTREMOVABLE:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->present_pages) {
#ifndef CONFIG_HIGHMEM
@@ -1304,24 +1322,48 @@ static void __init build_zonelists(pg_da

#else /* CONFIG_NUMA */

-static void __init build_zonelists(pg_data_t *pgdat)
+static void build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+ int hotremovable;
+#ifdef CONFIG_MEMHOTPLUG
+ struct zone *zone;
+#endif

local_node = pgdat->node_id;
- for (i = 0; i < MAX_NR_ZONES; i++) {
+ for (i = 0; i < MAX_NR_ZONELISTS; i++) {
struct zonelist *zonelist;

zonelist = pgdat->node_zonelists + i;
- memset(zonelist, 0, sizeof(*zonelist));
+ /* memset(zonelist, 0, sizeof(*zonelist)); */

j = 0;
k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
+ hotremovable = 0;
+ switch (i) {
+ default:
+ BUG();
+ return;
+ case 0:
+ k = ZONE_NORMAL;
+ break;
+ case __GFP_HIGHMEM:
k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
+ break;
+ case __GFP_DMA:
k = ZONE_DMA;
+ break;
+ case __GFP_HOTREMOVABLE:
+#ifdef CONFIG_MEMHOTPLUG
+ k = ZONE_HIGHMEM;
+#else
+ k = ZONE_HOTREMOVABLE;
+#endif
+ hotremovable = 1;
+ break;
+ }

+#ifndef CONFIG_MEMHOTPLUG
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1335,19 +1377,54 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
for (node = 0; node < local_node; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-
- zonelist->zones[j] = NULL;
- }
+#else
+ while (hotremovable >= 0) {
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ for (node = local_node; ;) {
+ if (NODE_DATA(node) == NULL ||
+ !NODE_DATA(node)->enabled ||
+ (!!NODE_DATA(node)->removable) !=
+ (!!hotremovable))
+ goto next;
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ next:
+ node = (node + 1) % numnodes;
+ if (node == local_node)
+ break;
+ }
+ }
+ if (hotremovable) {
+ /* place non-hotremovable after hotremovable */
+ k = ZONE_HIGHMEM;
+ }
+ hotremovable--;
+ }
+#endif
+ BUG_ON(j > sizeof(zonelist->zones) /
+ sizeof(zonelist->zones[0]) - 1);
+ for(; j < sizeof(zonelist->zones) /
+ sizeof(zonelist->zones[0]); j++)
+ zonelist->zones[j] = NULL;
+ }
}

#endif /* CONFIG_NUMA */

-void __init build_all_zonelists(void)
+#ifdef CONFIG_MEMHOTPLUG
+void
+#else
+void __init
+#endif
+build_all_zonelists(void)
{
int i;

for(i = 0 ; i < numnodes ; i++)
- build_zonelists(NODE_DATA(i));
+ if (NODE_DATA(i) != NULL)
+ build_zonelists(NODE_DATA(i));
printk("Built %i zonelists\n", numnodes);
}

@@ -1419,7 +1496,7 @@ static void __init calculate_zone_totalp
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
+void memmap_init_zone(struct page *start, unsigned long size, int nid,
unsigned long zone, unsigned long start_pfn)
{
struct page *page;
@@ -1457,10 +1534,13 @@ static void __init free_area_init_core(s
int cpu, nid = pgdat->node_id;
struct page *lmem_map = pgdat->node_mem_map;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+#ifdef CONFIG_MEMHOTPLUG
+ int cold = !nid;
+#endif

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
-
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
@@ -1530,6 +1610,13 @@ static void __init free_area_init_core(s
zone->wait_table_size = wait_table_size(size);
zone->wait_table_bits =
wait_table_bits(zone->wait_table_size);
+#ifdef CONFIG_MEMHOTPLUG
+ if (!cold)
+ zone->wait_table = (wait_queue_head_t *)
+ kmalloc(zone->wait_table_size
+ * sizeof(wait_queue_head_t), GFP_KERNEL);
+ else
+#endif
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, zone->wait_table_size
* sizeof(wait_queue_head_t));
@@ -1584,6 +1671,13 @@ static void __init free_area_init_core(s
*/
bitmap_size = (size-1) >> (i+4);
bitmap_size = LONG_ALIGN(bitmap_size+1);
+#ifdef CONFIG_MEMHOTPLUG
+ if (!cold) {
+ zone->free_area[i].map =
+ (unsigned long *)kmalloc(bitmap_size, GFP_KERNEL);
+ memset(zone->free_area[i].map, 0, bitmap_size);
+ } else
+#endif
zone->free_area[i].map =
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
@@ -1901,7 +1995,7 @@ static void setup_per_zone_protection(vo
* that the pages_{min,low,high} values for each zone are set correctly
* with respect to min_free_kbytes.
*/
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
--- linux-2.6.7.ORG/mm/rmap.c 2004-06-16 14:20:03.000000000 +0900
+++ linux-2.6.7/mm/rmap.c 2004-06-17 11:39:12.000000000 +0900
@@ -30,6 +30,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/memhotplug.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
@@ -421,7 +422,8 @@ void page_remove_rmap(struct page *page)
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ struct list_head *force)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -429,6 +431,9 @@ static int try_to_unmap_one(struct page
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
+#ifdef CONFIG_MEMHOTPLUG
+ struct page_va_list *vlist;
+#endif
int ret = SWAP_AGAIN;

if (!mm->rss)
@@ -466,8 +471,22 @@ static int try_to_unmap_one(struct page
*/
if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
ptep_test_and_clear_young(pte)) {
- ret = SWAP_FAIL;
- goto out_unmap;
+ if (force == NULL || vma->vm_flags & VM_RESERVED) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+#ifdef CONFIG_MEMHOTPLUG
+ vlist = kmalloc(sizeof(struct page_va_list), GFP_KERNEL);
+ atomic_inc(&mm->mm_count);
+ vlist->mm = mmgrab(mm);
+ if (vlist->mm == NULL) {
+ mmdrop(mm);
+ kfree(vlist);
+ } else {
+ vlist->addr = address;
+ list_add(&vlist->list, force);
+ }
+#endif
}

/*
@@ -620,7 +639,7 @@ out_unlock:
return SWAP_AGAIN;
}

-static inline int try_to_unmap_anon(struct page *page)
+static inline int try_to_unmap_anon(struct page *page, struct list_head *force)
{
struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
struct vm_area_struct *vma;
@@ -629,7 +648,7 @@ static inline int try_to_unmap_anon(stru
spin_lock(&anon_vma->lock);
BUG_ON(list_empty(&anon_vma->head));
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- ret = try_to_unmap_one(page, vma);
+ ret = try_to_unmap_one(page, vma, force);
if (ret == SWAP_FAIL || !page->mapcount)
break;
}
@@ -649,7 +668,7 @@ static inline int try_to_unmap_anon(stru
* The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
* return a temporary error.
*/
-static inline int try_to_unmap_file(struct page *page)
+static inline int try_to_unmap_file(struct page *page, struct list_head *force)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -666,7 +685,7 @@ static inline int try_to_unmap_file(stru

while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
&iter, pgoff, pgoff)) != NULL) {
- ret = try_to_unmap_one(page, vma);
+ ret = try_to_unmap_one(page, vma, force);
if (ret == SWAP_FAIL || !page->mapcount)
goto out;
}
@@ -760,7 +779,7 @@ out:
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
*/
-int try_to_unmap(struct page *page)
+int try_to_unmap(struct page *page, struct list_head *force)
{
int ret;

@@ -769,9 +788,9 @@ int try_to_unmap(struct page *page)
BUG_ON(!page->mapcount);

if (PageAnon(page))
- ret = try_to_unmap_anon(page);
+ ret = try_to_unmap_anon(page, force);
else
- ret = try_to_unmap_file(page);
+ ret = try_to_unmap_file(page, force);

if (!page->mapcount) {
if (page_test_and_clear_dirty(page))
--- linux-2.6.7.ORG/mm/swapfile.c 2004-06-16 14:19:01.000000000 +0900
+++ linux-2.6.7/mm/swapfile.c 2004-06-17 11:39:12.000000000 +0900
@@ -658,6 +658,7 @@ static int try_to_unuse(unsigned int typ
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
+ again:
page = read_swap_cache_async(entry, NULL, 0);
if (!page) {
/*
@@ -692,6 +693,11 @@ static int try_to_unuse(unsigned int typ
wait_on_page_locked(page);
wait_on_page_writeback(page);
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
wait_on_page_writeback(page);

/*
@@ -800,6 +806,7 @@ static int try_to_unuse(unsigned int typ

swap_writepage(page, &wbc);
lock_page(page);
+ BUG_ON(PageAgain(page));
wait_on_page_writeback(page);
}
if (PageSwapCache(page)) {
--- linux-2.6.7.ORG/mm/truncate.c 2004-06-16 14:20:04.000000000 +0900
+++ linux-2.6.7/mm/truncate.c 2004-06-17 11:39:12.000000000 +0900
@@ -132,6 +132,8 @@ void truncate_inode_pages(struct address
next++;
if (TestSetPageLocked(page))
continue;
+ /* no PageAgain(page) check; page->mapping check
+ * is done in truncate_complete_page */
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -165,6 +167,24 @@ void truncate_inode_pages(struct address
struct page *page = pvec.pages[i];

lock_page(page);
+ if (page->mapping == NULL) {
+ /* XXX Is page->index still valid? */
+ unsigned long index = page->index;
+ int again = PageAgain(page);
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ if (page == NULL) {
+ BUG_ON(again);
+ /* XXX */
+ if (page->index > next)
+ next = page->index;
+ next++;
+ }
+ BUG_ON(!again);
+ pvec.pages[i] = page;
+ }
wait_on_page_writeback(page);
if (page->index > next)
next = page->index;
@@ -257,14 +277,29 @@ void invalidate_inode_pages2(struct addr
struct page *page = pvec.pages[i];

lock_page(page);
- if (page->mapping == mapping) { /* truncate race? */
- wait_on_page_writeback(page);
- next = page->index + 1;
- if (page_mapped(page))
- clear_page_dirty(page);
- else
- invalidate_complete_page(mapping, page);
+ while (page->mapping != mapping) {
+ struct page *newpage;
+ unsigned long index = page->index;
+
+ BUG_ON(page->mapping != NULL);
+
+ unlock_page(page);
+ newpage = find_lock_page(mapping, index);
+ if (page == newpage) {
+ put_page(page);
+ break;
+ }
+ BUG_ON(!PageAgain(page));
+ pvec.pages[i] = newpage;
+ put_page(page);
+ page = newpage;
}
+ wait_on_page_writeback(page);
+ next = page->index + 1;
+ if (page_mapped(page))
+ clear_page_dirty(page);
+ else
+ invalidate_complete_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
--- linux-2.6.7.ORG/mm/vmscan.c 2004-06-16 14:18:58.000000000 +0900
+++ linux-2.6.7/mm/vmscan.c 2004-06-17 11:39:12.000000000 +0900
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/kthread.h>

#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -197,11 +198,6 @@ static inline int page_mapping_inuse(str
return mapping_mapped(mapping);
}

-static inline int is_page_cache_freeable(struct page *page)
-{
- return page_count(page) - !!PagePrivate(page) == 2;
-}
-
static int may_write_to_queue(struct backing_dev_info *bdi)
{
if (current_is_kswapd())
@@ -399,7 +395,7 @@ static int shrink_list(struct list_head
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ switch (try_to_unmap(page, NULL)) {
case SWAP_FAIL:
page_map_unlock(page);
goto activate_locked;
@@ -1134,6 +1130,8 @@ int kswapd(void *p)
if (current->flags & PF_FREEZE)
refrigerator(PF_FREEZE);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ return 0;
schedule();
finish_wait(&pgdat->kswapd_wait, &wait);

@@ -1217,4 +1215,14 @@ static int __init kswapd_init(void)
return 0;
}

+#ifdef CONFIG_MEMHOTPLUG
+void
+kswapd_start_one(pg_data_t *pgdat)
+{
+ pgdat->kswapd = kthread_create(kswapd, pgdat, "kswapd%d",
+ pgdat->node_id);
+ total_memory = nr_free_pagecache_pages();
+}
+#endif
+
module_init(kswapd_init)
--- linux-2.6.7.ORG/include/linux/memhotplug.h 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.7/include/linux/memhotplug.h 2004-06-17 11:39:12.000000000 +0900
@@ -0,0 +1,32 @@
+#ifndef _LINUX_MEMHOTPLUG_H
+#define _LINUX_MEMHOTPLUG_H
+
+#include <linux/config.h>
+#include <linux/mm.h>
+
+#ifdef __KERNEL__
+
+struct page_va_list {
+ struct mm_struct *mm;
+ unsigned long addr;
+ struct list_head list;
+};
+
+struct remap_operations {
+ struct page * (*remap_alloc_page)(int);
+ int (*remap_delete_page)(struct page *);
+ int (*remap_copy_page)(struct page *, struct page *);
+ int (*remap_lru_add_page)(struct page *, int);
+ int (*remap_release_buffers)(struct page *);
+ int (*remap_prepare)(struct page *page, int fastmode);
+ int (*remap_stick_page)(struct list_head *vlist);
+};
+
+extern int remapd(void *p);
+extern int remap_onepage(struct page *, int, int, struct remap_operations *);
+extern int remap_onepage_normal(struct page *, int, int);
+
+#define REMAP_ANYNODE (-1)
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_MEMHOTPLUG_H */
--- linux-2.6.7.ORG/mm/memhotplug.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.7/mm/memhotplug.c 2004-06-17 11:39:12.000000000 +0900
@@ -0,0 +1,708 @@
+/*
+ * linux/mm/memhotplug.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Support of memory hotplug, Iwamoto
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/rmap.h>
+#include <linux/memhotplug.h>
+
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif
+
+static void
+print_buffer(struct page* page)
+{
+ struct address_space* mapping = page_mapping(page);
+ struct buffer_head *bh, *head;
+
+ spin_lock(&mapping->private_lock);
+ bh = head = page_buffers(page);
+ printk("buffers:");
+ do {
+ printk(" %lx %d", bh->b_state, atomic_read(&bh->b_count));
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ printk("\n");
+ spin_unlock(&mapping->private_lock);
+}
+
+static int
+stick_mlocked_page(struct list_head *vlist)
+{
+ struct page_va_list *v1;
+ struct vm_area_struct *vma;
+ int error;
+
+ while(!list_empty(vlist)) {
+ v1 = list_entry(vlist->next, struct page_va_list, list);
+ list_del(&v1->list);
+ down_read(&v1->mm->mmap_sem);
+ vma = find_vma(v1->mm, v1->addr);
+ if (vma == NULL || !(vma->vm_flags & VM_LOCKED))
+ goto out;
+ error = get_user_pages(current, v1->mm, v1->addr, PAGE_SIZE,
+ (vma->vm_flags & VM_WRITE) != 0, 0, NULL, NULL);
+ out:
+ up_read(&v1->mm->mmap_sem);
+ mmput(v1->mm);
+ kfree(v1);
+ }
+ return 0;
+}
+
+/* helper function for remap_onepage */
+#define REMAPPREP_WB 1
+#define REMAPPREP_BUFFER 2
+
+/*
+ * Try to free buffers if "page" has them.
+ */
+static int
+remap_preparepage(struct page *page, int fastmode)
+{
+ struct address_space *mapping;
+ int waitcnt = fastmode ? 0 : 100;
+
+ BUG_ON(!PageLocked(page));
+
+ mapping = page_mapping(page);
+
+ if (!PagePrivate(page) && PageWriteback(page) &&
+ !PageSwapCache(page)) {
+ printk("remap_preparepage: mapping %p page %p\n",
+ page->mapping, page);
+ return -REMAPPREP_WB;
+ }
+
+ while (PageWriteback(page)) {
+ if (!waitcnt)
+ return -REMAPPREP_WB;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ waitcnt--;
+ }
+ if (PagePrivate(page)) {
+ /* XXX copied from shrink_list() */
+ if (PageDirty(page) &&
+ is_page_cache_freeable(page) &&
+ mapping != NULL &&
+ mapping->a_ops->writepage != NULL) {
+ spin_lock_irq(&mapping->tree_lock);
+ if (clear_page_dirty_for_io(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1,
+ };
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res < 0)
+ /* not implemented. help */
+ BUG();
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ return -REMAPPREP_WB;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ lock_page(page);
+ mapping = page_mapping(page);
+ if (!PagePrivate(page))
+ return 0;
+ } else
+ spin_unlock_irq(&mapping->tree_lock);
+ }
+
+ while (1) {
+ if (try_to_release_page(page, GFP_KERNEL))
+ break;
+ if (!waitcnt)
+ return -REMAPPREP_BUFFER;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ waitcnt--;
+ if (!waitcnt)
+ print_buffer(page);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Just assign swap space to a anonymous page if it doesn't have yet,
+ * so that the page can be handled like a page in the page cache
+ * since it in the swap cache.
+ */
+static struct address_space *
+make_page_mapped(struct page *page)
+{
+ if (!page_mapped(page)) {
+ if (page_count(page) > 1)
+ printk("page %p not mapped: count %d\n",
+ page, page_count(page));
+ return NULL;
+ }
+ /* The page is an anon page. Allocate its swap entry. */
+ page_map_unlock(page);
+ add_to_swap(page);
+ page_map_lock(page);
+ return page_mapping(page);
+}
+
+/*
+ * Replace "page" with "newpage" on the radix tree. After that, all
+ * new access to "page" will be redirected to "newpage" and it
+ * will be blocked until remapping has been done.
+ */
+static int
+radix_tree_replace_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ if (radix_tree_preload(GFP_KERNEL))
+ return -1;
+
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock_irq(&mapping->tree_lock);
+ if (mapping != page_mapping(page))
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, page_mapping(page), page);
+ if (radix_tree_delete(&mapping->page_tree,
+ PageSwapCache(page) ? page->private : page->index) == NULL) {
+ /* Page truncated. */
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ return -1;
+ }
+ /* Don't __put_page(page) here. Truncate may be in progress. */
+ newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+ ~(1 << PG_highmem) & ~(1 << PG_anon) &
+ ~(1 << PG_maplock) &
+ ~(1 << PG_active) & ~(~0UL << NODEZONE_SHIFT);
+
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree,
+ PageSwapCache(page) ? page->private : page->index, newpage);
+ page_cache_get(newpage);
+ newpage->index = page->index;
+ if (PageSwapCache(page))
+ newpage->private = page->private;
+ else
+ newpage->mapping = page->mapping;
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ return 0;
+}
+
+/*
+ * Remove all PTE mappings to "page".
+ */
+static int
+unmap_page(struct page *page, struct list_head *vlist)
+{
+ int error = SWAP_SUCCESS;
+
+ page_map_lock(page);
+ while (page_mapped(page) &&
+ (error = try_to_unmap(page, vlist)) == SWAP_AGAIN) {
+ page_map_unlock(page);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ __set_current_state(TASK_RUNNING);
+ page_map_lock(page);
+ }
+ page_map_unlock(page);
+ if (error == SWAP_FAIL) {
+ /* either during mremap or mlocked */
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Wait for "page" to become free. Almost same as waiting for its
+ * page count to drop to 2, but truncated pages are special.
+ */
+static int
+wait_on_page_freeable(struct page *page, struct address_space *mapping,
+ struct list_head *vlist, int truncated,
+ int nretry, struct remap_operations *ops)
+{
+ struct address_space *mapping1;
+
+ while ((truncated + page_count(page)) > 2) {
+ if (nretry <= 0)
+ return -1;
+ /* no lock needed while waiting page count */
+ unlock_page(page);
+
+ while ((truncated + page_count(page)) > 2) {
+ nretry--;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((nretry % 5000) == 0) {
+ printk("remap_onepage: still waiting on %p %d\n", page, nretry);
+ break;
+ }
+ if (PagePrivate(page) || page_mapped(page))
+ break; /* see below */
+ }
+
+ lock_page(page);
+ BUG_ON(page_count(page) == 0);
+ mapping1 = page_mapping(page);
+ if (mapping != mapping1 && mapping1 != NULL)
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, mapping1, page);
+ if (PagePrivate(page))
+ ops->remap_release_buffers(page);
+ unmap_page(page, vlist);
+ }
+ return nretry;
+}
+
+/*
+ * A file which "page" belongs to has been truncated. Free both pages.
+ */
+static void
+free_truncated_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ void *p;
+ /* mapping->tree_lock must be held. */
+ p = radix_tree_lookup(&mapping->page_tree,
+ PageSwapCache(newpage) ? newpage->private : newpage->index);
+ if (p != NULL) {
+ /* new cache page appeared after truncation */
+ printk("page %p newpage %p radix %p\n",
+ page, newpage, p);
+ BUG_ON(p == newpage);
+ }
+ BUG_ON(page_mapping(page) != NULL);
+ put_page(newpage);
+ if (page_count(newpage) != 1) {
+ printk("newpage count %d != 1, %p\n",
+ page_count(newpage), newpage);
+ BUG();
+ }
+ /* No need to do page->list. remove_from_page_cache did. */
+ newpage->mapping = page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ ClearPageActive(page);
+ ClearPageActive(newpage);
+ ClearPageSwapCache(page);
+ ClearPageSwapCache(newpage);
+ unlock_page(page);
+ unlock_page(newpage);
+ put_page(newpage);
+}
+
+static inline int
+is_page_truncated(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ void *p;
+ spin_lock_irq(&mapping->tree_lock);
+ if (page_count(page) == 1) {
+ /* page has been truncated. */
+ return 0;
+ }
+ p = radix_tree_lookup(&mapping->page_tree,
+ PageSwapCache(newpage) ? newpage->private : newpage->index);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (p == NULL) {
+ BUG_ON(page->mapping != NULL);
+ return -1;
+ }
+ return 1;
+}
+
+/*
+ * Replace "page" with "newpage" on the list of clean/dirty pages.
+ */
+static void
+remap_exchange_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ if (PageDirty(page))
+ set_page_dirty(newpage);
+ page->mapping = NULL;
+ unlock_page(page);
+
+ __put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+}
+
+/*
+ * Roll back all remapping operations.
+ */
+static int
+radix_tree_rewind_page(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ int waitcnt;
+ unsigned long index;
+
+ /*
+ * Try to unwind by notifying waiters. If someone misbehaves,
+ * we die.
+ */
+ if (radix_tree_preload(GFP_KERNEL))
+ BUG();
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock_irq(&mapping->tree_lock);
+ /* list_del(&newpage->list); */
+ index = PageSwapCache(page) ? page->private : page->index;
+ if (radix_tree_delete(&mapping->page_tree, index) == NULL)
+ /* Hold extra count to handle truncate */
+ page_cache_get(newpage);
+ radix_tree_insert(&mapping->page_tree, index, page);
+ /* no page_cache_get(page); needed */
+ radix_tree_preload_end();
+ spin_unlock_irq(&mapping->tree_lock);
+
+ SetPageAgain(newpage);
+ newpage->mapping = NULL;
+ /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */
+ unlock_page(newpage);
+
+ waitcnt = HZ;
+ for(; page_count(newpage) > 2; waitcnt--) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(10);
+ if (waitcnt == 0) {
+ printk("You are hosed.\n");
+ printk("newpage %p flags %lx %d %d, page %p flags %lx %d\n",
+ newpage, newpage->flags, page_count(newpage),
+ newpage->mapcount,
+ page, page->flags, page_count(page));
+ BUG();
+ }
+ }
+ BUG_ON(PageUptodate(newpage));
+ ClearPageDirty(newpage);
+ ClearPageActive(newpage);
+ spin_lock_irq(&mapping->tree_lock);
+ if (page_count(newpage) == 1) {
+ printk("newpage %p truncated. page %p\n", newpage, page);
+ BUG();
+ }
+ spin_unlock_irq(&mapping->tree_lock);
+ unlock_page(page);
+ BUG_ON(page_count(newpage) != 2);
+ ClearPageAgain(newpage);
+ __put_page(newpage);
+ return 1;
+}
+
+/*
+ * Allocate a new page from specified node.
+ */
+static struct page *
+remap_alloc_page(int nid)
+{
+ if (nid == REMAP_ANYNODE)
+ return alloc_page(GFP_HIGHUSER);
+ else
+ return alloc_pages_node(nid, GFP_HIGHUSER, 0);
+}
+
+static int
+remap_delete_page(struct page *page)
+{
+ BUG_ON(page_count(page) != 1);
+ put_page(page);
+ return 0;
+}
+
+static int
+remap_copy_page(struct page *to, struct page *from)
+{
+ copy_highpage(to, from);
+ return 0;
+}
+
+static int
+remap_lru_add_page(struct page *page, int active)
+{
+ if (active)
+ lru_cache_add_active(page);
+ else
+ lru_cache_add(page);
+ return 0;
+}
+
+static int
+remap_release_buffer(struct page *page)
+{
+ try_to_release_page(page, GFP_KERNEL);
+ return 0;
+}
+
+static struct remap_operations remap_ops = {
+ .remap_alloc_page = remap_alloc_page,
+ .remap_delete_page = remap_delete_page,
+ .remap_copy_page = remap_copy_page,
+ .remap_lru_add_page = remap_lru_add_page,
+ .remap_release_buffers = remap_release_buffer,
+ .remap_prepare = remap_preparepage,
+ .remap_stick_page = stick_mlocked_page
+};
+
+/*
+ * Try to remap a page. Returns non-zero on failure.
+ */
+int remap_onepage(struct page *page, int nodeid, int fastmode,
+ struct remap_operations *ops)
+{
+ struct page *newpage;
+ struct address_space *mapping;
+ LIST_HEAD(vlist);
+ int truncated = 0;
+ int nretry = fastmode ? HZ/50: HZ*10; /* XXXX */
+
+ if ((newpage = ops->remap_alloc_page(nodeid)) == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+
+ if (ops->remap_prepare && ops->remap_prepare(page, fastmode))
+ goto radixfail;
+ page_map_lock(page);
+ if (PageAnon(page) && !PageSwapCache(page))
+ make_page_mapped(page);
+ mapping = page_mapping(page);
+ page_map_unlock(page);
+ if (mapping == NULL)
+ goto radixfail;
+ if (radix_tree_replace_pages(page, newpage, mapping))
+ goto radixfail;
+ if (unmap_page(page, &vlist))
+ goto unmapfail;
+ if (PagePrivate(page))
+ printk("buffer reappeared\n");
+wait_again:
+ if ((nretry = wait_on_page_freeable(page, mapping, &vlist, truncated, nretry, ops)) < 0)
+ goto unmapfail;
+
+ if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+ KDB_ENTER();
+#else
+ BUG();
+#endif
+ switch (is_page_truncated(page, newpage, mapping)) {
+ case 0:
+ /* has been truncated */
+ free_truncated_pages(page, newpage, mapping);
+ ops->remap_delete_page(page);
+ return 0;
+ case -1:
+ /* being truncated */
+ truncated = 1;
+ BUG_ON(page_mapping(page) != NULL);
+ goto wait_again;
+ default:
+ /* through */
+ }
+
+ BUG_ON(mapping != page_mapping(page));
+
+ ops->remap_copy_page(newpage, page);
+ remap_exchange_pages(page, newpage, mapping);
+ if (ops->remap_lru_add_page)
+ ops->remap_lru_add_page(newpage, PageActive(page));
+ ClearPageActive(page);
+ ClearPageSwapCache(page);
+ ops->remap_delete_page(page);
+
+ /*
+ * Wake up all waiters which are waiting for completion
+ * of remapping operations.
+ */
+ unlock_page(newpage);
+
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ radix_tree_rewind_page(page, newpage, mapping);
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ ClearPageActive(newpage);
+ ClearPageSwapCache(newpage);
+ ops->remap_delete_page(newpage);
+ return 1;
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ ops->remap_delete_page(newpage);
+ return 1;
+}
+
+int remap_onepage_normal(struct page *page, int nodeid, int fastmode)
+{
+ return remap_onepage(page, nodeid, fastmode, &remap_ops);
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+ int cpu = get_cpu();
+
+ schedule_work(&lru_drain_wq[cpu]);
+ put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page, *page1;
+ struct list_head *l;
+ int active, i, nr_failed = 0;
+ int fastmode = 100;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ if (atomic_read(&remapd_count) > 0) {
+ printk("remapd already running\n");
+ return 0;
+ }
+ atomic_inc(&remapd_count);
+ on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(active = 0; active < 2; active++) {
+ l = active ? &zone->active_list :
+ &zone->inactive_list;
+ for(i = 0; !list_empty(l) && i < 10; i++) {
+ page = list_entry(l->prev, struct page, lru);
+ if (fastmode && PageLocked(page)) {
+ page1 = page;
+ while (fastmode && PageLocked(page)) {
+ page =
+ list_entry(page->lru.prev,
+ struct page, lru);
+ fastmode--;
+ if (&page->lru == l) {
+ /* scanned the whole
+ list */
+ page = page1;
+ break;
+ }
+ if (page == page1)
+ BUG();
+ }
+ if (!fastmode) {
+ printk("used up fastmode\n");
+ page = page1;
+ }
+ }
+ if (!TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, l);
+ continue;
+ }
+ if (active)
+ zone->nr_active--;
+ else
+ zone->nr_inactive--;
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page, REMAP_ANYNODE, fastmode, &remap_ops)) {
+ nr_failed++;
+ if (fastmode)
+ fastmode--;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ goto out;
+
+ while (!list_empty(&failedp)) {
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (!TestSetPageLocked(page)) {
+ if (remap_preparepage(page, 10 /* XXX */)) {
+ unlock_page(page);
+ } else {
+ ClearPageLocked(page); /* XXX */
+ if (!remap_onepage(page, REMAP_ANYNODE, 0, &remap_ops))
+ continue;
+ }
+ }
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ if (TestSetPageLRU(page))
+ BUG();
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(page);
+ }
+out:
+ atomic_dec(&remapd_count);
+ return 0;
+}
+
+static int __init remapd_init(void)
+{
+ int i;
+
+ for(i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&lru_drain_wq[i], (void (*)(void *))lru_add_drain, NULL);
+ return 0;
+}
+
+module_init(remapd_init);


2004-06-30 11:32:49

by Arjan van de Ven

Subject: Re: new memory hotremoval patch


> Page "remapping" is a mechanism to free a specified page by copying the
> page content to a newly allocated replacement page and redirecting
> references to the original page to the new page.
> This was designed to reliably free specified pages, unlike the swapout
> code.

are you 100% sure the locking is correct wrt O_DIRECT, AIO or futexes ??



2004-06-30 14:38:19

by Hirokazu Takahashi

Subject: Re: [Lhms-devel] Re: new memory hotremoval patch

Hello,

> > Page "remapping" is a mechanism to free a specified page by copying the
> > page content to a newly allocated replacement page and redirecting
> > references to the original page to the new page.
> > This was designed to reliably free specified pages, unlike the swapout
> > code.
>
> are you 100% sure the locking is correct wrt O_DIRECT, AIO or futexes ??

Sure, it can handle that!
And it can handle pages on RAMDISK and sysfs and so on.


Thank you,
Hirokazu Takahashi.



2004-06-30 23:32:02

by Dave Hansen

Subject: Re: [Lhms-devel] new memory hotremoval patch

On Wed, 2004-06-30 at 04:17, IWAMOTO Toshihiro wrote:
> Hi,
>
> this is an updated version of my memory hotremoval patch.
> I'll only include the main patch which contains page remapping code.
> The other two files, which haven't changed much from April, can be
> found at http://people.valinux.co.jp/~iwamoto/mh.html .

I tried your code and it oopsed pretty fast:

NUMA - single node, flat memory mode, but broken in several blocks
Rounding down maxpfn 950265 -> 949248
node 0 start 0
node 1 start 65536
node 2 start 131072
node 3 start 196608
node 4 start 262144
node 5 start 327680
node 6 start 393216
node 7 start 458752
physnode_map 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
Warning only 1920MB will be used.
Use a HIGHMEM enabled kernel.
1892MB LOWMEM available.
min_low_pfn = 1080, max_low_pfn = 484352, highstart_pfn = 0
Low memory ends at vaddr f6400000
node 0 will remap to vaddr f6400000 -
High memory starts at vaddr 80000000
found SMP MP-table at 0009e1d0
On node 0 totalpages: 65536
DMA zone: 4096 pages, LIFO batch:1
Normal zone: 61440 pages, LIFO batch:15
HighMem zone: 0 pages, LIFO batch:1
...
Node 2 not plugged
Node 5 not plugged
Zone 5 not plugged
Node 8 not plugged
Zone 8 not plugged
Node 2 not plugged
Node 5 not plugged
Node 8 not plugged
disable 11
Unable to handle kernel NULL pointer dereference at virtual address 000003a7
printing eip:
801349b5
*pde = 00000000
Oops: 0002 [#1]
SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 0
EIP: 0060:[<801349b5>] Not tainted
EFLAGS: 00010206 (2.6.7)
EIP is at mhtest_disable+0x49/0xa0
eax: 0000039f ebx: 0000001f ecx: 00000380 edx: 000000b0
esi: 00000000 edi: 80425ea0 ebp: 0000002c esp: 8c60beec
ds: 007b es: 007b ss: 0068
Process sh (pid: 1842, threadinfo=8c60a000 task=8e549a70)
Stack: 0000000b 0000002c 802cad39 802cad38 00000001 80134dd4 0000000b 00000000
8f9b2f64 8f9b2f88 0000000b 8c60bf28 802cacb0 0000002c 0000000b 61736964
00656c62 000a3131 0000000a 8e4fee58 00000001 8c496f64 0000000a 00000000
Call Trace:
[<80134dd4>] mhtest_write+0x150/0x17d
[<8015daa4>] dupfd+0x2c/0x64
[<8015dac6>] dupfd+0x4e/0x64
[<801788a3>] proc_file_write+0x27/0x34
[<8014ef0c>] vfs_write+0xa0/0xd0
[<8014efb9>] sys_write+0x31/0x4c
[<80103c87>] syscall_call+0x7/0xb

Code: c7 40 08 00 00 00 00 c7 40 04 00 00 00 00 89 c8 03 04 3a 83


$ addr2line -e vmlinux 801349b5
/home/dave/work/linux/2.6/2.6.7/linux-2.6.7-iwamoto/mm/page_alloc.c:2205


The script spit out some errors too:

# sh rotate.sh
2:
rotate.sh: line 11: [: too many arguments

5: rotate.sh: line 11: [: too many arguments

8: rotate.sh: line 11: [: too many arguments




-- Dave

2004-07-01 00:11:40

by Dave Hansen

Subject: Re: [Lhms-devel] new memory hotremoval patch

On Wed, 2004-06-30 at 04:17, IWAMOTO Toshihiro wrote:
> Due to struct page changes, the page->mapping == NULL predicate can no
> longer be used for detecting cancellation of an anonymous page
> remapping operation, so the PG_again bit is being used again.
> It may still be possible to kill the PG_again bit, but the priority is
> rather low.

But you reintroduced it everywhere, including for file-backed pages, not
just anonymous pages? Why was this necessary?

-- Dave

2004-07-01 02:10:06

by IWAMOTO Toshihiro

Subject: Re: [Lhms-devel] new memory hotremoval patch

At Wed, 30 Jun 2004 16:31:12 -0700,
Dave Hansen wrote:
>
> On Wed, 2004-06-30 at 04:17, IWAMOTO Toshihiro wrote:
> > Hi,
> >
> > this is an updated version of my memory hotremoval patch.
> > I'll only include the main patch which contains page remapping code.
> > The other two files, which haven't changed much from April, can be
> > found at http://people.valinux.co.jp/~iwamoto/mh.html .
>
> I tried your code and it oopsed pretty fast:

This is because I forgot to update rotate.sh on my web page last night.
I've updated the file; please download it and try again.

It is also necessary to issue several "plug" commands before running
rotate.sh.

# echo plug 1 > /proc/memhotplug
# echo plug 2 > /proc/memhotplug
...


> physnode_map 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
> Warning only 1920MB will be used.
> Use a HIGHMEM enabled kernel.
> 1892MB LOWMEM available.

Did you use 2G/2G split? That config should work too, but I usually
test with CONFIG_HIGHMEM4G and CONFIG_MEMHOTPLUG_BLKSIZE=512.
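
That is, a .config roughly like this (PAE off, since MEMHOTPLUG
depends on !X86_PAE):

    CONFIG_HIGHMEM4G=y
    # CONFIG_X86_PAE is not set
    CONFIG_MEMHOTPLUG=y
    CONFIG_MEMHOTPLUG_BLKSIZE=512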

--
IWAMOTO Toshihiro

2004-07-01 03:06:00

by IWAMOTO Toshihiro

Subject: Re: [Lhms-devel] new memory hotremoval patch

At Wed, 30 Jun 2004 17:11:11 -0700,
Dave Hansen wrote:
>
> On Wed, 2004-06-30 at 04:17, IWAMOTO Toshihiro wrote:
> > Due to struct page changes, the page->mapping == NULL predicate can no
> > longer be used for detecting cancellation of an anonymous page
> > remapping operation, so the PG_again bit is being used again.
> > It may still be possible to kill the PG_again bit, but the priority is
> > rather low.
>
> But, you reintroduced it everywhere, including file-backed pages, not
> just for anonymous pages? Why was this necessary?

Which PG_again check are you talking about?
I think the BUG_ON()s in the file-backed page code should be kept for now.

For swap pages, one possibility is to reserve a special swap entry
constant (SWAP_AGAIN) and check page->private instead of the PageAgain
check, but I'm not sure if this is a good idea.

#define SWAP_AGAIN ~0UL

...

static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
{
	...
again:
	page = lookup_swap_cache(entry);
	...
	mark_page_accessed(page);
	lock_page(page);
	if (page->private == SWAP_AGAIN) {
		...
		goto again;
	}

--
IWAMOTO Toshihiro