Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S262090AbUKVNbJ (ORCPT ); Mon, 22 Nov 2004 08:31:09 -0500
Received: (majordomo@vger.kernel.org) by vger.kernel.org
	id S262097AbUKVNbJ (ORCPT ); Mon, 22 Nov 2004 08:31:09 -0500
Received: from mx1.redhat.com ([66.187.233.31]:2454 "EHLO mx1.redhat.com")
	by vger.kernel.org with ESMTP id S262090AbUKVN2i (ORCPT );
	Mon, 22 Nov 2004 08:28:38 -0500
From: David Howells 
To: torvalds@osdl.org, akpm@osdl.org, wli@holomorphy.com, hch@infradead.org
cc: linux-kernel@vger.kernel.org
Subject: [PATCH] Compound page overhaul
User-Agent: EMH/1.14.1 SEMI/1.14.5 (Awara-Onsen) FLIM/1.14.5 (Demachiyanagi)
	APEL/10.6 Emacs/21.3 (i386-redhat-linux-gnu) MULE/5.0 (SAKAKI)
MIME-Version: 1.0 (generated by SEMI 1.14.5 - "Awara-Onsen")
Content-Type: text/plain; charset=US-ASCII
Date: Mon, 22 Nov 2004 13:27:57 +0000
Message-ID: <11948.1101130077@redhat.com>
Sender: linux-kernel-owner@vger.kernel.org
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 30413
Lines: 1014

The attached patch overhauls compound page handling.

 (1) A new bit flag PG_compound_slave has been added. This is used to mark the
     second+ subpages of a compound page, thus making it quick for get_page()
     and put_page() to determine whether they need to do anything special.

     This could be changed to do horrible things with the page count or to
     abuse the page->lru member instead of eating another page flag.

 (2) Compound page metadata is now always set on high-order pages when
     allocating and always checked when freeing:

	- PG_compound is set on all subpages
	- PG_compound_slave is set on all but the first subpage
	- page[1].index holds the compound page order
	- page[1...N-1].private points to page[0]
	- page[1].mapping may hold a destructor function for put_page()

     This is now done in prep_new_page(). (A rough sketch of this layout
     follows the list.)

 (3) __page_first() is now provided to find the first page of any page set
     (even single page sets).

 (4) A new config option ENHANCED_COMPOUND_PAGES is now available. This is
     only set on !MMU or HUGETLB_PAGE. It causes __page_first() to dereference
     page->private if PG_compound_slave is set.

 (5) __GFP_COMP is no longer required since compound metadata is now always
     set.

 (6) compound_page_order() is now available. This will indicate the order of
     any page, high-order or not. Since it is now trivial to work out the
     order of any page, free_pages() and co could all lose their order
     arguments.

 (7) Trailing spaces have been cleaned up on lines in page_alloc.c.

 (8) bootmem.c now has a separate path (__free_pages_bootmem) to release pages
     to the main allocator that bypasses many of the checks performed on
     struct pages.

     bootmem.c's page releaser could be improved by giving the boot
     allocator's bitmap sufficient bits to make sure bit 0 is 32-page or
     64-page aligned (depending on BITS_PER_LONG), even if the page to which
     it corresponds doesn't actually exist.

 (9) bad_page() now prints more information, including information about more
     pages in the case of a compound page.

(10) prep_compound_page() and destroy_compound_page() have been absorbed into
     the main page allocation and freeing paths.

(11) A lot more unlikely() clauses have been inserted in the free page
     checking functions.

(12) The !MMU bits have all gone from page_alloc.c.

(13) __pagevec_free() has moved to swap.c with all the other pagevec
     functions.

(14) put_page() has moved to page_alloc.c with all the other related
     functions. This could be relegated to a separate file, but since there
     are many other conditionals in page_alloc.c, what's the point?
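To make the layout in (2)-(6) concrete, here is a minimal, self-contained
userspace sketch (illustration only, not part of the patch): struct mock_page
and the mock_*() helper names are invented stand-ins that only mimic the
relevant struct page fields; the real definitions are in the diff below.

/* Illustration only: a userspace mock of the compound-page metadata layout. */
#include <stdio.h>
#include <stdlib.h>

#define MPG_COMPOUND		0x01
#define MPG_COMPOUND_SLAVE	0x02

struct mock_page {
	unsigned long	flags;
	unsigned long	index;		/* page[1].index holds the order */
	unsigned long	private;	/* slaves point back at page[0] */
	void		(*dtor)(struct mock_page *); /* stands in for page[1].mapping */
	int		count;
};

/* roughly what prep_new_page() does for a high-order allocation */
static void mock_prep_compound(struct mock_page *block, unsigned int order)
{
	unsigned long i, nr = 1UL << order;

	block[0].flags |= MPG_COMPOUND;
	block[0].count = 1;
	block[1].index = order;
	block[1].dtor = NULL;			/* no destructor yet */

	for (i = 1; i < nr; i++) {
		block[i].flags |= MPG_COMPOUND | MPG_COMPOUND_SLAVE;
		block[i].private = (unsigned long) &block[0];
	}
}

/* roughly __page_first(): any subpage resolves to the head page */
static struct mock_page *mock_page_first(struct mock_page *page)
{
	if (page->flags & MPG_COMPOUND_SLAVE)
		page = (struct mock_page *) page->private;
	return page;
}

/* roughly compound_page_order(): works on any page, high-order or not */
static unsigned int mock_compound_order(struct mock_page *page)
{
	if (!(page->flags & MPG_COMPOUND))
		return 0;
	return (unsigned int) mock_page_first(page)[1].index;
}

int main(void)
{
	struct mock_page *block = calloc(8, sizeof(*block));

	mock_prep_compound(block, 3);		/* a 2^3 == 8 page block */
	printf("order seen via subpage 5: %u\n", mock_compound_order(&block[5]));
	printf("head of subpage 5: %p (block starts at %p)\n",
	       (void *) mock_page_first(&block[5]), (void *) &block[0]);
	free(block);
	return 0;
}

For an order-3 block this prints order 3 when asked via any subpage and
resolves every subpage back to the head page, which is the lookup that
get_page(), put_page() and compound_page_order() rely on in the patch.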
Signed-Off-By: David Howells

---
warthog>diffstat compound-2610rc2mm3.diff
 include/linux/gfp.h        |    3 
 include/linux/mm.h         |   58 ++++--
 include/linux/page-flags.h |    6 
 init/Kconfig               |   13 +
 mm/bootmem.c               |   28 +--
 mm/hugetlb.c               |    2 
 mm/internal.h              |    3 
 mm/page_alloc.c            |  399 +++++++++++++++++++++++++++------------------
 mm/swap.c                  |   29 +--
 9 files changed, 335 insertions(+), 206 deletions(-)

diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/gfp.h linux-2.6.10-rc2-mm3-frv/include/linux/gfp.h
--- /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/gfp.h	2004-11-22 10:54:16.124855383 +0000
+++ linux-2.6.10-rc2-mm3-frv/include/linux/gfp.h	2004-11-22 11:45:09.983234496 +0000
@@ -36,7 +36,6 @@
 #define __GFP_NOFAIL	0x800	/* Retry for ever. Cannot fail */
 #define __GFP_NORETRY	0x1000	/* Do not retry. Might fail */
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
-#define __GFP_COMP	0x4000	/* Add compound page metadata */
 #define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
 
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
@@ -44,7 +43,7 @@
 /* if you forget to add the bitmask here kernel will crash, period */
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
-			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/mm.h linux-2.6.10-rc2-mm3-frv/include/linux/mm.h
--- /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/mm.h	2004-11-22 10:54:16.344837345 +0000
+++ linux-2.6.10-rc2-mm3-frv/include/linux/mm.h	2004-11-22 11:45:09.987234163 +0000
@@ -227,6 +227,12 @@
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
  * a page.
+ *
+ * Any high-order page allocation has all the pages marked PG_compound. The
+ * first page of such a block holds the block's usage count and control
+ * data. The second page holds the order in its index member and a destructor
+ * function pointer in its mapping member. In enhanced compound page mode, the
+ * second+ pages have their private pointers pointing at the first page.
  */
 struct page {
 	page_flags_t flags;		/* Atomic flags, some possibly
@@ -314,45 +320,65 @@
  */
 #define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, v - 1)
+#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
 
-#ifdef CONFIG_HUGETLB_PAGE
-
-static inline int page_count(struct page *p)
+static inline struct page *__page_first(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+	if (unlikely(PageCompoundSlave(page)))
+		page = (struct page *) page->private;
+#endif
+	return page;
 }
 
-static inline void get_page(struct page *page)
+static inline unsigned compound_page_order(struct page *page)
 {
-	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
-	atomic_inc(&page->_count);
+	unsigned order = 0;
+
+	if (unlikely(PageCompound(page))) {
+		page = __page_first(page);
+		order = page[1].index;
+	}
+	return order;
 }
 
-void put_page(struct page *page);
+typedef void (*page_dtor_t)(struct page *);
 
-#else		/* CONFIG_HUGETLB_PAGE */
+static inline page_dtor_t page_dtor(struct page *page)
+{
+	page_dtor_t dtor = NULL;
 
-#define page_count(p)		(atomic_read(&(p)->_count) + 1)
+	if (unlikely(PageCompound(page))) {
+		page = __page_first(page);
+		dtor = (page_dtor_t) page[1].mapping;
+	}
+	return dtor;
+}
+
+static inline int page_count(struct page *page)
+{
+	page = __page_first(page);
+	return atomic_read(&page->_count) + 1;
+}
 
 static inline void get_page(struct page *page)
 {
+	page = __page_first(page);
 	atomic_inc(&page->_count);
 }
 
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+extern fastcall void put_page(struct page *page);
+#else
 static inline void put_page(struct page *page)
 {
 	if (!PageReserved(page) && put_page_testzero(page))
 		__page_cache_release(page);
 }
-
-#endif		/* CONFIG_HUGETLB_PAGE */
+#endif
 
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/page-flags.h linux-2.6.10-rc2-mm3-frv/include/linux/page-flags.h
--- /warthog/kernels/linux-2.6.10-rc2-mm3/include/linux/page-flags.h	2004-11-22 10:54:16.542821110 +0000
+++ linux-2.6.10-rc2-mm3-frv/include/linux/page-flags.h	2004-11-22 11:45:09.989233996 +0000
@@ -78,6 +78,7 @@
 #define PG_sharedpolicy		19	/* Page was allocated for a file
 					   mapping using a shared_policy */
+#define PG_compound_slave	20	/* second+ page of a compound page */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -294,6 +295,11 @@
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags)
+
+#define PageCompoundSlave(page)	test_bit(PG_compound_slave, &(page)->flags)
+#define SetPageCompoundSlave(page) set_bit(PG_compound_slave, &(page)->flags)
+#define ClearPageCompoundSlave(page) clear_bit(PG_compound_slave, &(page)->flags)
 
 #define PageSharedPolicy(page)	test_bit(PG_sharedpolicy, &(page)->flags)
 #define SetPageSharedPolicy(page) set_bit(PG_sharedpolicy, &(page)->flags)
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/init/Kconfig linux-2.6.10-rc2-mm3-frv/init/Kconfig
--- /warthog/kernels/linux-2.6.10-rc2-mm3/init/Kconfig	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/init/Kconfig	2004-11-22 11:45:10.008232414 +0000
@@ -380,6 +380,19 @@
 	default !SHMEM
 	bool
 
+config ENHANCED_COMPOUND_PAGES
+	bool
+	default HUGETLB_PAGE || !MMU
+	help
+
+	  Enhance management of high-order pages by pointing the 2nd+ pages at
+	  the first. get_page() and put_page() then use the usage count on the
+	  first page to manage all the pages in the block.
+
+	  This is used when it might be necessary to access the intermediate
+	  pages of a block, such as ptrace() might under nommu or hugetlb
+	  conditions.
+
 menu "Loadable module support"
 
 config MODULES
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/mm/bootmem.c linux-2.6.10-rc2-mm3-frv/mm/bootmem.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/bootmem.c	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/mm/bootmem.c	2004-11-22 11:45:10.030230581 +0000
@@ -274,55 +274,59 @@
 	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	map = bdata->node_bootmem_map;
+	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
 	if (bdata->node_boot_start == 0 ||
 	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
 		gofast = 1;
+
 	for (i = 0; i < idx; ) {
 		unsigned long v = ~map[i / BITS_PER_LONG];
+
 		if (gofast && v == ~0UL) {
 			int j, order;
 
 			count += BITS_PER_LONG;
 			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
 			for (j = 1; j < BITS_PER_LONG; j++) {
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
 			}
-			__free_pages(page, order);
+			__free_pages_bootmem(page, order);
 			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
+
 		} else if (v) {
 			unsigned long m;
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
					__ClearPageReserved(page);
-					set_page_refs(page, 0);
-					__free_page(page);
+					__free_pages_bootmem(page, 0);
 				}
 			}
+
 		} else {
-			i+=BITS_PER_LONG;
+			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		}
 	}
 	total += count;
 
 	/*
-	 * Now free the allocator bitmap itself, it's not
-	 * needed anymore:
+	 * Now free the allocator bitmap itself, it's not needed anymore:
 	 */
 	page = virt_to_page(bdata->node_bootmem_map);
-	count = 0;
-	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
-		count++;
+
+	count = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	count = ((count / 8) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	for (i = count; i > 0; i--) {
 		__ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
+		__free_pages_bootmem(page, 0);
+		page++;
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/mm/hugetlb.c linux-2.6.10-rc2-mm3-frv/mm/hugetlb.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/hugetlb.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/mm/hugetlb.c	2004-11-22 12:03:36.121111676 +0000
@@ -52,7 +52,7 @@
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_NOWARN,
 				HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	if (page) {
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/mm/internal.h linux-2.6.10-rc2-mm3-frv/mm/internal.h
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/internal.h	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/mm/internal.h	2004-11-22 11:45:10.034230247 +0000
@@ -10,4 +10,5 @@
  */
 
 /* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+extern void fastcall free_hot_cold_page(struct page *page, int cold);
+extern fastcall void __init __free_pages_bootmem(struct page *page, unsigned int order);
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/mm/page_alloc.c linux-2.6.10-rc2-mm3-frv/mm/page_alloc.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/page_alloc.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/mm/page_alloc.c	2004-11-22 12:02:01.371958426 +0000
@@ -80,15 +80,47 @@
 	return 0;
 }
 
-static void bad_page(const char *function, struct page *page)
+static inline void __bad_page(struct page *page)
 {
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
+	printk(KERN_EMERG
+	       "[%p] %0*lx %p %8d %8d %8lx %8lx\n",
+	       page,
+	       (int)(2 * sizeof(page_flags_t)),
+	       (unsigned long) page->flags,
+	       page->mapping, page_mapcount(page), page_count(page),
+	       page->index, page->private);
+}
+
+static void bad_page(const char *function, struct page *page,
+		     struct page *page0, int order)
+{
+	printk(KERN_EMERG "\n");
+	printk(KERN_EMERG
+	       "Bad page state at %s (in process '%s', order %d)\n",
+	       function, current->comm, order);
+
+	printk(KERN_EMERG
+	       "STRUCTPAGE FLAGS MAPPING MAPCOUNT COUNT INDEX PRIVATE\n");
+	printk(KERN_EMERG
+	       "========== ======== ======== ======== ======== ======== ========\n");
+
+	/* print extra details on a compound page */
+	if (PageCompound(page0)) {
+		__bad_page(page0);
+		__bad_page(page0 + 1);
+
+		if (page > page0 + 1) {
+			if (page > page0 + 2)
+				printk(KERN_EMERG "...\n");
+			__bad_page(page);
+		}
+	} else {
+		__bad_page(page);
+	}
+
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
+
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_private	|
 			1 << PG_locked	|
@@ -103,75 +135,18 @@
 	tainted |= TAINT_BAD_PAGE;
 }
 
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
-/*
- * Higher-order pages are called "compound pages".  They are structured thusly:
- *
- * The first PAGE_SIZE page is called the "head page".
- *
- * The remaining PAGE_SIZE pages are called "tail pages".
- *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
- *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
- */
-static void prep_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	page[1].mapping = NULL;
-	page[1].index = order;
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		SetPageCompound(p);
-		p->private = (unsigned long)page;
-	}
-}
-
-static void destroy_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
-
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
-		ClearPageCompound(p);
-	}
-}
-#endif		/* CONFIG_HUGETLB_PAGE */
-
 /*
  * function for dealing with page's order in buddy system.
  * zone->lock is already acquired when we use these.
  * So, we don't need atomic page->flags operations here.
  */
-static inline unsigned long page_order(struct page *page) {
+static inline unsigned long page_order(struct page *page)
+{
 	return page->private;
 }
 
-static inline void set_page_order(struct page *page, int order) {
+static inline void set_page_order(struct page *page, int order)
+{
 	page->private = order;
 	__SetPagePrivate(page);
 }
@@ -182,6 +157,11 @@
 	page->private = 0;
 }
 
+static inline void set_page_refs(struct page *page, int order)
+{
+	set_page_count(page, 1);
+}
+
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
@@ -202,6 +182,93 @@
 }
 
 /*
+ * validate a page that's being handed back for recycling
+ */
+static
+void free_pages_check_compound(const char *function, struct page *page, int order)
+{
+	struct page *xpage;
+	int i;
+
+	xpage = page;
+
+	if (unlikely(order == 0 ||
+		     PageCompoundSlave(page)
+		     ))
+		goto badpage;
+
+	xpage++;
+	if (unlikely(xpage->index != order))
+		goto badpage;
+
+	for (i = (1 << order) - 1; i > 0; i--) {
+		if (unlikely(!PageCompound(xpage) ||
+			     !PageCompoundSlave(xpage) ||
+			     (xpage->flags & (
+				     1 << PG_lru |
+				     1 << PG_private |
+				     1 << PG_locked |
+				     1 << PG_active |
+				     1 << PG_reclaim |
+				     1 << PG_slab |
+				     1 << PG_swapcache |
+				     1 << PG_writeback
+				     )) ||
+			     page_count(xpage) != 0 ||
+			     page_mapped(xpage) ||
+			     xpage->mapping != NULL ||
+			     xpage->private != (unsigned long) page
+			     ))
+			goto badpage;
+
+		if (PageDirty(xpage))
+			ClearPageDirty(xpage);
+		xpage++;
+	}
+
+	return;
+
+ badpage:
+	bad_page(function, xpage, page, order);
+	return;
+}
+
+static inline
+void free_pages_check(const char *function, struct page *page, int order)
+{
+	if (unlikely(
+		    page_mapped(page) ||
+		    page->mapping != NULL ||
+		    page_count(page) != 0 ||
+		    (page->flags & (
+			    1 << PG_lru |
+			    1 << PG_private |
+			    1 << PG_locked |
+			    1 << PG_active |
+			    1 << PG_reclaim |
+			    1 << PG_slab |
+			    1 << PG_swapcache |
+			    1 << PG_writeback ))
+		    ))
+		goto badpage;
+
+	/* check that compound pages are correctly assembled */
+	if (unlikely(PageCompound(page)))
+		free_pages_check_compound(function, page, order);
+	else if (unlikely(order > 0))
+		goto badpage;
+
+	if (PageDirty(page))
+		ClearPageDirty(page);
+
+	return;
+
+ badpage:
+	bad_page(function, page, page, order);
+	return;
+}
+
+/*
  * Freeing function for a buddy system allocator.
  *
  * The concept of a buddy system is to maintain direct-mapped table
@@ -217,10 +284,10 @@
  * free pages of length of (1 << order) and marked with PG_Private.Page's
  * order is recorded in page->private field.
  * So when we are allocating or freeing one, we can derive the state of the
- * other.  That is, if we allocate a small block, and both were 
- * free, the remainder of the region must be split into blocks. 
+ * other.  That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
  * If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size. 
+ * triggers coalescing into a block of larger size.
  *
  * -- wli
  */
@@ -232,8 +299,14 @@
 	struct page *coalesced;
 	int order_size = 1 << order;
 
-	if (unlikely(order))
-		destroy_compound_page(page, order);
+	if (unlikely(PageCompound(page))) {
+		struct page *xpage = page;
+		int i;
+
+		for (i = (1 << order); i > 0; i--)
+			(xpage++)->flags &=
+				~(1 << PG_compound | 1 << PG_compound_slave);
+	}
 
 	page_idx = page - base;
 
@@ -266,27 +339,8 @@
 	zone->free_area[order].nr_free++;
 }
 
-static inline void free_pages_check(const char *function, struct page *page)
-{
-	if (	page_mapped(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
-		(page->flags & (
-			1 << PG_lru |
-			1 << PG_private |
-			1 << PG_locked |
-			1 << PG_active |
-			1 << PG_reclaim |
-			1 << PG_slab |
-			1 << PG_swapcache |
-			1 << PG_writeback )))
-		bad_page(function, page);
-	if (PageDirty(page))
-		ClearPageDirty(page);
-}
-
 /*
- * Frees a list of pages. 
+ * Frees a list of pages.
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
  *
@@ -322,25 +376,40 @@
 void __free_pages_ok(struct page *page, unsigned int order)
 {
 	LIST_HEAD(list);
-	int i;
 
 	arch_free_page(page, order);
 
 	mod_page_state(pgfree, 1 << order);
 
-#ifndef CONFIG_MMU
-	if (order > 0)
-		for (i = 1 ; i < (1 << order) ; ++i)
-			__put_page(page + i);
-#endif
-
-	for (i = 0 ; i < (1 << order) ; ++i)
-		free_pages_check(__FUNCTION__, page + i);
+	free_pages_check(__FUNCTION__, page, order);
 	list_add(&page->lru, &list);
-	kernel_map_pages(page, 1<lru, &list);
+	kernel_map_pages(page, 1 << order, 0);
+	free_pages_bulk(page_zone(page), 1, &list, order);
+	}
+}
+
 /*
  * The order of subdivision here is critical for the IO subsystem.
@@ -374,30 +443,16 @@
 	return page;
 }
 
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
 /*
  * This page is about to be returned from the page allocator
  */
 static void prep_new_page(struct page *page, int order)
 {
+	page_flags_t pgflags = page->flags;
+
+	/* check the struct page hasn't become corrupted */
 	if (page->mapping || page_mapped(page) ||
-	    (page->flags & (
+	    (pgflags & (
 			1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_lru	|
@@ -405,17 +460,43 @@
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
 			1 << PG_swapcache |
-			1 << PG_writeback )))
-		bad_page(__FUNCTION__, page);
+			1 << PG_writeback |
+			1 << PG_compound |
+			1 << PG_compound_slave)))
+		bad_page(__FUNCTION__, page, page, order);
+
+	pgflags &= ~(1 << PG_uptodate | 1 << PG_error |
+		     1 << PG_referenced | 1 << PG_arch_1 |
+		     1 << PG_checked | 1 << PG_mappedtodisk);
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
-			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked | 1 << PG_mappedtodisk);
 	page->private = 0;
+
+	/* set the refcount on the page */
 	set_page_refs(page, order);
+
+	/* mark all pages as being compound and store high-order page metadata
+	 * on the second page */
+	if (order > 0) {
+		struct page *xpage;
+		int i;
+
+		pgflags |= 1 << PG_compound;
+
+		page[1].index = order;
+		page[1].mapping = NULL;	/* no destructor yet */
+
+		xpage = page + 1;
+		for (i = (1 << order) - 1; i > 0; i--) {
+			xpage->flags |= 1 << PG_compound | 1 << PG_compound_slave;
+			xpage->private = (unsigned long) page;
+			xpage++;
+		}
+	}
+
+	page->flags = pgflags;
 }
 
-/* 
+/*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
  */
@@ -441,19 +522,19 @@
 	return NULL;
 }
 
-/* 
+/*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
  * Returns the number of new pages which were placed at *list.
  */
-static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list)
 {
 	unsigned long flags;
 	int i;
 	int allocated = 0;
 	struct page *page;
-	
+
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
 		page = __rmqueue(zone, order);
@@ -517,9 +598,9 @@
 {
 	unsigned long flags;
 
-	local_irq_save(flags);	
+	local_irq_save(flags);
 	__drain_pages(smp_processor_id());
-	local_irq_restore(flags);	
+	local_irq_restore(flags);
 }
 #endif /* CONFIG_PM */
 
@@ -552,8 +633,7 @@
 /*
  * Free a 0-order page
  */
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
@@ -565,7 +645,7 @@
 	inc_page_state(pgfree);
 	if (PageAnon(page))
 		page->mapping = NULL;
-	free_pages_check(__FUNCTION__, page);
+	free_pages_check(__FUNCTION__, page, 0);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
@@ -580,7 +660,7 @@
 {
 	free_hot_cold_page(page, 0);
 }
-	
+
 void fastcall free_cold_page(struct page *page)
 {
 	free_hot_cold_page(page, 1);
@@ -717,8 +797,6 @@
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
 	}
 	return page;
 }
@@ -957,14 +1035,6 @@
 
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-	int i = pagevec_count(pvec);
-
-	while (--i >= 0)
-		free_hot_cold_page(pvec->pages[i], pvec->cold);
-}
-
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
 	if (!PageReserved(page) && put_page_testzero(page)) {
@@ -987,6 +1057,27 @@
 
 EXPORT_SYMBOL(free_pages);
 
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+fastcall void put_page(struct page *page)
+{
+	if (unlikely(PageCompound(page))) {
+		page = (struct page *) page->private;
+		if (put_page_testzero(page)) {
+			page_dtor_t dtor;
+
+			dtor = (page_dtor_t) page[1].mapping;
+			(*dtor)(page);
+		}
+		return;
+	}
+
+	if (likely(!PageReserved(page)) && put_page_testzero(page))
+		__page_cache_release(page);
+}
+
+EXPORT_SYMBOL(put_page);
+#endif
+
 /*
  * Total amount of free (allocatable) RAM:
  */
@@ -1498,7 +1589,7 @@
 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
 	for (node = 0; node < local_node; node++)
 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
- 
+
 	zonelist->zones[j] = NULL;
 	}
 }
@@ -1636,7 +1727,7 @@
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
-	
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
@@ -1798,7 +1889,7 @@
 {
 }
 
-/* 
+/*
  * This walks the free areas for each zone.
  */
 static int frag_show(struct seq_file *m, void *arg)
@@ -2038,8 +2129,8 @@
 }
 
 /*
- * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
- *	that the pages_{min,low,high} values for each zone are set correctly 
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
+ *	that the pages_{min,low,high} values for each zone are set correctly
 *	with respect to min_free_kbytes.
  */
 static void setup_per_zone_pages_min(void)
@@ -2073,10 +2164,10 @@
 			min_pages = 128;
 		zone->pages_min = min_pages;
 	} else {
-		/* if it's a lowmem zone, reserve a number of pages 
+		/* if it's a lowmem zone, reserve a number of pages
 		 * proportionate to the zone's size.
 		 */
-		zone->pages_min = (pages_min * zone->present_pages) / 
+		zone->pages_min = (pages_min * zone->present_pages) /
 				lowmem_pages;
 	}
 
@@ -2132,11 +2223,11 @@
 module_init(init_per_zone_pages_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
diff -uNr /warthog/kernels/linux-2.6.10-rc2-mm3/mm/swap.c linux-2.6.10-rc2-mm3-frv/mm/swap.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/swap.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-frv/mm/swap.c	2004-11-22 11:45:10.047229164 +0000
@@ -30,30 +30,11 @@
 #include
 #include
 #include
+#include "internal.h"
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-#ifdef CONFIG_HUGETLB_PAGE
-
-void put_page(struct page *page)
-{
-	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
-		if (put_page_testzero(page)) {
-			void (*dtor)(struct page *page);
-
-			dtor = (void (*)(struct page *))page[1].mapping;
-			(*dtor)(page);
-		}
-		return;
-	}
-	if (!PageReserved(page) && put_page_testzero(page))
-		__page_cache_release(page);
-}
-EXPORT_SYMBOL(put_page);
-#endif
-
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
@@ -242,6 +223,14 @@
 	pagevec_free(&pages_to_free);
 }
 
+void __pagevec_free(struct pagevec *pvec)
+{
+	int i = pagevec_count(pvec);
+
+	while (--i >= 0)
+		free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
 /*
  * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/