2008-10-23 14:19:47

by Andy Whitcroft

Subject: [PATCH 0/2] Fixes for gigantic compound pages V3

[This update includes the cleanups akpm applied, and moves us to a common
form throughout where the gigantic page forms of prep/copy/clear are only
used when the original page size is gigantic. This should keep the
overall cost for the non-gigantic cases to a minimum. It also brings the
two patches together into one series for easier tracking.]

Stress testing of the gigantic pages in 2.6.27 threw up some more
places where the hugepage support assumes the mem_map is contiguous.
The buddy allocator does not guarantee that the memory map is contiguous,
and in some memory models it is not; notably with SPARSEMEM without
VMEMMAP enabled.

These patches have been round a couple of times, and I think most of the
objections and comments have been addressed. The one outstanding question
was why we needed to fix this at all, as people could simply use VMEMMAP.
What it comes down to is that there are legitimate combinations of
features, such as memory hot remove, which require SPARSEMEM without
VMEMMAP. With that combination enabled, any use of gigantic pages will
walk off the end of the mem_map segments and read random memory, with all
the associated risks.
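
For illustration only (the real helper is mem_map_offset() in patch 1/2;
gigantic_subpage() is just a made-up name for this sketch), the problem
reduces to the difference between plain pointer arithmetic, which is only
valid while the backing mem_map is contiguous, and a pfn based lookup,
which is always valid:

	/*
	 * Illustrative sketch: find the struct page for subpage 'offset'
	 * of the gigantic page whose head page is 'head'.  "head + offset"
	 * only works within a maximally aligned MAX_ORDER_NR_PAGES area;
	 * past that the mem_map may be discontiguous, so we must go back
	 * via the pfn.
	 */
	static inline struct page *gigantic_subpage(struct page *head, int offset)
	{
		if (offset >= MAX_ORDER_NR_PAGES)
			return pfn_to_page(page_to_pfn(head) + offset);
		return head + offset;
	}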

This patch set introduces new iterators for the mem_map which know how
to follow it across discontiguities. It then uses those to provide gigantic
versions of copy_huge_page, clear_huge_page, and prep_compound_page.
It also effectively backs out the earlier changes to prep_compound_page,
removing any potential performance issues there.
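
To give a feel for those iterators, the gigantic copy/clear variants in
patch 1/2 use a loop of roughly the following shape (walk_gigantic_page()
and do_something() are placeholder names for this sketch; the real callers
are clear_gigantic_page() and copy_gigantic_page(), with do_something()
standing in for clear_user_highpage()/copy_user_highpage()):

	/*
	 * Illustrative only: walk every subpage of a huge page of
	 * 'nr_subpages' base pages headed by 'head', without assuming a
	 * contiguous mem_map.
	 */
	static void walk_gigantic_page(struct page *head, unsigned long addr,
				       unsigned long nr_subpages)
	{
		int i;
		struct page *p = head;

		for (i = 0; i < nr_subpages; ) {
			do_something(p, addr + i * PAGE_SIZE);

			i++;
			/*
			 * mem_map_next() (patch 1/2) re-derives the pointer
			 * from the pfn whenever we cross a MAX_ORDER_NR_PAGES
			 * boundary, where the mem_map may be discontiguous.
			 */
			p = mem_map_next(p, head, i);
		}
	}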

Please consider these patches for -mm. It is arguable that they are also
stable candidates for 2.6.27 once they have had some run time.

Thanks to Jon Tollefson for his help testing previous versions of these
patches.

-apw

Andy Whitcroft (2):
hugetlbfs: handle pages higher order than MAX_ORDER
hugetlb: pull gigantic page initialisation out of the default path

mm/hugetlb.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
mm/internal.h | 29 +++++++++++++++++++++++++++++
mm/page_alloc.c | 28 +++++++++++++++++++++-------
3 files changed, 97 insertions(+), 9 deletions(-)


2008-10-23 14:19:32

by Andy Whitcroft

Subject: [PATCH 1/2] hugetlbfs: handle pages higher order than MAX_ORDER

When working with hugepages, hugetlbfs assumes that those hugepages
are smaller than MAX_ORDER. Specifically it assumes that the mem_map
is contiguous and uses that to optimise access to the elements of the
mem_map that represent the hugepage. Gigantic pages (such as 16GB pages
on powerpc) by definition are of greater order than MAX_ORDER (larger
than MAX_ORDER_NR_PAGES in size). This means that we can no longer make
use of the buddy allocator guarantees for the contiguity of the mem_map,
which only ensure that the mem_map is contiguous within maximally
aligned areas of MAX_ORDER_NR_PAGES pages.

This patch adds new mem_map accessors and iterator helpers which handle
any discontiguity at MAX_ORDER_NR_PAGES boundaries. It then uses these
to implement gigantic page versions of copy_huge_page and clear_huge_page,
and to allow follow_hugetlb_page to handle gigantic pages.

Signed-off-by: Andy Whitcroft <[email protected]>
---
mm/hugetlb.c | 37 ++++++++++++++++++++++++++++++++++++-
mm/internal.h | 28 ++++++++++++++++++++++++++++
2 files changed, 64 insertions(+), 1 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67a7119..793f52e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -353,11 +353,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
return 0;
}

+static void clear_gigantic_page(struct page *page,
+ unsigned long addr, unsigned long sz)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
static void clear_huge_page(struct page *page,
unsigned long addr, unsigned long sz)
{
int i;

+ if (unlikely(sz > MAX_ORDER_NR_PAGES))
+ return clear_gigantic_page(page, addr, sz);
+
might_sleep();
for (i = 0; i < sz/PAGE_SIZE; i++) {
cond_resched();
@@ -365,12 +380,32 @@ static void clear_huge_page(struct page *page,
}
}

+static void copy_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma)
+{
+ int i;
+ struct hstate *h = hstate_vma(vma);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
static void copy_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);

+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+ return copy_gigantic_page(dst, src, addr, vma);
+
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
cond_resched();
@@ -2103,7 +2138,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
get_page(page);
- pages[i] = page + pfn_offset;
+ pages[i] = mem_map_offset(page, pfn_offset);
}

if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f74..08b8dea 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -53,6 +53,34 @@ static inline unsigned long page_order(struct page *page)
}

/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'. Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+ if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+ return pfn_to_page(page_to_pfn(base) + offset);
+ return base + offset;
+}
+
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'. Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+ struct page *base, int offset)
+{
+ if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+ unsigned long pfn = page_to_pfn(base) + offset;
+ if (!pfn_valid(pfn))
+ return NULL;
+ return pfn_to_page(pfn);
+ }
+ return iter + 1;
+}
+
+/*
* FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
* so all functions starting at paging_init should be marked __init
* in those cases. SPARSEMEM, however, allows for memory hotplug,
--
1.6.0.2.711.gf1ba4

2008-10-23 14:20:01

by Andy Whitcroft

Subject: [PATCH 2/2] hugetlb: pull gigantic page initialisation out of the default path

As we can determine exactly when a gigantic page is in use, we can optimise
the common, regular page cases by pulling gigantic page initialisation
out into its own function. As gigantic pages are never released to the
buddy allocator we do not need a destructor for them. This effectively
reverts the previous change to the main buddy allocator. It also adds a
paranoid check to ensure we never release gigantic pages from hugetlbfs
back to the main buddy allocator.

Signed-off-by: Andy Whitcroft <[email protected]>
---
mm/hugetlb.c | 12 +++++++++++-
mm/internal.h | 1 +
mm/page_alloc.c | 28 +++++++++++++++++++++-------
3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 793f52e..77427c8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -490,6 +490,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;

+ VM_BUG_ON(h->order >= MAX_ORDER);
+
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -1004,6 +1006,14 @@ found:
return 1;
}

+static void prep_compound_huge_page(struct page *page, int order)
+{
+ if (unlikely(order > (MAX_ORDER - 1)))
+ prep_compound_gigantic_page(page, order);
+ else
+ prep_compound_page(page, order);
+}
+
/* Put bootmem huge pages into the standard lists after mem_map is up */
static void __init gather_bootmem_prealloc(void)
{
@@ -1014,7 +1024,7 @@ static void __init gather_bootmem_prealloc(void)
struct hstate *h = m->hstate;
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
- prep_compound_page(page, h->order);
+ prep_compound_huge_page(page, h->order);
prep_new_huge_page(h, page, page_to_nid(page));
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 08b8dea..92729ea 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);

extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);

static inline void set_page_count(struct page *page, int v)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27b8681..b40d9b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -268,24 +268,39 @@ void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+
+ set_compound_page_dtor(page, free_compound_page);
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
+ struct page *p = page + i;
+
+ __SetPageTail(p);
+ p->first_page = page;
+ }
+}
+
+#ifdef CONFIG_HUGETLBFS
+void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
struct page *p = page + 1;

set_compound_page_dtor(page, free_compound_page);
set_compound_order(page, order);
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++, p++) {
- if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
- p = pfn_to_page(page_to_pfn(page) + i);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
p->first_page = page;
}
}
+#endif

static void destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- struct page *p = page + 1;

if (unlikely(compound_order(page) != order))
bad_page(page);
@@ -293,9 +308,8 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (unlikely(!PageHead(page)))
bad_page(page);
__ClearPageHead(page);
- for (i = 1; i < nr_pages; i++, p++) {
- if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
- p = pfn_to_page(page_to_pfn(page) + i);
+ for (i = 1; i < nr_pages; i++) {
+ struct page *p = page + i;

if (unlikely(!PageTail(p) |
(p->first_page != page)))
--
1.6.0.2.711.gf1ba4