2008-03-26 21:20:44

by Jon Tollefson

Subject: [PATCH 0/4] 16G huge page support for powerpc


This patch set builds on Andi Kleen's patches for GB pages for hugetlb
posted on March 16th. This set adds support for 16G huge pages on
ppc64. Supporting multiple huge page sizes on ppc64 as defined in
Andi's patches is not a part of this set; that will be included in a
future patch.

The first patch here adds an arch callback, since the 16G pages are not
allocated from bootmem; they have to be reserved prior to boot time, and
their locations are indicated in the device tree.

Support for 16G pages requires a POWER5+ or later machine and a little
bit of memory.

Jon


2008-03-26 21:26:19

by Jon Tollefson

Subject: [PATCH 1/4] allow arch specific function for allocating gigantic pages

Allow alloc_bm_huge_page() to be overridden by architectures that can't always use bootmem.
This requires huge_boot_pages to be available for use by this function. huge_page_size()
and related functions also need to use an unsigned long so that they can handle the 16G page size.
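
For reference, the override works through a weak symbol: the generic
definition in mm/hugetlb.c is marked __attribute__((weak)), so an
architecture that links in a non-weak alloc_bm_huge_page() with the same
prototype replaces it. A minimal stand-alone sketch of that pattern
(illustration only, hypothetical names, not kernel code):

	/* generic.c - weak default, used when no arch override exists */
	__attribute__((weak)) int arch_hook(void)
	{
		return 0;	/* generic fallback behaviour */
	}

	/* arch.c - a strong definition with the same prototype wins at
	 * link time; callers keep calling arch_hook() unchanged */
	int arch_hook(void)
	{
		return 1;	/* arch-specific behaviour */
	}

The weak default keeps allocating the boot-time pages from bootmem as
before; patch 2 supplies the powerpc replacement that hands out the
pre-reserved 16G blocks instead.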


Signed-off-by: Jon Tollefson <[email protected]>
---

include/linux/hugetlb.h | 10 +++++++++-
mm/hugetlb.c | 21 +++++++++------------
2 files changed, 18 insertions(+), 13 deletions(-)


diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a8de3c1..35a41be 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
extern int sysctl_hugetlb_shm_group;
+extern struct list_head huge_boot_pages;

/* arch callbacks */

@@ -219,9 +220,15 @@ struct hstate {
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
unsigned long parsed_hugepages;
};
+struct huge_bm_page {
+ struct list_head list;
+ struct hstate *hstate;
+};

void __init huge_add_hstate(unsigned order);
struct hstate *huge_lookup_hstate(unsigned long pagesize);
+/* arch callback */
+int alloc_bm_huge_page(struct hstate *h);

#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
@@ -248,7 +255,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
return HUGETLBFS_I(i)->hstate;
}

-static inline unsigned huge_page_size(struct hstate *h)
+static inline unsigned long huge_page_size(struct hstate *h)
{
return PAGE_SIZE << h->order;
}
@@ -273,6 +280,7 @@ extern unsigned long sysctl_overcommit_huge_pages[HUGE_MAX_HSTATE];

#else
struct hstate {};
+#define alloc_bm_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
#define hstate_inode(i) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c28b8b6..a0017b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@ unsigned long max_huge_pages[HUGE_MAX_HSTATE];
unsigned long sysctl_overcommit_huge_pages[HUGE_MAX_HSTATE];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
+struct list_head huge_boot_pages;

static int max_hstate = 1;

@@ -43,7 +44,8 @@ struct hstate *parsed_hstate __initdata = &global_hstate;
*/
static DEFINE_SPINLOCK(hugetlb_lock);

-static void clear_huge_page(struct page *page, unsigned long addr, unsigned sz)
+static void clear_huge_page(struct page *page, unsigned long addr,
+ unsigned long sz)
{
int i;

@@ -521,14 +523,8 @@ static __init char *memfmt(char *buf, unsigned long n)
return buf;
}

-static __initdata LIST_HEAD(huge_boot_pages);
-
-struct huge_bm_page {
- struct list_head list;
- struct hstate *hstate;
-};
-
-static int __init alloc_bm_huge_page(struct hstate *h)
+/* Can be overridden by architectures */
+__attribute__((weak)) int alloc_bm_huge_page(struct hstate *h)
{
struct huge_bm_page *m;
m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid),
@@ -614,6 +610,7 @@ static int __init hugetlb_init(void)
{
if (HPAGE_SHIFT == 0)
return 0;
+ INIT_LIST_HEAD(&huge_boot_pages);
return hugetlb_init_hstate(&global_hstate);
}
module_init(hugetlb_init);
@@ -866,7 +863,7 @@ int hugetlb_report_meminfo(char *buf)
n += dump_field(buf + n, offsetof(struct hstate, surplus_huge_pages));
n += sprintf(buf + n, "Hugepagesize: ");
for_each_hstate (h)
- n += sprintf(buf + n, " %5u", huge_page_size(h) / 1024);
+ n += sprintf(buf + n, " %5lu", huge_page_size(h) / 1024);
n += sprintf(buf + n, " kB\n");
return n;
}
@@ -947,7 +944,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long addr;
int cow;
struct hstate *h = hstate_vma(vma);
- unsigned sz = huge_page_size(h);
+ unsigned long sz = huge_page_size(h);

cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

@@ -992,7 +989,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
struct page *page;
struct page *tmp;
struct hstate *h = hstate_vma(vma);
- unsigned sz = huge_page_size(h);
+ unsigned long sz = huge_page_size(h);

/*
* A page gathering list, protected by per file i_mmap_lock. The



2008-03-26 21:26:36

by Jon Tollefson

Subject: [PATCH 2/4] powerpc: function for allocating gigantic pages

The 16G page locations are saved in an array during early boot. The
alloc_bm_huge_page() function takes a page from this array and adds it to the huge_boot_pages list.
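
Roughly how the pieces fit together across the series (a sketch; the
exact call sites are in patch 1 and patch 3):

	/*
	 * Early boot, while the flat device tree is being scanned:
	 *
	 *   htab_init_page_sizes()                          [patch 3]
	 *     of_scan_flat_dt(htab_dt_scan_hugepage_blocks)
	 *       add_gpage()      - records __va(addr) of each reserved
	 *                          16G block in gpage_freearray[]
	 *
	 * Later, during generic hugetlb boot-time setup:
	 *
	 *   alloc_bm_huge_page(h)  [this patch]
	 *       pops one entry off gpage_freearray[], puts it on
	 *       huge_boot_pages and tags it with the hstate; returns 0
	 *       once the array is empty so the caller stops asking.
	 */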


Signed-off-by: Jon Tollefson <[email protected]>
---


hugetlbpage.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 94625db..31d977b 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -29,6 +29,10 @@

#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define MAX_NUMBER_GPAGES 1024
+
+static void *gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;

unsigned int hugepte_shift;
#define PTRS_PER_HUGEPTE (1 << hugepte_shift)
@@ -104,6 +108,21 @@ pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
}
#endif

+/* Put 16G page address into temporary huge page list because the mem_map
+ * is not up yet.
+ */
+int alloc_bm_huge_page(struct hstate *h)
+{
+ struct huge_bm_page *m;
+ if (nr_gpages == 0)
+ return 0;
+ m = gpage_freearray[--nr_gpages];
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = h;
+ return 1;
+}
+
+
/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{


2008-03-26 21:27:54

by Jon Tollefson

Subject: [PATCH 3/4] powerpc: scan device tree and save gigantic page locations

The 16G huge pages have to be reserved in the HMC prior to boot. The locations of
the pages are recorded in the device tree, and during very early boot these locations are
saved for use by hugetlbfs.
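
As a worked example of the arithmetic in the scan below (property values
are hypothetical): a "memory" node with ibm,expected#pages = <2> and a
reg size of exactly 16G gives expected_pages = 1 << 2 = 4, so
lmb_reserve() reserves 4 * 16G = 64G starting at the reg address, and
add_gpage() records four gigantic page addresses spaced 16G apart.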

Signed-off-by: Jon Tollefson <[email protected]>
---

arch/powerpc/mm/hash_utils_64.c | 41 ++++++++++++++++++++++++++++++++++++++-
arch/powerpc/mm/hugetlbpage.c | 17 ++++++++++++++++
include/asm-powerpc/mmu-hash64.h | 2 +
3 files changed, 59 insertions(+), 1 deletion(-)


diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a83dfa3..d3f7d92 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -67,6 +67,7 @@

#define KB (1024)
#define MB (1024*KB)
+#define GB (1024L*MB)

/*
* Note: pte --> Linux PTE
@@ -302,6 +303,41 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
return 0;
}

+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+ const char *uname, int depth,
+ void *data) {
+ char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ unsigned long *lprop;
+ u32 *prop;
+
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /* This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block. */
+ prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+ if (prop == NULL)
+ return 0;
+ unsigned int expected_pages = (1 << prop[0]);
+ lprop = of_get_flat_dt_prop(node, "reg", NULL);
+ if (lprop == NULL)
+ return 0;
+ long unsigned int phys_addr = lprop[0];
+ long unsigned int block_size = lprop[1];
+ if (block_size != (16 * GB))
+ return 0;
+ printk(KERN_INFO "Reserving huge page memory "
+ "addr = 0x%lX size = 0x%lX pages = %d\n",
+ phys_addr, block_size, expected_pages);
+ lmb_reserve(phys_addr, block_size * expected_pages);
+ add_gpage(phys_addr, block_size, expected_pages);
+ return 0;
+}
+
static void __init htab_init_page_sizes(void)
{
int rc;
@@ -370,7 +406,10 @@ static void __init htab_init_page_sizes(void)
mmu_psize_defs[mmu_io_psize].shift);

#ifdef CONFIG_HUGETLB_PAGE
- /* Init large page size. Currently, we pick 16M or 1M depending
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+
+/* Init large page size. Currently, we pick 16M or 1M depending
* on what is available
*/
if (mmu_psize_defs[MMU_PAGE_16M].shift)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 31d977b..44d3d55 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -108,6 +108,23 @@ pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
}
#endif

+/* Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy allocator is setup.
+ */
+void add_gpage(unsigned long addr, unsigned long page_size,
+ unsigned long number_of_pages)
+{
+ if (addr) {
+ while (number_of_pages > 0) {
+ gpage_freearray[nr_gpages] = __va(addr);
+ nr_gpages++;
+ number_of_pages--;
+ addr += page_size;
+ }
+ }
+}
+
+
/* Put 16G page address into temporary huge page list because the mem_map
* is not up yet.
*/
diff --git a/include/asm-powerpc/mmu-hash64.h b/include/asm-powerpc/mmu-hash64.h
index 2864fa3..db1276a 100644
--- a/include/asm-powerpc/mmu-hash64.h
+++ b/include/asm-powerpc/mmu-hash64.h
@@ -279,6 +279,8 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
unsigned long pstart, unsigned long mode,
int psize, int ssize);
extern void set_huge_psize(int psize);
+extern void add_gpage(unsigned long addr, unsigned long page_size,
+ unsigned long number_of_pages);
extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);

extern void htab_initialize(void);



2008-03-26 21:29:41

by Jon Tollefson

Subject: [PATCH 4/4] powerpc: define page support for 16G pages

The huge page size is set up for 16G pages if that size is specified at boot time. Support for
multiple huge page sizes is not being used yet; that will come in a future patch.
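
For reference, 2^34 bytes = 16G, hence HPAGE_SHIFT_16G = 34 below. The
size check in set_huge_psize() now compares against the 1T segment shift
rather than the 256M one, since a 16G page cannot fit inside a 256M
segment, and with a 4K base page size the 16G huge page table entries
are anchored at the PGD level (hugepte_shift = PGDIR_SHIFT - HPAGE_SHIFT)
instead of the PUD level used for 16M pages. Selecting the size at boot
would then look something like hugepagesz=16G on the kernel command line
(assuming the existing hugepagesz= early parameter that
hugepage_setup_sz() handles).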


Signed-off-by: Jon Tollefson <[email protected]>
---

hugetlbpage.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)


diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 44d3d55..b6a02b7 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -26,6 +26,7 @@

#define HPAGE_SHIFT_64K 16
#define HPAGE_SHIFT_16M 24
+#define HPAGE_SHIFT_16G 34

#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
@@ -589,9 +590,11 @@ void set_huge_psize(int psize)
{
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable limits. */
- if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
+ if (mmu_psize_defs[psize].shift &&
+ mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
- mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
+ mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K ||
+ mmu_psize_defs[psize].shift == HPAGE_SHIFT_16G)) {
HPAGE_SHIFT = mmu_psize_defs[psize].shift;
mmu_huge_psize = psize;
#ifdef CONFIG_PPC_64K_PAGES
@@ -599,6 +602,8 @@ void set_huge_psize(int psize)
#else
if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
+ else if (HPAGE_SHIFT == HPAGE_SHIFT_16G)
+ hugepte_shift = (PGDIR_SHIFT-HPAGE_SHIFT);
else
hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
#endif
@@ -625,6 +630,9 @@ static int __init hugepage_setup_sz(char *str)
case HPAGE_SHIFT_16M:
mmu_psize = MMU_PAGE_16M;
break;
+ case HPAGE_SHIFT_16G:
+ mmu_psize = MMU_PAGE_16G;
+ break;
}

if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)


2008-03-26 21:44:33

by Andi Kleen

Subject: Re: [PATCH 0/4] 16G huge page support for powerpc


FWIW I turned over the hugepages patchkit to Nick Piggin. So send
all future patches to him, please.

-Andi

2008-03-26 21:46:31

by Andi Kleen

Subject: Re: [PATCH 1/4] allow arch specific function for allocating gigantic pages


Haven't reviewed it in detail, just noticed something.

> @@ -614,6 +610,7 @@ static int __init hugetlb_init(void)
> {
> if (HPAGE_SHIFT == 0)
> return 0;
> + INIT_LIST_HEAD(&huge_boot_pages);
> return hugetlb_init_hstate(&global_hstate);

I don't think adding the INIT_LIST_HEAD here is correct. There can
be huge pages added by the __setup handlers before hugetlb_init runs.

-Andi