Implement HugeTLB for 32-bit BookE PowerPC. There is also some
infrastructure in place for 64-bit BookE as well from David Gibson;
I'm not sure if we want to include this at this time or not. I have
only been able to build test the 64-bit configurations. Ben, let
me know how you want to proceed here.
The bulk of this patch series is powerpc-specific, but there
are a couple of minor fixes to the generic fs and mm code included,
hence the cross-post.
Note that to use this fully some modifications to libhugetlbfs
are required; I expect to publish those shortly.
Diffstat below.
Cheers,
Becky
arch/powerpc/Kconfig | 3 +-
arch/powerpc/include/asm/hugetlb.h | 63 +++++-
arch/powerpc/include/asm/mmu-book3e.h | 7 +
arch/powerpc/include/asm/mmu-hash64.h | 3 +-
arch/powerpc/include/asm/mmu.h | 23 ++-
arch/powerpc/include/asm/page.h | 31 +++-
arch/powerpc/include/asm/page_64.h | 11 -
arch/powerpc/include/asm/pte-book3e.h | 3 +
arch/powerpc/kernel/head_fsl_booke.S | 133 ++++++++++--
arch/powerpc/kernel/smp.c | 4 +
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/hash_utils_64.c | 3 -
arch/powerpc/mm/hugetlbpage-book3e.c | 121 ++++++++++
arch/powerpc/mm/hugetlbpage.c | 379 ++++++++++++++++++++++++++++----
arch/powerpc/mm/init_32.c | 9 +
arch/powerpc/mm/mem.c | 17 ++-
arch/powerpc/mm/mmu_context_nohash.c | 5 +
arch/powerpc/mm/pgtable.c | 3 +-
arch/powerpc/mm/tlb_low_64e.S | 24 +-
arch/powerpc/mm/tlb_nohash.c | 52 +++++-
arch/powerpc/platforms/Kconfig.cputype | 4 +-
fs/hugetlbfs/inode.c | 2 +-
include/linux/hugetlb.h | 3 +
mm/hugetlb.c | 8 +-
24 files changed, 803 insertions(+), 109 deletions(-)
From: Becky Bruce <[email protected]>
This:
vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)
is incorrect on 32-bit. It causes us to & the pgoff with
something that looks like this (for a 4m hugepage): 0xfff003ff.
The mask should be flipped and *then* shifted, to give you
0x0000_03fff.
Signed-off-by: Becky Bruce <[email protected]>
---
fs/hugetlbfs/inode.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7aafeb8..537a209 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -94,7 +94,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
- if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
+ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
--
1.5.6.5
From: Becky Bruce <[email protected]>
This is needed on HIGHMEM systems - we don't always have a virtual
address so store the physical address and map it in as needed.
Signed-off-by: Becky Bruce <[email protected]>
---
include/linux/hugetlb.h | 3 +++
mm/hugetlb.c | 8 +++++++-
2 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 59225ef..19644e0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -231,6 +231,9 @@ struct hstate {
struct huge_bootmem_page {
struct list_head list;
struct hstate *hstate;
+#ifdef CONFIG_HIGHMEM
+ phys_addr_t phys;
+#endif
};
struct page *alloc_huge_page_node(struct hstate *h, int nid);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458..2db81ea 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1105,8 +1105,14 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+#ifdef CONFIG_HIGHMEM
+ struct page *page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ struct page *page = virt_to_page(m);
+#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
--
1.5.6.5
From: Becky Bruce <[email protected]>
This has been broken for a while but hasn't been an issue until
now because nobody was reserving regions at high addresses.
Signed-off-by: Becky Bruce <[email protected]>
---
arch/powerpc/mm/mem.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 57e545b..097b288 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -337,8 +337,9 @@ void __init mem_init(void)
highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
+ phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
- if (memblock_is_reserved(pfn << PAGE_SHIFT))
+ if (memblock_is_reserved(paddr))
continue;
ClearPageReserved(page);
init_page_count(page);
--
1.5.6.5
From: Becky Bruce <[email protected]>
This is used to round-robin TLBCAM entries.
Signed-off-by: Becky Bruce <[email protected]>
---
arch/powerpc/include/asm/mmu.h | 5 +++++
arch/powerpc/kernel/smp.c | 4 ++++
arch/powerpc/mm/mem.c | 9 +++++++++
arch/powerpc/mm/tlb_nohash.c | 6 ++++++
4 files changed, 24 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 4138b21..b427a55 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -115,6 +115,11 @@
#ifndef __ASSEMBLY__
#include <asm/cputable.h>
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#include <asm/percpu.h>
+DECLARE_PER_CPU(int, next_tlbcam_idx);
+#endif
+
static inline int mmu_has_feature(unsigned long feature)
{
return (cur_cpu_spec->mmu_features & feature);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2975f64..3c9681a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -313,6 +313,10 @@ struct thread_info *current_set[NR_CPUS];
static void __devinit smp_store_cpu_info(int id)
{
per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR);
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ per_cpu(next_tlbcam_idx, id)
+ = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
+#endif
}
void __init smp_prepare_cpus(unsigned int max_cpus)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 097b288..7209901 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -353,6 +353,15 @@ void __init mem_init(void)
}
#endif /* CONFIG_HIGHMEM */
+#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP)
+ /*
+ * If smp is enabled, next_tlbcam_idx is initialized in the cpu up
+ * functions.... do it here for the non-smp case.
+ */
+ per_cpu(next_tlbcam_idx, smp_processor_id()) =
+ (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
+#endif
+
printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
"%luk reserved, %luk data, %luk bss, %luk init)\n",
nr_free_pages() << (PAGE_SHIFT-10),
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 5693499..ea037ba 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -102,6 +102,12 @@ unsigned long linear_map_top; /* Top of linear mapping */
#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
/*
* Base TLB flushing operations:
*
--
1.5.6.5
From: Becky Bruce <[email protected]>
Enable hugepages on Freescale BookE processors. This allows the kernel to
use huge TLB entries to map pages, which can greatly reduce the number of
TLB misses and the amount of TLB thrashing experienced by applications with
large memory footprints. Care should be taken when using this on FSL
processors, as the number of large TLB entries supported by the core is low
(16-64) on current processors.
The supported set of hugepage sizes include 4m, 16m, 64m, 256m, and 1g.
Page sizes larger than the max zone size are called "gigantic" pages and
must be allocated on the command line (and cannot be deallocated).
This is currently only fully implemented for Freescale 32-bit BookE
processors, but there is some infrastructure in the code for
64-bit BooKE.
Signed-off-by: Becky Bruce <[email protected]>
Signed-off-by: David Gibson <[email protected]>
---
arch/powerpc/Kconfig | 3 +-
arch/powerpc/include/asm/hugetlb.h | 63 +++++-
arch/powerpc/include/asm/mmu-book3e.h | 7 +
arch/powerpc/include/asm/mmu-hash64.h | 3 +-
arch/powerpc/include/asm/mmu.h | 18 +-
arch/powerpc/include/asm/page.h | 31 +++-
arch/powerpc/include/asm/page_64.h | 11 -
arch/powerpc/include/asm/pte-book3e.h | 3 +
arch/powerpc/kernel/head_fsl_booke.S | 133 ++++++++++--
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/hash_utils_64.c | 3 -
arch/powerpc/mm/hugetlbpage-book3e.c | 121 ++++++++++
arch/powerpc/mm/hugetlbpage.c | 379 ++++++++++++++++++++++++++++----
arch/powerpc/mm/init_32.c | 9 +
arch/powerpc/mm/mem.c | 5 +
arch/powerpc/mm/mmu_context_nohash.c | 5 +
arch/powerpc/mm/pgtable.c | 3 +-
arch/powerpc/mm/tlb_low_64e.S | 24 +-
arch/powerpc/mm/tlb_nohash.c | 46 ++++-
arch/powerpc/platforms/Kconfig.cputype | 4 +-
20 files changed, 766 insertions(+), 106 deletions(-)
create mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2729c66..b7af257 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -426,8 +426,7 @@ config ARCH_POPULATES_NODE_MAP
def_bool y
config SYS_SUPPORTS_HUGETLBFS
- def_bool y
- depends on PPC_BOOK3S_64
+ bool
source "mm/Kconfig"
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 5856a66..8600493 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -1,15 +1,60 @@
#ifndef _ASM_POWERPC_HUGETLB_H
#define _ASM_POWERPC_HUGETLB_H
+#ifdef CONFIG_HUGETLB_PAGE
#include <asm/page.h>
+extern struct kmem_cache *hugepte_cache;
+extern void __init reserve_hugetlb_gpages(void);
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+ BUG_ON(!hugepd_ok(hpd));
+ return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | PD_HUGE);
+}
+
+static inline unsigned int hugepd_shift(hugepd_t hpd)
+{
+ return hpd.pd & HUGEPD_SHIFT_MASK;
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+ unsigned pdshift)
+{
+ /*
+ * On 32-bit, we have multiple higher-level table entries that point to
+ * the same hugepte. Just use the first one since they're all
+ * identical. So for that case, idx=0.
+ */
+ unsigned long idx = 0;
+
+ pte_t *dir = hugepd_page(*hpdp);
+#ifdef CONFIG_PPC64
+ idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
+#endif
+
+ return dir + idx;
+}
+
pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
unsigned long addr, unsigned *shift);
void flush_dcache_icache_hugepage(struct page *page);
+#if defined(CONFIG_PPC_MM_SLICES) || defined(CONFIG_PPC_SUBPAGE_PROT)
int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
unsigned long len);
+#else
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len)
+{
+ return 0;
+}
+#endif
+
+void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte);
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
@@ -50,8 +95,11 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
- return __pte(old);
+#ifdef CONFIG_PPC64
+ return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+#else
+ return __pte(pte_update(ptep, ~0UL, 0));
+#endif
}
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -93,4 +141,15 @@ static inline void arch_release_hugepage(struct page *page)
{
}
+#else /* ! CONFIG_HUGETLB_PAGE */
+static inline void reserve_hugetlb_gpages(void)
+{
+ pr_err("Cannot reserve gpages without hugetlb enabled\n");
+}
+static inline void flush_hugetlb_page(struct vm_area_struct *vma,
+ unsigned long vmaddr)
+{
+}
+#endif
+
#endif /* _ASM_POWERPC_HUGETLB_H */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 3ea0f9a..0260ea5 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -66,6 +66,7 @@
#define MAS2_M 0x00000004
#define MAS2_G 0x00000002
#define MAS2_E 0x00000001
+#define MAS2_WIMGE_MASK 0x0000001f
#define MAS2_EPN_MASK(size) (~0 << (size + 10))
#define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags))
@@ -80,6 +81,7 @@
#define MAS3_SW 0x00000004
#define MAS3_UR 0x00000002
#define MAS3_SR 0x00000001
+#define MAS3_BAP_MASK 0x0000003f
#define MAS3_SPSIZE 0x0000003e
#define MAS3_SPSIZE_SHIFT 1
@@ -212,6 +214,11 @@ typedef struct {
unsigned int id;
unsigned int active;
unsigned long vdso_base;
+#ifdef CONFIG_PPC_MM_SLICES
+ u64 low_slices_psize; /* SLB page size encodings */
+ u64 high_slices_psize; /* 4 bits per slice for now */
+ u16 user_psize; /* page size index */
+#endif
} mm_context_t;
/* Page size definitions, common between 32 and 64-bit
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index d865bd9..9169032 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -256,8 +256,7 @@ extern void hash_failure_debug(unsigned long ea, unsigned long access,
extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
unsigned long pstart, unsigned long prot,
int psize, int ssize);
-extern void add_gpage(unsigned long addr, unsigned long page_size,
- unsigned long number_of_pages);
+extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages);
extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
extern void hpte_init_native(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index b427a55..07f7b28 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -170,14 +170,16 @@ extern u64 ppc64_rma_size;
#define MMU_PAGE_64K_AP 3 /* "Admixed pages" (hash64 only) */
#define MMU_PAGE_256K 4
#define MMU_PAGE_1M 5
-#define MMU_PAGE_8M 6
-#define MMU_PAGE_16M 7
-#define MMU_PAGE_256M 8
-#define MMU_PAGE_1G 9
-#define MMU_PAGE_16G 10
-#define MMU_PAGE_64G 11
-#define MMU_PAGE_COUNT 12
-
+#define MMU_PAGE_4M 6
+#define MMU_PAGE_8M 7
+#define MMU_PAGE_16M 8
+#define MMU_PAGE_64M 9
+#define MMU_PAGE_256M 10
+#define MMU_PAGE_1G 11
+#define MMU_PAGE_16G 12
+#define MMU_PAGE_64G 13
+
+#define MMU_PAGE_COUNT 14
#if defined(CONFIG_PPC_STD_MMU_64)
/* 64-bit classic hash table MMU */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 2cd664e..dd9c4fd 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -36,6 +36,18 @@
#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_HUGETLB_PAGE
+extern unsigned int HPAGE_SHIFT;
+#else
+#define HPAGE_SHIFT PAGE_SHIFT
+#endif
+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1)
+#endif
+
/* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */
#define __HAVE_ARCH_GATE_AREA 1
@@ -158,6 +170,24 @@ extern phys_addr_t kernstart_addr;
#define is_kernel_addr(x) ((x) >= PAGE_OFFSET)
#endif
+/*
+ * Use the top bit of the higher-level page table entries to indicate whether
+ * the entries we point to contain hugepages. This works because we know that
+ * the page tables live in kernel space. If we ever decide to support having
+ * page tables at arbitrary addresses, this breaks and will have to change.
+ */
+#ifdef CONFIG_PPC64
+#define PD_HUGE 0x8000000000000000
+#else
+#define PD_HUGE 0x80000000
+#endif
+
+/*
+ * Some number of bits at the level of the page table that points to
+ * a hugepte are used to encode the size. This masks those bits.
+ */
+#define HUGEPD_SHIFT_MASK 0x3f
+
#ifndef __ASSEMBLY__
#undef STRICT_MM_TYPECHECKS
@@ -243,7 +273,6 @@ typedef unsigned long pgprot_t;
#endif
typedef struct { signed long pd; } hugepd_t;
-#define HUGEPD_SHIFT_MASK 0x3f
#ifdef CONFIG_HUGETLB_PAGE
static inline int hugepd_ok(hugepd_t hpd)
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 9356262..fb40ede 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -64,17 +64,6 @@ extern void copy_page(void *to, void *from);
/* Log 2 of page table size */
extern u64 ppc64_pft_size;
-/* Large pages size */
-#ifdef CONFIG_HUGETLB_PAGE
-extern unsigned int HPAGE_SHIFT;
-#else
-#define HPAGE_SHIFT PAGE_SHIFT
-#endif
-#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
-#define HPAGE_MASK (~(HPAGE_SIZE - 1))
-#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
-#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1)
-
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_PPC_MM_SLICES
diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
index 082d515..0156702 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/pte-book3e.h
@@ -72,6 +72,9 @@
#define PTE_RPN_SHIFT (24)
#endif
+#define PTE_WIMGE_SHIFT (19)
+#define PTE_BAP_SHIFT (2)
+
/* On 32-bit, we never clear the top part of the PTE */
#ifdef CONFIG_PPC32
#define _PTE_NONE_MASK 0xffffffff00000000ULL
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 985638d..3b3de22 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -236,8 +236,24 @@ _ENTRY(__early_start)
* if we find the pte (fall through):
* r11 is low pte word
* r12 is pointer to the pte
+ * r10 is the pshift from the PGD, if we're a hugepage
*/
#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_HUGETLB_PAGE
+#define FIND_PTE \
+ rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
+ lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+ rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \
+ blt 1000f; /* Normal non-huge page */ \
+ beq 2f; /* Bail if no table */ \
+ oris r11, r11, PD_HUGE@h; /* Put back address bit */ \
+ andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \
+ xor r12, r10, r11; /* drop size bits from pointer */ \
+ b 1001f; \
+1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
+ li r10, 0; /* clear r10 */ \
+1001: lwz r11, 4(r12); /* Get pte entry */
+#else
#define FIND_PTE \
rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
@@ -245,7 +261,8 @@ _ENTRY(__early_start)
beq 2f; /* Bail if no table */ \
rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
lwz r11, 4(r12); /* Get pte entry */
-#else
+#endif /* HUGEPAGE */
+#else /* !PTE_64BIT */
#define FIND_PTE \
rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \
lwz r11, 0(r11); /* Get L1 entry */ \
@@ -402,8 +419,8 @@ interrupt_base:
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
- subf r10,r11,r12 /* create false data dep */
- lwzx r13,r11,r10 /* Get upper pte bits */
+ subf r13,r11,r12 /* create false data dep */
+ lwzx r13,r11,r13 /* Get upper pte bits */
#else
lwz r13,0(r12) /* Get upper pte bits */
#endif
@@ -483,8 +500,8 @@ interrupt_base:
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
- subf r10,r11,r12 /* create false data dep */
- lwzx r13,r11,r10 /* Get upper pte bits */
+ subf r13,r11,r12 /* create false data dep */
+ lwzx r13,r11,r13 /* Get upper pte bits */
#else
lwz r13,0(r12) /* Get upper pte bits */
#endif
@@ -548,7 +565,7 @@ interrupt_base:
/*
* Both the instruction and data TLB miss get to this
* point to load the TLB.
- * r10 - available to use
+ * r10 - tsize encoding (if HUGETLB_PAGE) or available to use
* r11 - TLB (info from Linux PTE)
* r12 - available to use
* r13 - upper bits of PTE (if PTE_64BIT) or available to use
@@ -558,21 +575,73 @@ interrupt_base:
* Upon exit, we reload everything and RFI.
*/
finish_tlb_load:
+#ifdef CONFIG_HUGETLB_PAGE
+ cmpwi 6, r10, 0 /* check for huge page */
+ beq 6, finish_tlb_load_cont /* !huge */
+
+ /* Alas, we need more scratch registers for hugepages */
+ mfspr r12, SPRN_SPRG_THREAD
+ stw r14, THREAD_NORMSAVE(4)(r12)
+ stw r15, THREAD_NORMSAVE(5)(r12)
+ stw r16, THREAD_NORMSAVE(6)(r12)
+ stw r17, THREAD_NORMSAVE(7)(r12)
+
+ /* Get the next_tlbcam_idx percpu var */
+#ifdef CONFIG_SMP
+ lwz r12, THREAD_INFO-THREAD(r12)
+ lwz r15, TI_CPU(r12)
+ lis r14, __per_cpu_offset@h
+ ori r14, r14, __per_cpu_offset@l
+ rlwinm r15, r15, 2, 0, 29
+ lwzx r16, r14, r15
+#else
+ li r16, 0
+#endif
+ lis r17, next_tlbcam_idx@h
+ ori r17, r17, next_tlbcam_idx@l
+ add r17, r17, r16 /* r17 = *next_tlbcam_idx */
+ lwz r15, 0(r17) /* r15 = next_tlbcam_idx */
+
+ lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */
+ rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */
+ mtspr SPRN_MAS0, r14
+
+ /* Extract TLB1CFG(NENTRY) */
+ mfspr r16, SPRN_TLB1CFG
+ andi. r16, r16, 0xfff
+
+ /* Update next_tlbcam_idx, wrapping when necessary */
+ addi r15, r15, 1
+ cmpw r15, r16
+ blt 100f
+ lis r14, tlbcam_index@h
+ ori r14, r14, tlbcam_index@l
+ lwz r15, 0(r14)
+100: stw r15, 0(r17)
+
+ /*
+ * Calc MAS1_TSIZE from r10 (which has pshift encoded)
+ * tlb_enc = (pshift - 10).
+ */
+ subi r15, r10, 10
+ mfspr r16, SPRN_MAS1
+ rlwimi r16, r15, 7, 20, 24
+ mtspr SPRN_MAS1, r16
+
+ /* copy the pshift for use later */
+ mr r14, r10
+
+ /* fall through */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
/*
* We set execute, because we don't have the granularity to
* properly set this at the page level (Linux problem).
* Many of these bits are software only. Bits we don't set
* here we (properly should) assume have the appropriate value.
*/
-
- mfspr r12, SPRN_MAS2
-#ifdef CONFIG_PTE_64BIT
- rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */
-#else
- rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
-#endif
- mtspr SPRN_MAS2, r12
-
+finish_tlb_load_cont:
#ifdef CONFIG_PTE_64BIT
rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */
andi. r10, r11, _PAGE_DIRTY
@@ -581,22 +650,40 @@ finish_tlb_load:
andc r12, r12, r10
1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */
rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */
- mtspr SPRN_MAS3, r12
+2: mtspr SPRN_MAS3, r12
BEGIN_MMU_FTR_SECTION
srwi r10, r13, 12 /* grab RPN[12:31] */
mtspr SPRN_MAS7, r10
END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
#else
li r10, (_PAGE_EXEC | _PAGE_PRESENT)
+ mr r13, r11
rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */
and r12, r11, r10
andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */
slwi r10, r12, 1
or r10, r10, r12
iseleq r12, r12, r10
- rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */
- mtspr SPRN_MAS3, r11
+ rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */
+ mtspr SPRN_MAS3, r13
#endif
+
+ mfspr r12, SPRN_MAS2
+#ifdef CONFIG_PTE_64BIT
+ rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */
+#else
+ rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
+ beq 6, 3f /* don't mask if page isn't huge */
+ li r13, 1
+ slw r13, r13, r14
+ subi r13, r13, 1
+ rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */
+ andc r12, r12, r13 /* mask off ea bits within the page */
+#endif
+3: mtspr SPRN_MAS2, r12
+
#ifdef CONFIG_E200
/* Round robin TLB1 entries assignment */
mfspr r12, SPRN_MAS0
@@ -622,11 +709,19 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
mtspr SPRN_MAS0,r12
#endif /* CONFIG_E200 */
+tlb_write_entry:
tlbwe
/* Done...restore registers and get out of here. */
mfspr r10, SPRN_SPRG_THREAD
- lwz r11, THREAD_NORMSAVE(3)(r10)
+#ifdef CONFIG_HUGETLB_PAGE
+ beq 6, 8f /* skip restore for 4k page faults */
+ lwz r14, THREAD_NORMSAVE(4)(r10)
+ lwz r15, THREAD_NORMSAVE(5)(r10)
+ lwz r16, THREAD_NORMSAVE(6)(r10)
+ lwz r17, THREAD_NORMSAVE(7)(r10)
+#endif
+8: lwz r11, THREAD_NORMSAVE(3)(r10)
mtcr r11
lwz r13, THREAD_NORMSAVE(2)(r10)
lwz r12, THREAD_NORMSAVE(1)(r10)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index bdca46e..991ee81 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o
ifeq ($(CONFIG_HUGETLB_PAGE),y)
obj-y += hugetlbpage.o
obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 26b2872..1f8b2a0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -105,9 +105,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
int mmu_highuser_ssize = MMU_SEGSIZE_256M;
u16 mmu_slb_size = 64;
EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_HUGETLB_PAGE
-unsigned int HPAGE_SHIFT;
-#endif
#ifdef CONFIG_PPC_64K_PAGES
int mmu_ci_restrictions;
#endif
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
new file mode 100644
index 0000000..1295b7c
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -0,0 +1,121 @@
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+static inline int mmu_get_tsize(int psize)
+{
+ return mmu_psize_defs[psize].enc;
+}
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+ int found = 0;
+
+ mtspr(SPRN_MAS6, pid << 16);
+ if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+ asm volatile(
+ "li %0,0\n"
+ "tlbsx. 0,%1\n"
+ "bne 1f\n"
+ "li %0,1\n"
+ "1:\n"
+ : "=&r"(found) : "r"(ea));
+ } else {
+ asm volatile(
+ "tlbsx 0,%1\n"
+ "mfspr %0,0x271\n"
+ "srwi %0,%0,31\n"
+ : "=&r"(found) : "r"(ea));
+ }
+
+ return found;
+}
+
+void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte)
+{
+ unsigned long mas1, mas2;
+ u64 mas7_3;
+ unsigned long psize, tsize, shift;
+ unsigned long flags;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ int index, lz, ncams;
+ struct vm_area_struct *vma;
+#endif
+
+ if (unlikely(is_kernel_addr(ea)))
+ return;
+
+#ifdef CONFIG_MM_SLICES
+ psize = mmu_get_tsize(get_slice_psize(mm, ea));
+ tsize = mmu_get_psize(psize);
+ shift = mmu_psize_defs[psize].shift;
+#else
+ vma = find_vma(mm, ea);
+ psize = vma_mmu_pagesize(vma); /* returns actual size in bytes */
+ asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize));
+ shift = 31 - lz;
+ tsize = 21 - lz;
+#endif
+
+ /*
+ * We can't be interrupted while we're setting up the MAS
+ * regusters or after we've confirmed that no tlb exists.
+ */
+ local_irq_save(flags);
+
+ if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+ local_irq_restore(flags);
+ return;
+ }
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ /* We have to use the CAM(TLB1) on FSL parts for hugepages */
+ index = __get_cpu_var(next_tlbcam_idx);
+ mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+ else
+ __get_cpu_var(next_tlbcam_idx)++;
+#endif
+ mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+ mas2 = ea & ~((1UL << shift) - 1);
+ mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+ mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+ mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+ if (!pte_dirty(pte))
+ mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+ mtspr(SPRN_MAS1, mas1);
+ mtspr(SPRN_MAS2, mas2);
+
+ if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
+ mtspr(SPRN_MAS7_MAS3, mas7_3);
+ } else {
+ mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+ mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+ }
+
+ asm volatile ("tlbwe");
+
+ local_irq_restore(flags);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ struct hstate *hstate = hstate_file(vma->vm_file);
+ unsigned long tsize = huge_page_shift(hstate) - 10;
+
+ __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
+
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1..3a5f59d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,7 +1,8 @@
/*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
*
* Based on the IA-32 version:
* Copyright (C) 2002, Rohit Seth <[email protected]>
@@ -11,24 +12,39 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/setup.h>
#define PAGE_SHIFT_64K 16
#define PAGE_SHIFT_16M 24
#define PAGE_SHIFT_16G 34
-#define MAX_NUMBER_GPAGES 1024
+unsigned int HPAGE_SHIFT;
-/* Tracks the 16G pages after the device tree is scanned and before the
- * huge_boot_pages list is ready. */
-static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready. On 64-bit implementations, this is
+ * just used to track 16G pages and so is a single array. 32-bit
+ * implementations may have more than one gpage size due to limitations
+ * of the memory allocators, so we need multiple arrays
+ */
+#ifdef CONFIG_PPC64
+#define MAX_NUMBER_GPAGES 1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
-
-/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
+#else
+#define MAX_NUMBER_GPAGES 128
+struct psize_gpages {
+ u64 gpage_list[MAX_NUMBER_GPAGES];
+ unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#endif
static inline int shift_to_mmu_psize(unsigned int shift)
{
@@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
#define hugepd_none(hpd) ((hpd).pd == 0)
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
- BUG_ON(!hugepd_ok(hpd));
- return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
- return hpd.pd & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
-{
- unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
- pte_t *dir = hugepd_page(*hpdp);
-
- return dir + idx;
-}
-
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
pgd_t *pg;
@@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
if (is_hugepd(pm))
hpdp = (hugepd_t *)pm;
else if (!pmd_none(*pm)) {
- return pte_offset_map(pm, ea);
+ return pte_offset_kernel(pm, ea);
}
}
}
@@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
unsigned long address, unsigned pdshift, unsigned pshift)
{
- pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
- GFP_KERNEL|__GFP_REPEAT);
+ struct kmem_cache *cachep;
+ pte_t *new;
+
+#ifdef CONFIG_PPC64
+ cachep = PGT_CACHE(pdshift - pshift);
+#else
+ int i;
+ int num_hugepd = 1 << (pshift - pdshift);
+ cachep = hugepte_cache;
+#endif
+
+ new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
BUG_ON(pshift > HUGEPD_SHIFT_MASK);
BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
return -ENOMEM;
spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC64
if (!hugepd_none(*hpdp))
- kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
+ kmem_cache_free(cachep, new);
else
- hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
+ hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#else
+ /*
+ * We have multiple higher-level entries that point to the same
+ * actual pte location. Fill in each as we go and backtrack on error.
+ * We need all of these so the DTLB pgtable walk code can find the
+ * right higher-level entry without knowing if it's a hugepage or not.
+ */
+ for (i = 0; i < num_hugepd; i++, hpdp++) {
+ if (unlikely(!hugepd_none(*hpdp)))
+ break;
+ else
+ hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+ }
+ /* If we bailed from the for loop early, an error occurred, clean up */
+ if (i < num_hugepd) {
+ for (i = i - 1 ; i >= 0; i--, hpdp--)
+ hpdp->pd = 0;
+ kmem_cache_free(cachep, new);
+ }
+#endif
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
return hugepte_offset(hpdp, addr, pdshift);
}
+#ifdef CONFIG_PPC32
/* Build list of addresses of gigantic pages. This function is used in early
* boot before the buddy or bootmem allocator is setup.
*/
-void add_gpage(unsigned long addr, unsigned long page_size,
- unsigned long number_of_pages)
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+ unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+ int i;
+
+ if (addr == 0)
+ return;
+
+ gpage_freearray[idx].nr_gpages = number_of_pages;
+
+ for (i = 0; i < number_of_pages; i++) {
+ gpage_freearray[idx].gpage_list[i] = addr;
+ addr += page_size;
+ }
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+ struct huge_bootmem_page *m;
+ int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+ int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+ if (nr_gpages == 0)
+ return 0;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If gpages can be in highmem we can't use the trick of storing the
+ * data structure in the page; allocate space for this
+ */
+ m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+ m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+ m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+ list_add(&m->list, &huge_boot_pages);
+ gpage_freearray[idx].nr_gpages = nr_gpages;
+ gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+ m->hstate = hstate;
+
+ return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val)
+{
+ static phys_addr_t size;
+ unsigned long npages;
+
+ /*
+ * The hugepagesz and hugepages cmdline options are interleaved. We
+ * use the size variable to keep track of whether or not this was done
+ * properly and skip over instances where it is incorrect. Other
+ * command-line parsing code will issue warnings, so we don't need to.
+ *
+ */
+ if ((strcmp(param, "default_hugepagesz") == 0) ||
+ (strcmp(param, "hugepagesz") == 0)) {
+ size = memparse(val, NULL);
+ } else if (strcmp(param, "hugepages") == 0) {
+ if (size != 0) {
+ if (sscanf(val, "%lu", &npages) <= 0)
+ npages = 0;
+ gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+ size = 0;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle. We want to allocate these in highmem because
+ * the amount of lowmem is limited. This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocate to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+ static __initdata char cmdline[COMMAND_LINE_SIZE];
+ phys_addr_t size, base;
+ int i;
+
+ strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+
+ /*
+ * Walk gpage list in reverse, allocating larger page sizes first.
+ * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+ * When we reach the point in the list where pages are no longer
+ * considered gpages, we're done.
+ */
+ for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+ if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+ continue;
+ else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+ break;
+
+ size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+ base = memblock_alloc_base(size * gpage_npages[i], size,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ add_gpage(base, size, gpage_npages[i]);
+ }
+}
+
+#else /* PPC64 */
+
+/* Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
if (!addr)
return;
@@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
m->hstate = hstate;
return 1;
}
+#endif
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
+#ifdef CONFIG_PPC32
+#define HUGEPD_FREELIST_SIZE \
+ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+ struct rcu_head rcu;
+ unsigned int index;
+ void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+ struct hugepd_freelist *batch =
+ container_of(head, struct hugepd_freelist, rcu);
+ unsigned int i;
+
+ for (i = 0; i < batch->index; i++)
+ kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+ free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+ struct hugepd_freelist **batchp;
+
+ batchp = &__get_cpu_var(hugepd_freelist_cur);
+
+ if (atomic_read(&tlb->mm->mm_users) < 2 ||
+ cpumask_equal(mm_cpumask(tlb->mm),
+ cpumask_of(smp_processor_id()))) {
+ kmem_cache_free(hugepte_cache, hugepte);
+ return;
+ }
+
+ if (*batchp == NULL) {
+ *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+ (*batchp)->index = 0;
+ }
+
+ (*batchp)->ptes[(*batchp)->index++] = hugepte;
+ if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+ call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+ *batchp = NULL;
+ }
+}
+#endif
+
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
unsigned long start, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pte_t *hugepte = hugepd_page(*hpdp);
- unsigned shift = hugepd_shift(*hpdp);
+ int i;
+
unsigned long pdmask = ~((1UL << pdshift) - 1);
+ unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC64
+ unsigned int shift = hugepd_shift(*hpdp);
+#else
+ /* Note: On 32-bit the hpdp may be the first of several */
+ num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#endif
start &= pdmask;
if (start < floor)
@@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
if (end - 1 > ceiling - 1)
return;
- hpdp->pd = 0;
+ for (i = 0; i < num_hugepd; i++, hpdp++)
+ hpdp->pd = 0;
+
tlb->need_flush = 1;
+#ifdef CONFIG_PPC64
pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#else
+ hugepd_free(tlb, hugepte);
+#endif
}
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
* too.
*/
- pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
+ pgd = pgd_offset(tlb->mm, addr);
if (!is_hugepd(pgd)) {
if (pgd_none_or_clear_bad(pgd))
continue;
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
} else {
+#ifdef CONFIG_PPC32
+ /*
+ * Increment next by the size of the huge mapping since
+ * on 32-bit there may be more than one entry at the pgd
+ * level for a single hugepage, but all of them point to
+ * the same kmem cache that holds the hugepte.
+ */
+ next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
addr, next, floor, ceiling);
}
- } while (pgd++, addr = next, addr != end);
+ } while (addr = next, addr != end);
}
struct page *
@@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
+#ifdef CONFIG_MM_SLICES
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+#else
+ return get_unmapped_area(file, addr, len, pgoff, flags);
+#endif
}
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
+#ifdef CONFIG_MM_SLICES
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
return 1UL << mmu_psize_to_shift(psize);
+#else
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ return huge_page_size(hstate_vma(vma));
+#endif
+}
+
+static inline bool is_power_of_4(unsigned long x)
+{
+ if (is_power_of_2(x))
+ return (__ilog2(x) % 2) ? false : true;
+ return false;
}
static int __init add_huge_page_size(unsigned long long size)
@@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size)
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if ((size < PAGE_SIZE) || !is_power_of_4(size))
+ return -EINVAL;
+#else
if (!is_power_of_2(size)
|| (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
return -EINVAL;
+#endif
if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
return -EINVAL;
@@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str)
}
__setup("hugepagesz=", hugepage_setup_sz);
+#ifdef CONFIG_FSL_BOOKE
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+ int psize;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ unsigned shift;
+
+ if (!mmu_psize_defs[psize].shift)
+ continue;
+
+ shift = mmu_psize_to_shift(psize);
+
+ /* Don't treat normal page sizes as huge... */
+ if (shift != PAGE_SHIFT)
+ if (add_huge_page_size(1ULL << shift) < 0)
+ continue;
+ }
+
+ /*
+ * Create a kmem cache for hugeptes. The bottom bits in the pte have
+ * size information encoded in them, so align them to allow this
+ */
+ hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
+ HUGEPD_SHIFT_MASK + 1, 0, NULL);
+ if (hugepte_cache == NULL)
+ panic("%s: Unable to create kmem cache for hugeptes\n",
+ __func__);
+
+ /* Default hpage size = 4M */
+ if (mmu_psize_defs[MMU_PAGE_4M].shift)
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+ else
+ panic("%s: Unable to set default huge page size\n", __func__);
+
+
+ return 0;
+}
+#else
static int __init hugetlbpage_init(void)
{
int psize;
@@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void)
return 0;
}
-
+#endif
module_init(hugetlbpage_init);
void flush_dcache_icache_hugepage(struct page *page)
{
int i;
+ void *start;
BUG_ON(!PageCompound(page));
- for (i = 0; i < (1UL << compound_order(page)); i++)
- __flush_dcache_icache(page_address(page+i));
+ for (i = 0; i < (1UL << compound_order(page)); i++) {
+ if (!PageHighMem(page)) {
+ __flush_dcache_icache(page_address(page+i));
+ } else {
+ start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+ __flush_dcache_icache(start);
+ kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+ }
+ }
}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index d65b591..3e1ba4c 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -32,6 +32,8 @@
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/prom.h>
@@ -44,6 +46,7 @@
#include <asm/tlb.h>
#include <asm/sections.h>
#include <asm/system.h>
+#include <asm/hugetlb.h>
#include "mmu_decl.h"
@@ -123,6 +126,12 @@ void __init MMU_init(void)
/* parse args from command line */
MMU_setup();
+ /*
+ * Reserve gigantic pages for hugetlb. This MUST occur before
+ * lowmem_end_addr is initialized below.
+ */
+ reserve_hugetlb_gpages();
+
if (memblock.memory.cnt > 1) {
#ifndef CONFIG_WII
memblock.memory.cnt = 1;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 7209901..9e6f3a6 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -510,4 +510,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
return;
hash_preload(vma->vm_mm, address, access, trap);
#endif /* CONFIG_PPC_STD_MMU */
+#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
+ && defined(CONFIG_HUGETLB_PAGE)
+ if (is_vm_hugetlb_page(vma))
+ book3e_hugetlb_preload(vma->vm_mm, address, *ptep);
+#endif
}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 336807d..5b63bd3 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
+#ifdef CONFIG_PPC_MM_SLICES
+ if (slice_mm_new_context(mm))
+ slice_set_user_psize(mm, mmu_virtual_psize);
+#endif
+
return 0;
}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index af40c87..214130a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -27,6 +27,7 @@
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
@@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
- if (!(vma->vm_flags & VM_HUGETLB))
+ if (!is_vm_hugetlb_page(vma))
assert_pte_locked(vma->vm_mm, address);
__ptep_set_access_flags(ptep, entry);
flush_tlb_page_nohash(vma, address);
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index af08922..12ae424 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -347,24 +347,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq virt_page_table_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
#ifndef CONFIG_PPC_64K_PAGES
/* Get to PUD entry */
rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq virt_page_table_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
#endif /* CONFIG_PPC_64K_PAGES */
/* Get to PMD entry */
rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq virt_page_table_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge virt_page_table_tlb_miss_fault
/* Ok, we're all right, we can now create a kernel translation for
* a 4K or 64K page from r16 -> r15.
@@ -596,24 +596,24 @@ htw_tlb_miss:
rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq htw_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
#ifndef CONFIG_PPC_64K_PAGES
/* Get to PUD entry */
rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq htw_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
#endif /* CONFIG_PPC_64K_PAGES */
/* Get to PMD entry */
rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
- cmpldi cr0,r15,0
- beq htw_tlb_miss_fault
+ cmpdi cr0,r15,0
+ bge htw_tlb_miss_fault
/* Ok, we're all right, we can now create an indirect entry for
* a 1M or 256M page.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ea037ba..8a8a893 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -35,14 +35,49 @@
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/memblock.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/code-patching.h>
+#include <asm/hugetlb.h>
#include "mmu_decl.h"
-#ifdef CONFIG_PPC_BOOK3E
+/*
+ * This struct lists the sw-supported page sizes. The hardawre MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_BOOK3E_MMU
+#ifdef CONFIG_FSL_BOOKE
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .enc = BOOK3E_PAGESZ_4K,
+ },
+ [MMU_PAGE_4M] = {
+ .shift = 22,
+ .enc = BOOK3E_PAGESZ_4M,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .enc = BOOK3E_PAGESZ_16M,
+ },
+ [MMU_PAGE_64M] = {
+ .shift = 26,
+ .enc = BOOK3E_PAGESZ_64M,
+ },
+ [MMU_PAGE_256M] = {
+ .shift = 28,
+ .enc = BOOK3E_PAGESZ_256M,
+ },
+ [MMU_PAGE_1G] = {
+ .shift = 30,
+ .enc = BOOK3E_PAGESZ_1GB,
+ },
+};
+#else
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
@@ -76,6 +111,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.enc = BOOK3E_PAGESZ_1GB,
},
};
+#endif /* CONFIG_FSL_BOOKE */
+
static inline int mmu_get_tsize(int psize)
{
return mmu_psize_defs[psize].enc;
@@ -86,7 +123,7 @@ static inline int mmu_get_tsize(int psize)
/* This isn't used on !Book3E for now */
return 0;
}
-#endif
+#endif /* CONFIG_PPC_BOOK3E_MMU */
/* The variables below are currently only used on 64-bit Book3E
* though this will probably be made common with other nohash
@@ -265,6 +302,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ flush_hugetlb_page(vma, vmaddr);
+#endif
+
__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
mmu_get_tsize(mmu_virtual_psize), 0);
}
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 2165b65..9abc655 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -69,6 +69,7 @@ choice
config PPC_BOOK3S_64
bool "Server processors"
select PPC_FPU
+ select SYS_SUPPORTS_HUGETLBFS
config PPC_BOOK3E_64
bool "Embedded processors"
@@ -173,6 +174,7 @@ config BOOKE
config FSL_BOOKE
bool
depends on (E200 || E500) && PPC32
+ select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT
default y
# this is for common code between PPC32 & PPC64 FSL BOOKE
@@ -296,7 +298,7 @@ config PPC_BOOK3E_MMU
config PPC_MM_SLICES
bool
- default y if HUGETLB_PAGE || (PPC_STD_MMU_64 && PPC_64K_PAGES)
+ default y if (PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
default n
config VIRT_CPU_ACCOUNTING
--
1.5.6.5
On Tue, 2011-06-28 at 14:54 -0500, Becky Bruce wrote:
> struct page *alloc_huge_page_node(struct hstate *h, int nid);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6402458..2db81ea 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1105,8 +1105,14 @@ static void __init
> gather_bootmem_prealloc(void)
> struct huge_bootmem_page *m;
>
> list_for_each_entry(m, &huge_boot_pages, list) {
> - struct page *page = virt_to_page(m);
> struct hstate *h = m->hstate;
> +#ifdef CONFIG_HIGHMEM
> + struct page *page = pfn_to_page(m->phys >>
> PAGE_SHIFT);
> + free_bootmem_late((unsigned long)m,
> + sizeof(struct huge_bootmem_page));
> +#else
> + struct page *page = virt_to_page(m);
> +#endif
> __ClearPageReserved(page);
Why do you add free_bootmem_late() in the highmem case and not the
normal case ?
Cheers,
Ben.
On Jun 28, 2011, at 4:39 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2011-06-28 at 14:54 -0500, Becky Bruce wrote:
>> struct page *alloc_huge_page_node(struct hstate *h, int nid);
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 6402458..2db81ea 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1105,8 +1105,14 @@ static void __init
>> gather_bootmem_prealloc(void)
>> struct huge_bootmem_page *m;
>>
>> list_for_each_entry(m, &huge_boot_pages, list) {
>> - struct page *page = virt_to_page(m);
>> struct hstate *h = m->hstate;
>> +#ifdef CONFIG_HIGHMEM
>> + struct page *page = pfn_to_page(m->phys >>
>> PAGE_SHIFT);
>> + free_bootmem_late((unsigned long)m,
>> + sizeof(struct huge_bootmem_page));
>> +#else
>> + struct page *page = virt_to_page(m);
>> +#endif
>> __ClearPageReserved(page);
>
> Why do you add free_bootmem_late() in the highmem case and not the
> normal case ?
Because there was no bootmem allocation in the normal case - the non-highmem version stores data structure in the huge page itself. This is perfectly fine as long as you have a mapping. Since this isn't true for HIGHMEM pages, I allocate bootmem to store the early data structure that stores information about the hugepage (this happens in arch-specific code in alloc_bootmem_huge_page).
-Becky