This patch implements 4 levels of translation tables since 3 levels
of page tables with 4KB pages cannot support 40-bit physical address
space described in [1] due to the following issue.
It is a restriction that kernel logical memory map with 4KB + 3 levels
(0xffffffc000000000-0xffffffffffffffff) cannot cover RAM region from
544GB to 1024GB in [1]. Specifically, ARM64 kernel fails to create
mapping for this region in map_mem function since __phys_to_virt for
this region reaches to address overflow.
If SoC design follows the document, [1], over 32GB RAM would be placed
from 544GB. Even 64GB system is supposed to use the region from 544GB
to 576GB for only 32GB RAM. Naturally, it would reach to enable 4 levels
of page tables to avoid hacking __virt_to_phys and __phys_to_virt.
However, it is recommended 4 levels of page table should be only enabled
if memory map is too sparse or there is about 512GB RAM.
References
----------
[1]: Principles of ARM Memory Maps, White Paper, Issue C
Signed-off-by: Jungseok Lee <[email protected]>
Reviewed-by: Sungjinn Chung <[email protected]>
---
arch/arm64/Kconfig | 7 +++++
arch/arm64/include/asm/memblock.h | 6 +++++
arch/arm64/include/asm/page.h | 4 ++-
arch/arm64/include/asm/pgalloc.h | 20 +++++++++++++++
arch/arm64/include/asm/pgtable-hwdef.h | 6 +++--
arch/arm64/include/asm/pgtable.h | 44 ++++++++++++++++++++++++++++++--
arch/arm64/include/asm/tlb.h | 8 ++++++
arch/arm64/kernel/head.S | 40 ++++++++++++++++++++---------
arch/arm64/kernel/traps.c | 5 ++++
arch/arm64/mm/fault.c | 1 +
arch/arm64/mm/mmu.c | 16 +++++++++---
11 files changed, 136 insertions(+), 21 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 431acbc..7f5270b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -184,12 +184,19 @@ config ARM64_3_LEVELS
help
This feature enables 3 levels of translation tables.
+config ARM64_4_LEVELS
+ bool "4 level"
+ depends on ARM64_4K_PAGES
+ help
+ This feature enables 4 levels of translation tables.
+
endchoice
config ARM64_VA_BITS
int "Virtual address space size"
range 39 39 if ARM64_4K_PAGES && ARM64_3_LEVELS
range 42 42 if ARM64_64K_PAGES && ARM64_2_LEVELS
+ range 48 48 if ARM64_4K_PAGES && ARM64_4_LEVELS
help
This feature is determined by a combination of page size and
level of translation tables.
diff --git a/arch/arm64/include/asm/memblock.h b/arch/arm64/include/asm/memblock.h
index 6afeed2..e4ac8bf 100644
--- a/arch/arm64/include/asm/memblock.h
+++ b/arch/arm64/include/asm/memblock.h
@@ -16,6 +16,12 @@
#ifndef __ASM_MEMBLOCK_H
#define __ASM_MEMBLOCK_H
+#ifndef CONFIG_ARM64_4_LEVELS
+#define MEMBLOCK_INITIAL_LIMIT PGDIR_SIZE
+#else
+#define MEMBLOCK_INITIAL_LIMIT PUD_SIZE
+#endif
+
extern void arm64_memblock_init(void);
#endif
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 268e53d..83b5289 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -35,8 +35,10 @@
#ifdef CONFIG_ARM64_2_LEVELS
#include <asm/pgtable-2level-types.h>
-#else
+#elif defined(CONFIG_ARM64_3_LEVELS)
#include <asm/pgtable-3level-types.h>
+#else
+#include <asm/pgtable-4level-types.h>
#endif
extern void __cpu_clear_user_page(void *p, unsigned long user);
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 4829837..8d745fa 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -26,6 +26,26 @@
#define check_pgt_cache() do { } while (0)
+#ifdef CONFIG_ARM64_4_LEVELS
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
+}
+
+#endif /* CONFIG_ARM64_4_LEVELS */
+
#ifndef CONFIG_ARM64_2_LEVELS
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9cd86c6..ba30053 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -18,8 +18,10 @@
#ifdef CONFIG_ARM64_2_LEVELS
#include <asm/pgtable-2level-hwdef.h>
-#else
+#elif defined(CONFIG_ARM64_3_LEVELS)
#include <asm/pgtable-3level-hwdef.h>
+#else
+#include <asm/pgtable-4level-hwdef.h>
#endif
/*
@@ -27,7 +29,7 @@
*
* Level 1 descriptor (PUD).
*/
-
+#define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0)
#define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
/*
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index a64ce5e..efc40d1 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -35,7 +35,11 @@
* VMALLOC and SPARSEMEM_VMEMMAP ranges.
*/
#define VMALLOC_START (UL(0xffffffffffffffff) << VA_BITS)
+#ifndef CONFIG_ARM64_4_LEVELS
#define VMALLOC_END (PAGE_OFFSET - UL(0x400000000) - SZ_64K)
+#else
+#define VMALLOC_END (PAGE_OFFSET - UL(0x40000000000) - SZ_64K)
+#endif
#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K))
@@ -44,12 +48,16 @@
#ifndef __ASSEMBLY__
extern void __pte_error(const char *file, int line, unsigned long val);
extern void __pmd_error(const char *file, int line, unsigned long val);
+extern void __pud_error(const char *file, int line, unsigned long val);
extern void __pgd_error(const char *file, int line, unsigned long val);
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
#ifndef CONFIG_ARM64_2_LEVELS
#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
#endif
+#ifdef CONFIG_ARM64_4_LEVELS
+#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
+#endif
#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
/*
@@ -344,6 +352,30 @@ static inline pmd_t *pud_page_vaddr(pud_t pud)
#endif /* CONFIG_ARM64_2_LEVELS */
+#ifdef CONFIG_ARM64_4_LEVELS
+
+#define pgd_none(pgd) (!pgd_val(pgd))
+#define pgd_bad(pgd) (!(pgd_val(pgd) & 2))
+#define pgd_present(pgd) (pgd_val(pgd))
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ *pgdp = pgd;
+ dsb();
+}
+
+static inline void pgd_clear(pgd_t *pgdp)
+{
+ set_pgd(pgdp, __pgd(0));
+}
+
+static inline pud_t *pgd_page_vaddr(pgd_t pgd)
+{
+ return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK);
+}
+
+#endif /* CONFIG_ARM64_4_LEVELS */
+
/* to find an entry in a page-table-directory */
#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
@@ -352,6 +384,14 @@ static inline pmd_t *pud_page_vaddr(pud_t pud)
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
+#ifdef CONFIG_ARM64_4_LEVELS
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
+{
+ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
+}
+#endif
+
/* Find an entry in the second-level page table.. */
#ifndef CONFIG_ARM64_2_LEVELS
#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
@@ -380,8 +420,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
-#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
-#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
+#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
+#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
/*
* Encode and decode a swap entry:
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index df378b2..dedfb04 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -99,5 +99,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
}
#endif
+#ifdef CONFIG_ARM64_4_LEVELS
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pmd_t *pudp,
+ unsigned long addr)
+{
+ tlb_add_flush(tlb, addr);
+ tlb_remove_page(tlb, virt_to_page(pudp));
+}
+#endif
#endif
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 0fd5650..f313a7a 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -37,8 +37,8 @@
/*
* swapper_pg_dir is the virtual address of the initial page table. We place
- * the page tables 3 * PAGE_SIZE below KERNEL_RAM_VADDR. The idmap_pg_dir has
- * 2 pages and is placed below swapper_pg_dir.
+ * the page tables 4 * PAGE_SIZE below KERNEL_RAM_VADDR. The idmap_pg_dir has
+ * 3 pages and is placed below swapper_pg_dir.
*/
#define KERNEL_RAM_VADDR (PAGE_OFFSET + TEXT_OFFSET)
@@ -46,8 +46,8 @@
#error KERNEL_RAM_VADDR must start at 0xXXX80000
#endif
-#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
-#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
+#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
+#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
.globl swapper_pg_dir
.equ swapper_pg_dir, KERNEL_RAM_VADDR - SWAPPER_DIR_SIZE
@@ -371,16 +371,29 @@ ENDPROC(__calc_phys_offset)
/*
* Macro to populate the PGD for the corresponding block entry in the next
- * level (tbl) for the given virtual address.
+ * levels (tbl1 and tbl2) for the given virtual address.
*
- * Preserves: pgd, tbl, virt
+ * Preserves: pgd, tbl1, tbl2, virt
* Corrupts: tmp1, tmp2
*/
- .macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2
+ .macro create_pgd_entry, pgd, tbl1, tbl2, virt, tmp1, tmp2
lsr \tmp1, \virt, #PGDIR_SHIFT
and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
- orr \tmp2, \tbl, #3 // PGD entry table type
+ orr \tmp2, \tbl1, #3 // PGD entry table type
str \tmp2, [\pgd, \tmp1, lsl #3]
+#ifdef CONFIG_ARM64_4_LEVELS
+ ldr \tbl2, =FIXADDR_TOP
+ cmp \tbl2, \virt
+ add \tbl2, \tbl1, #PAGE_SIZE
+ b.ne 1f
+ add \tbl2, \tbl2, #PAGE_SIZE
+1:
+ lsr \tmp1, \virt, #PUD_SHIFT
+ and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
+ orr \tmp2, \tbl2, #3 // PUD entry table type
+ str \tmp2, [\tbl1, \tmp1, lsl #3]
+ mov \tbl1, \tbl2
+#endif
.endm
/*
@@ -444,7 +457,7 @@ __create_page_tables:
add x0, x25, #PAGE_SIZE // section table address
ldr x3, =KERNEL_START
add x3, x3, x28 // __pa(KERNEL_START)
- create_pgd_entry x25, x0, x3, x5, x6
+ create_pgd_entry x25, x0, x1, x3, x5, x6
ldr x6, =KERNEL_END
mov x5, x3 // __pa(KERNEL_START)
add x6, x6, x28 // __pa(KERNEL_END)
@@ -455,7 +468,7 @@ __create_page_tables:
*/
add x0, x26, #PAGE_SIZE // section table address
mov x5, #PAGE_OFFSET
- create_pgd_entry x26, x0, x5, x3, x6
+ create_pgd_entry x26, x0, x1, x5, x3, x6
ldr x6, =KERNEL_END
mov x3, x24 // phys offset
create_block_map x0, x7, x3, x5, x6
@@ -480,8 +493,11 @@ __create_page_tables:
* Create the pgd entry for the fixed mappings.
*/
ldr x5, =FIXADDR_TOP // Fixed mapping virtual address
- add x0, x26, #2 * PAGE_SIZE // section table address
- create_pgd_entry x26, x0, x5, x6, x7
+ add x0, x26, #PAGE_SIZE
+#ifndef CONFIG_ARM64_4_LEVELS
+ add x0, x0, #PAGE_SIZE
+#endif
+ create_pgd_entry x26, x0, x1, x5, x6, x7
/*
* Since the page tables have been populated with non-cacheable
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 0484e81..16d5ee5 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -336,6 +336,11 @@ void __pmd_error(const char *file, int line, unsigned long val)
pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val);
}
+void __pud_error(const char *file, int line, unsigned long val)
+{
+ pr_crit("%s:%d: bad pud %016lx.\n", file, line, val);
+}
+
void __pgd_error(const char *file, int line, unsigned long val)
{
pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c23751b..ed4a343 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -61,6 +61,7 @@ void show_pte(struct mm_struct *mm, unsigned long addr)
break;
pud = pud_offset(pgd, addr);
+ printk(", *pud=%016llx", pud_val(*pud));
if (pud_none(*pud) || pud_bad(*pud))
break;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6b7e895..4d29332 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -32,6 +32,7 @@
#include <asm/setup.h>
#include <asm/sizes.h>
#include <asm/tlb.h>
+#include <asm/memblock.h>
#include <asm/mmu_context.h>
#include "mm.h"
@@ -222,9 +223,15 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
unsigned long end, unsigned long phys)
{
- pud_t *pud = pud_offset(pgd, addr);
+ pud_t *pud;
unsigned long next;
+ if (pgd_none(*pgd) || pgd_bad(*pgd)) {
+ pud = early_alloc(PTRS_PER_PUD * sizeof(pud_t));
+ pgd_populate(&init_mm, pgd, pud);
+ }
+
+ pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
alloc_init_pmd(pud, addr, next, phys);
@@ -271,10 +278,11 @@ static void __init map_mem(void)
* memory addressable from the initial direct kernel mapping.
*
* The initial direct kernel mapping, located at swapper_pg_dir,
- * gives us PGDIR_SIZE memory starting from PHYS_OFFSET (which must be
- * aligned to 2MB as per Documentation/arm64/booting.txt).
+ * gives us PGDIR_SIZE (2 and 3 levels) or PUD_SIZE (4 levels) memory
+ * starting from PHYS_OFFSET (which must be aligned to 2MB as per
+ * Documentation/arm64/booting.txt).
*/
- limit = PHYS_OFFSET + PGDIR_SIZE;
+ limit = PHYS_OFFSET + MEMBLOCK_INITIAL_LIMIT;
memblock_set_current_limit(limit);
/* map all the memory banks */
--
1.7.10.4
On Fri, Apr 18, 2014 at 04:59:20PM +0900, Jungseok Lee wrote:
> This patch implements 4 levels of translation tables since 3 levels
> of page tables with 4KB pages cannot support 40-bit physical address
> space described in [1] due to the following issue.
>
> It is a restriction that kernel logical memory map with 4KB + 3 levels
> (0xffffffc000000000-0xffffffffffffffff) cannot cover RAM region from
> 544GB to 1024GB in [1]. Specifically, ARM64 kernel fails to create
> mapping for this region in map_mem function since __phys_to_virt for
> this region reaches to address overflow.
>
> If SoC design follows the document, [1], over 32GB RAM would be placed
> from 544GB. Even 64GB system is supposed to use the region from 544GB
> to 576GB for only 32GB RAM. Naturally, it would reach to enable 4 levels
> of page tables to avoid hacking __virt_to_phys and __phys_to_virt.
>
> However, it is recommended 4 levels of page table should be only enabled
> if memory map is too sparse or there is about 512GB RAM.
Hello Jungseok,
A few comments can be found inline...
>
> References
> ----------
> [1]: Principles of ARM Memory Maps, White Paper, Issue C
>
> Signed-off-by: Jungseok Lee <[email protected]>
> Reviewed-by: Sungjinn Chung <[email protected]>
> ---
> arch/arm64/Kconfig | 7 +++++
> arch/arm64/include/asm/memblock.h | 6 +++++
> arch/arm64/include/asm/page.h | 4 ++-
> arch/arm64/include/asm/pgalloc.h | 20 +++++++++++++++
> arch/arm64/include/asm/pgtable-hwdef.h | 6 +++--
> arch/arm64/include/asm/pgtable.h | 44 ++++++++++++++++++++++++++++++--
> arch/arm64/include/asm/tlb.h | 8 ++++++
> arch/arm64/kernel/head.S | 40 ++++++++++++++++++++---------
> arch/arm64/kernel/traps.c | 5 ++++
> arch/arm64/mm/fault.c | 1 +
> arch/arm64/mm/mmu.c | 16 +++++++++---
> 11 files changed, 136 insertions(+), 21 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 431acbc..7f5270b 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -184,12 +184,19 @@ config ARM64_3_LEVELS
> help
> This feature enables 3 levels of translation tables.
>
> +config ARM64_4_LEVELS
> + bool "4 level"
> + depends on ARM64_4K_PAGES
> + help
> + This feature enables 4 levels of translation tables.
> +
> endchoice
>
> config ARM64_VA_BITS
> int "Virtual address space size"
> range 39 39 if ARM64_4K_PAGES && ARM64_3_LEVELS
> range 42 42 if ARM64_64K_PAGES && ARM64_2_LEVELS
> + range 48 48 if ARM64_4K_PAGES && ARM64_4_LEVELS
> help
> This feature is determined by a combination of page size and
> level of translation tables.
> diff --git a/arch/arm64/include/asm/memblock.h b/arch/arm64/include/asm/memblock.h
> index 6afeed2..e4ac8bf 100644
> --- a/arch/arm64/include/asm/memblock.h
> +++ b/arch/arm64/include/asm/memblock.h
> @@ -16,6 +16,12 @@
> #ifndef __ASM_MEMBLOCK_H
> #define __ASM_MEMBLOCK_H
>
> +#ifndef CONFIG_ARM64_4_LEVELS
> +#define MEMBLOCK_INITIAL_LIMIT PGDIR_SIZE
> +#else
> +#define MEMBLOCK_INITIAL_LIMIT PUD_SIZE
> +#endif
> +
> extern void arm64_memblock_init(void);
>
> #endif
> diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> index 268e53d..83b5289 100644
> --- a/arch/arm64/include/asm/page.h
> +++ b/arch/arm64/include/asm/page.h
> @@ -35,8 +35,10 @@
>
> #ifdef CONFIG_ARM64_2_LEVELS
> #include <asm/pgtable-2level-types.h>
> -#else
> +#elif defined(CONFIG_ARM64_3_LEVELS)
> #include <asm/pgtable-3level-types.h>
> +#else
> +#include <asm/pgtable-4level-types.h>
> #endif
>
> extern void __cpu_clear_user_page(void *p, unsigned long user);
> diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
> index 4829837..8d745fa 100644
> --- a/arch/arm64/include/asm/pgalloc.h
> +++ b/arch/arm64/include/asm/pgalloc.h
> @@ -26,6 +26,26 @@
>
> #define check_pgt_cache() do { } while (0)
>
> +#ifdef CONFIG_ARM64_4_LEVELS
> +
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> + return (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
> +}
> +
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> + BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
> + free_page((unsigned long)pud);
> +}
> +
> +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
> +{
> + set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
> +}
> +
> +#endif /* CONFIG_ARM64_4_LEVELS */
> +
> #ifndef CONFIG_ARM64_2_LEVELS
>
> static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index 9cd86c6..ba30053 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -18,8 +18,10 @@
>
> #ifdef CONFIG_ARM64_2_LEVELS
> #include <asm/pgtable-2level-hwdef.h>
> -#else
> +#elif defined(CONFIG_ARM64_3_LEVELS)
> #include <asm/pgtable-3level-hwdef.h>
> +#else
> +#include <asm/pgtable-4level-hwdef.h>
> #endif
>
> /*
> @@ -27,7 +29,7 @@
> *
> * Level 1 descriptor (PUD).
> */
> -
> +#define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0)
> #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
>
> /*
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index a64ce5e..efc40d1 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -35,7 +35,11 @@
> * VMALLOC and SPARSEMEM_VMEMMAP ranges.
> */
> #define VMALLOC_START (UL(0xffffffffffffffff) << VA_BITS)
> +#ifndef CONFIG_ARM64_4_LEVELS
> #define VMALLOC_END (PAGE_OFFSET - UL(0x400000000) - SZ_64K)
> +#else
> +#define VMALLOC_END (PAGE_OFFSET - UL(0x40000000000) - SZ_64K)
> +#endif
>
> #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K))
>
> @@ -44,12 +48,16 @@
> #ifndef __ASSEMBLY__
> extern void __pte_error(const char *file, int line, unsigned long val);
> extern void __pmd_error(const char *file, int line, unsigned long val);
> +extern void __pud_error(const char *file, int line, unsigned long val);
> extern void __pgd_error(const char *file, int line, unsigned long val);
>
> #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
> #ifndef CONFIG_ARM64_2_LEVELS
> #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
> #endif
> +#ifdef CONFIG_ARM64_4_LEVELS
> +#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
> +#endif
> #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
>
> /*
> @@ -344,6 +352,30 @@ static inline pmd_t *pud_page_vaddr(pud_t pud)
>
> #endif /* CONFIG_ARM64_2_LEVELS */
>
> +#ifdef CONFIG_ARM64_4_LEVELS
> +
> +#define pgd_none(pgd) (!pgd_val(pgd))
> +#define pgd_bad(pgd) (!(pgd_val(pgd) & 2))
> +#define pgd_present(pgd) (pgd_val(pgd))
> +
> +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
> +{
> + *pgdp = pgd;
> + dsb();
> +}
> +
> +static inline void pgd_clear(pgd_t *pgdp)
> +{
> + set_pgd(pgdp, __pgd(0));
> +}
> +
> +static inline pud_t *pgd_page_vaddr(pgd_t pgd)
> +{
> + return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK);
> +}
> +
> +#endif /* CONFIG_ARM64_4_LEVELS */
> +
> /* to find an entry in a page-table-directory */
> #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
>
> @@ -352,6 +384,14 @@ static inline pmd_t *pud_page_vaddr(pud_t pud)
> /* to find an entry in a kernel page-table-directory */
> #define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
>
> +#ifdef CONFIG_ARM64_4_LEVELS
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
> +{
> + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
> +}
> +#endif
> +
> /* Find an entry in the second-level page table.. */
> #ifndef CONFIG_ARM64_2_LEVELS
> #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
> @@ -380,8 +420,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
> extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
> extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
>
> -#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
> -#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
> +#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
> +#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
>
> /*
> * Encode and decode a swap entry:
> diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
> index df378b2..dedfb04 100644
> --- a/arch/arm64/include/asm/tlb.h
> +++ b/arch/arm64/include/asm/tlb.h
> @@ -99,5 +99,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
> }
> #endif
>
> +#ifdef CONFIG_ARM64_4_LEVELS
> +static inline void __pud_free_tlb(struct mmu_gather *tlb, pmd_t *pudp,
> + unsigned long addr)
> +{
> + tlb_add_flush(tlb, addr);
> + tlb_remove_page(tlb, virt_to_page(pudp));
> +}
> +#endif
>
> #endif
> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index 0fd5650..f313a7a 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -37,8 +37,8 @@
>
> /*
> * swapper_pg_dir is the virtual address of the initial page table. We place
> - * the page tables 3 * PAGE_SIZE below KERNEL_RAM_VADDR. The idmap_pg_dir has
> - * 2 pages and is placed below swapper_pg_dir.
> + * the page tables 4 * PAGE_SIZE below KERNEL_RAM_VADDR. The idmap_pg_dir has
> + * 3 pages and is placed below swapper_pg_dir.
> */
> #define KERNEL_RAM_VADDR (PAGE_OFFSET + TEXT_OFFSET)
>
> @@ -46,8 +46,8 @@
> #error KERNEL_RAM_VADDR must start at 0xXXX80000
> #endif
>
> -#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
> -#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
> +#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
> +#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
>
> .globl swapper_pg_dir
> .equ swapper_pg_dir, KERNEL_RAM_VADDR - SWAPPER_DIR_SIZE
> @@ -371,16 +371,29 @@ ENDPROC(__calc_phys_offset)
>
> /*
> * Macro to populate the PGD for the corresponding block entry in the next
> - * level (tbl) for the given virtual address.
> + * levels (tbl1 and tbl2) for the given virtual address.
> *
> - * Preserves: pgd, tbl, virt
> + * Preserves: pgd, tbl1, tbl2, virt
tbl1 and tbl2 are *not* preserved for 4 level. tbl1 is bumped up one
page to make space for the pud, then fed into create_block_mapping
later.
> * Corrupts: tmp1, tmp2
> */
> - .macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2
> + .macro create_pgd_entry, pgd, tbl1, tbl2, virt, tmp1, tmp2
> lsr \tmp1, \virt, #PGDIR_SHIFT
> and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
> - orr \tmp2, \tbl, #3 // PGD entry table type
> + orr \tmp2, \tbl1, #3 // PGD entry table type
> str \tmp2, [\pgd, \tmp1, lsl #3]
> +#ifdef CONFIG_ARM64_4_LEVELS
> + ldr \tbl2, =FIXADDR_TOP
> + cmp \tbl2, \virt
Do we need this extra logic? See my other comment below where the fixed
mapping is placed down.
> + add \tbl2, \tbl1, #PAGE_SIZE
> + b.ne 1f
> + add \tbl2, \tbl2, #PAGE_SIZE
> +1:
> + lsr \tmp1, \virt, #PUD_SHIFT
> + and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
> + orr \tmp2, \tbl2, #3 // PUD entry table type
> + str \tmp2, [\tbl1, \tmp1, lsl #3]
> + mov \tbl1, \tbl2
> +#endif
It may be easier to read to have a create_pud_entry macro too?
> .endm
>
> /*
> @@ -444,7 +457,7 @@ __create_page_tables:
> add x0, x25, #PAGE_SIZE // section table address
> ldr x3, =KERNEL_START
> add x3, x3, x28 // __pa(KERNEL_START)
> - create_pgd_entry x25, x0, x3, x5, x6
> + create_pgd_entry x25, x0, x1, x3, x5, x6
> ldr x6, =KERNEL_END
> mov x5, x3 // __pa(KERNEL_START)
> add x6, x6, x28 // __pa(KERNEL_END)
> @@ -455,7 +468,7 @@ __create_page_tables:
> */
> add x0, x26, #PAGE_SIZE // section table address
> mov x5, #PAGE_OFFSET
> - create_pgd_entry x26, x0, x5, x3, x6
> + create_pgd_entry x26, x0, x1, x5, x3, x6
> ldr x6, =KERNEL_END
> mov x3, x24 // phys offset
> create_block_map x0, x7, x3, x5, x6
> @@ -480,8 +493,11 @@ __create_page_tables:
> * Create the pgd entry for the fixed mappings.
> */
> ldr x5, =FIXADDR_TOP // Fixed mapping virtual address
> - add x0, x26, #2 * PAGE_SIZE // section table address
> - create_pgd_entry x26, x0, x5, x6, x7
> + add x0, x26, #PAGE_SIZE
> +#ifndef CONFIG_ARM64_4_LEVELS
> + add x0, x0, #PAGE_SIZE
> +#endif
This is overly complicated. For <4 levels we set x0 to be:
ttbr1 + 2*PAGE_SIZE. For 4-levels, we set x0 to be ttbr1 + PAGE_SIZE,
then inside the create_pgd_entry macro, we check the VA for FIXADDR_TOP
then add another PAGE_SIZE. This is presumably done so the same PUD is
used for the swapper block map and the FIXADDR map.
If you assume that the PUD always follows the PGD for 4-levels, then
you can remove this #ifdef and the conditional VA logic in
set_pgd_entry. To make the logic simpler for <4 levels, you could call
create_pud_entry in the middle of create_pgd_entry, then put down the
actual pgd after.
> + create_pgd_entry x26, x0, x1, x5, x6, x7
>
So before this patch we have the following created by
__create_page_tables:
+========================+ <--- TEXT_OFFSET + PHYS_OFFSET
| FIXADDR (pmd or pte) |
+------------------------+
| block map (pmd or pte) |
+------------------------+
| PGDs for swapper |
+========================+ <--- TTBR1 swapper_pg_dir
| block map for idmap |
+------------------------+
| PGDs for idmap |
+------------------------+ <--- TTBR0 idmap_pg_dir
After the patch, for 4 levels activated we have:
+========================+ <--- TEXT_OFFSET + PHYS_OFFSET
| FIXADDR (ptes) |
+------------------------+
| block map (ptes) |
+------------------------+
| PUDs for swapper |
+------------------------+
| PGDs for swapper |
+========================+ <--- TTBR1 swapper_pg_dir
| block map for idmap |
+------------------------+
| PUDs for idmap |
+------------------------+
| PGDs for idmap |
+------------------------+ <--- TTBR0 idmap_pg_dir
and without 4 levels activated we have:
+========================+ <--- TEXT_OFFSET + PHYS_OFFSET
| ZERO BYTES |
+------------------------+
| FIXADDR (pmd or pte) |
+------------------------+
| block map (pmd or pte) |
+------------------------+
| PGDs for swapper |
+========================+ <--- TTBR1 swapper_pg_dir
| ZERO BYTES |
+------------------------+
| block map for idmap |
+------------------------+
| PGDs for idmap |
+------------------------+ <--- TTBR0 idmap_pg_dir
This is a pity as we are potentially throwing away 128KB.
I would recommend only extending the sizes of IDMAP_DIR_SIZE and
SWAPPER_DIR_SIZE if necessary.
> /*
> * Since the page tables have been populated with non-cacheable
> diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> index 0484e81..16d5ee5 100644
> --- a/arch/arm64/kernel/traps.c
> +++ b/arch/arm64/kernel/traps.c
> @@ -336,6 +336,11 @@ void __pmd_error(const char *file, int line, unsigned long val)
> pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val);
> }
>
> +void __pud_error(const char *file, int line, unsigned long val)
> +{
> + pr_crit("%s:%d: bad pud %016lx.\n", file, line, val);
> +}
> +
> void __pgd_error(const char *file, int line, unsigned long val)
> {
> pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val);
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index c23751b..ed4a343 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -61,6 +61,7 @@ void show_pte(struct mm_struct *mm, unsigned long addr)
> break;
>
> pud = pud_offset(pgd, addr);
> + printk(", *pud=%016llx", pud_val(*pud));
> if (pud_none(*pud) || pud_bad(*pud))
> break;
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 6b7e895..4d29332 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -32,6 +32,7 @@
> #include <asm/setup.h>
> #include <asm/sizes.h>
> #include <asm/tlb.h>
> +#include <asm/memblock.h>
> #include <asm/mmu_context.h>
>
> #include "mm.h"
> @@ -222,9 +223,15 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
> static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
> unsigned long end, unsigned long phys)
> {
> - pud_t *pud = pud_offset(pgd, addr);
> + pud_t *pud;
> unsigned long next;
>
> + if (pgd_none(*pgd) || pgd_bad(*pgd)) {
> + pud = early_alloc(PTRS_PER_PUD * sizeof(pud_t));
> + pgd_populate(&init_mm, pgd, pud);
> + }
> +
> + pud = pud_offset(pgd, addr);
> do {
> next = pud_addr_end(addr, end);
> alloc_init_pmd(pud, addr, next, phys);
> @@ -271,10 +278,11 @@ static void __init map_mem(void)
> * memory addressable from the initial direct kernel mapping.
> *
> * The initial direct kernel mapping, located at swapper_pg_dir,
> - * gives us PGDIR_SIZE memory starting from PHYS_OFFSET (which must be
> - * aligned to 2MB as per Documentation/arm64/booting.txt).
> + * gives us PGDIR_SIZE (2 and 3 levels) or PUD_SIZE (4 levels) memory
> + * starting from PHYS_OFFSET (which must be aligned to 2MB as per
> + * Documentation/arm64/booting.txt).
> */
> - limit = PHYS_OFFSET + PGDIR_SIZE;
> + limit = PHYS_OFFSET + MEMBLOCK_INITIAL_LIMIT;
> memblock_set_current_limit(limit);
>
> /* map all the memory banks */
> --
> 1.7.10.4
>
>
On Thursday, April 24, 2014 1:02 AM, Steve Capper wrote:
> On Fri, Apr 18, 2014 at 04:59:20PM +0900, Jungseok Lee wrote:
> > This patch implements 4 levels of translation tables since 3 levels of
> > page tables with 4KB pages cannot support 40-bit physical address
> > space described in [1] due to the following issue.
> >
> > It is a restriction that kernel logical memory map with 4KB + 3 levels
> > (0xffffffc000000000-0xffffffffffffffff) cannot cover RAM region from
> > 544GB to 1024GB in [1]. Specifically, ARM64 kernel fails to create
> > mapping for this region in map_mem function since __phys_to_virt for
> > this region reaches to address overflow.
> >
> > If SoC design follows the document, [1], over 32GB RAM would be placed
> > from 544GB. Even 64GB system is supposed to use the region from 544GB
> > to 576GB for only 32GB RAM. Naturally, it would reach to enable 4
> > levels of page tables to avoid hacking __virt_to_phys and __phys_to_virt.
> >
> > However, it is recommended 4 levels of page table should be only
> > enabled if memory map is too sparse or there is about 512GB RAM.
>
> Hello Jungseok,
> A few comments can be found inline...
Hi Steve, The comments are very helpful. Thanks.
[ ... ]
> > diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index
> > 0fd5650..f313a7a 100644
> > --- a/arch/arm64/kernel/head.S
> > +++ b/arch/arm64/kernel/head.S
> > @@ -37,8 +37,8 @@
> >
> > /*
> > * swapper_pg_dir is the virtual address of the initial page table.
> > We place
> > - * the page tables 3 * PAGE_SIZE below KERNEL_RAM_VADDR. The
> > idmap_pg_dir has
> > - * 2 pages and is placed below swapper_pg_dir.
> > + * the page tables 4 * PAGE_SIZE below KERNEL_RAM_VADDR. The
> > + idmap_pg_dir has
> > + * 3 pages and is placed below swapper_pg_dir.
> > */
> > #define KERNEL_RAM_VADDR (PAGE_OFFSET + TEXT_OFFSET)
> >
> > @@ -46,8 +46,8 @@
> > #error KERNEL_RAM_VADDR must start at 0xXXX80000 #endif
> >
> > -#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
> > -#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
> > +#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
> > +#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
> >
> > .globl swapper_pg_dir
> > .equ swapper_pg_dir, KERNEL_RAM_VADDR - SWAPPER_DIR_SIZE
> > @@ -371,16 +371,29 @@ ENDPROC(__calc_phys_offset)
> >
> > /*
> > * Macro to populate the PGD for the corresponding block entry in the
> > next
> > - * level (tbl) for the given virtual address.
> > + * levels (tbl1 and tbl2) for the given virtual address.
> > *
> > - * Preserves: pgd, tbl, virt
> > + * Preserves: pgd, tbl1, tbl2, virt
>
> tbl1 and tbl2 are *not* preserved for 4 level. tbl1 is bumped up one page to make space for the pud,
> then fed into create_block_mapping later.
Your logic can be extended to 3 levels.
In an original code, tbl is fed into create_block_mapping.
That is why I've written them down as "preserves".
I will fix it in the next version.
> > * Corrupts: tmp1, tmp2
> > */
> > - .macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2
> > + .macro create_pgd_entry, pgd, tbl1, tbl2, virt, tmp1, tmp2
> > lsr \tmp1, \virt, #PGDIR_SHIFT
> > and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
> > - orr \tmp2, \tbl, #3 // PGD entry table type
> > + orr \tmp2, \tbl1, #3 // PGD entry table type
> > str \tmp2, [\pgd, \tmp1, lsl #3]
> > +#ifdef CONFIG_ARM64_4_LEVELS
> > + ldr \tbl2, =FIXADDR_TOP
> > + cmp \tbl2, \virt
>
> Do we need this extra logic? See my other comment below where the fixed mapping is placed down.
>
> > + add \tbl2, \tbl1, #PAGE_SIZE
> > + b.ne 1f
> > + add \tbl2, \tbl2, #PAGE_SIZE
> > +1:
> > + lsr \tmp1, \virt, #PUD_SHIFT
> > + and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
> > + orr \tmp2, \tbl2, #3 // PUD entry table type
> > + str \tmp2, [\tbl1, \tmp1, lsl #3]
> > + mov \tbl1, \tbl2
> > +#endif
>
> It may be easier to read to have a create_pud_entry macro too?
Okay. I will write a create_pud_entry macro.
> > .endm
> >
> > /*
> > @@ -444,7 +457,7 @@ __create_page_tables:
> > add x0, x25, #PAGE_SIZE // section table address
> > ldr x3, =KERNEL_START
> > add x3, x3, x28 // __pa(KERNEL_START)
> > - create_pgd_entry x25, x0, x3, x5, x6
> > + create_pgd_entry x25, x0, x1, x3, x5, x6
> > ldr x6, =KERNEL_END
> > mov x5, x3 // __pa(KERNEL_START)
> > add x6, x6, x28 // __pa(KERNEL_END)
> > @@ -455,7 +468,7 @@ __create_page_tables:
> > */
> > add x0, x26, #PAGE_SIZE // section table address
> > mov x5, #PAGE_OFFSET
> > - create_pgd_entry x26, x0, x5, x3, x6
> > + create_pgd_entry x26, x0, x1, x5, x3, x6
> > ldr x6, =KERNEL_END
> > mov x3, x24 // phys offset
> > create_block_map x0, x7, x3, x5, x6
> > @@ -480,8 +493,11 @@ __create_page_tables:
> > * Create the pgd entry for the fixed mappings.
> > */
> > ldr x5, =FIXADDR_TOP // Fixed mapping virtual address
> > - add x0, x26, #2 * PAGE_SIZE // section table address
> > - create_pgd_entry x26, x0, x5, x6, x7
> > + add x0, x26, #PAGE_SIZE
> > +#ifndef CONFIG_ARM64_4_LEVELS
> > + add x0, x0, #PAGE_SIZE
> > +#endif
>
> This is overly complicated. For <4 levels we set x0 to be:
> ttbr1 + 2*PAGE_SIZE. For 4-levels, we set x0 to be ttbr1 + PAGE_SIZE, then inside the create_pgd_entry
> macro, we check the VA for FIXADDR_TOP then add another PAGE_SIZE. This is presumably done so the same
> PUD is used for the swapper block map and the FIXADDR map.
>
> If you assume that the PUD always follows the PGD for 4-levels, then you can remove this #ifdef and
> the conditional VA logic in set_pgd_entry. To make the logic simpler for <4 levels, you could call
> create_pud_entry in the middle of create_pgd_entry, then put down the actual pgd after.
Okay, I will revise it in an easy and neat way.
> > + create_pgd_entry x26, x0, x1, x5, x6, x7
> >
>
> So before this patch we have the following created by
> __create_page_tables:
>
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | FIXADDR (pmd or pte) |
> +------------------------+
> | block map (pmd or pte) |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | block map for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
>
>
> After the patch, for 4 levels activated we have:
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | FIXADDR (ptes) |
> +------------------------+
> | block map (ptes) |
> +------------------------+
> | PUDs for swapper |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | block map for idmap |
> +------------------------+
> | PUDs for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
>
> and without 4 levels activated we have:
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | ZERO BYTES |
> +------------------------+
> | FIXADDR (pmd or pte) |
> +------------------------+
> | block map (pmd or pte) |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | ZERO BYTES |
> +------------------------+
> | block map for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
>
> This is a pity as we are potentially throwing away 128KB.
> I would recommend only extending the sizes of IDMAP_DIR_SIZE and SWAPPER_DIR_SIZE if necessary.
Yes, you're right.
I will introduce #ifdef statements for their size adjustment.
Best Regards
Jungseok Lee
On Thursday, April 24, 2014 1:02 AM, Steve Capper wrote:
> On Fri, Apr 18, 2014 at 04:59:20PM +0900, Jungseok Lee wrote:
[ ... ]
> > diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index
> > 0fd5650..f313a7a 100644
> > --- a/arch/arm64/kernel/head.S
> > +++ b/arch/arm64/kernel/head.S
> > @@ -37,8 +37,8 @@
> >
> > /*
> > * swapper_pg_dir is the virtual address of the initial page table.
> > We place
> > - * the page tables 3 * PAGE_SIZE below KERNEL_RAM_VADDR. The
> > idmap_pg_dir has
> > - * 2 pages and is placed below swapper_pg_dir.
> > + * the page tables 4 * PAGE_SIZE below KERNEL_RAM_VADDR. The
> > + idmap_pg_dir has
> > + * 3 pages and is placed below swapper_pg_dir.
> > */
> > #define KERNEL_RAM_VADDR (PAGE_OFFSET + TEXT_OFFSET)
> >
> > @@ -46,8 +46,8 @@
> > #error KERNEL_RAM_VADDR must start at 0xXXX80000 #endif
> >
> > -#define SWAPPER_DIR_SIZE (3 * PAGE_SIZE)
> > -#define IDMAP_DIR_SIZE (2 * PAGE_SIZE)
> > +#define SWAPPER_DIR_SIZE (4 * PAGE_SIZE)
> > +#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
> >
> > .globl swapper_pg_dir
> > .equ swapper_pg_dir, KERNEL_RAM_VADDR - SWAPPER_DIR_SIZE
> > @@ -371,16 +371,29 @@ ENDPROC(__calc_phys_offset)
> >
> > /*
> > * Macro to populate the PGD for the corresponding block entry in the
> > next
> > - * level (tbl) for the given virtual address.
> > + * levels (tbl1 and tbl2) for the given virtual address.
> > *
> > - * Preserves: pgd, tbl, virt
> > + * Preserves: pgd, tbl1, tbl2, virt
>
> tbl1 and tbl2 are *not* preserved for 4 level. tbl1 is bumped up one page to make space for the pud,
> then fed into create_block_mapping later.
>
> > * Corrupts: tmp1, tmp2
> > */
> > - .macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2
> > + .macro create_pgd_entry, pgd, tbl1, tbl2, virt, tmp1, tmp2
> > lsr \tmp1, \virt, #PGDIR_SHIFT
> > and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
> > - orr \tmp2, \tbl, #3 // PGD entry table type
> > + orr \tmp2, \tbl1, #3 // PGD entry table type
> > str \tmp2, [\pgd, \tmp1, lsl #3]
> > +#ifdef CONFIG_ARM64_4_LEVELS
> > + ldr \tbl2, =FIXADDR_TOP
> > + cmp \tbl2, \virt
>
> Do we need this extra logic? See my other comment below where the fixed mapping is placed down.
>
> > + add \tbl2, \tbl1, #PAGE_SIZE
> > + b.ne 1f
> > + add \tbl2, \tbl2, #PAGE_SIZE
> > +1:
> > + lsr \tmp1, \virt, #PUD_SHIFT
> > + and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
> > + orr \tmp2, \tbl2, #3 // PUD entry table type
> > + str \tmp2, [\tbl1, \tmp1, lsl #3]
> > + mov \tbl1, \tbl2
> > +#endif
>
> It may be easier to read to have a create_pud_entry macro too?
>
> > .endm
> >
> > /*
> > @@ -444,7 +457,7 @@ __create_page_tables:
> > add x0, x25, #PAGE_SIZE // section table address
> > ldr x3, =KERNEL_START
> > add x3, x3, x28 // __pa(KERNEL_START)
> > - create_pgd_entry x25, x0, x3, x5, x6
> > + create_pgd_entry x25, x0, x1, x3, x5, x6
> > ldr x6, =KERNEL_END
> > mov x5, x3 // __pa(KERNEL_START)
> > add x6, x6, x28 // __pa(KERNEL_END)
> > @@ -455,7 +468,7 @@ __create_page_tables:
> > */
> > add x0, x26, #PAGE_SIZE // section table address
> > mov x5, #PAGE_OFFSET
> > - create_pgd_entry x26, x0, x5, x3, x6
> > + create_pgd_entry x26, x0, x1, x5, x3, x6
> > ldr x6, =KERNEL_END
> > mov x3, x24 // phys offset
> > create_block_map x0, x7, x3, x5, x6
> > @@ -480,8 +493,11 @@ __create_page_tables:
> > * Create the pgd entry for the fixed mappings.
> > */
> > ldr x5, =FIXADDR_TOP // Fixed mapping virtual address
> > - add x0, x26, #2 * PAGE_SIZE // section table address
> > - create_pgd_entry x26, x0, x5, x6, x7
> > + add x0, x26, #PAGE_SIZE
> > +#ifndef CONFIG_ARM64_4_LEVELS
> > + add x0, x0, #PAGE_SIZE
> > +#endif
>
> This is overly complicated. For <4 levels we set x0 to be:
> ttbr1 + 2*PAGE_SIZE. For 4-levels, we set x0 to be ttbr1 + PAGE_SIZE, then inside the create_pgd_entry
> macro, we check the VA for FIXADDR_TOP then add another PAGE_SIZE. This is presumably done so the same
> PUD is used for the swapper block map and the FIXADDR map.
Is it too complicated to understand the logic?
1) For <4 levels:
PAGE_SIZE is added for FIXADDR map and x0 is passed to create pgd entry.
2) For =4 levels:
PAGE_SIZE is added in the create_pgd_entry macro since FIXADDR map info
is needed to create pud entry.
However, I agree that the code should be revised if other people feel
like it is a labyrinthine logic.
> If you assume that the PUD always follows the PGD for 4-levels, then you can remove this #ifdef and
> the conditional VA logic in set_pgd_entry. To make the logic simpler for <4 levels, you could call
> create_pud_entry in the middle of create_pgd_entry, then put down the actual pgd after.
create_pud_entry should distinguish block map from FIXADDR map although
PUD always follows the PGD in case of 4 levels. If we would like to avoid
unnecessary #ifdef, the conditional logic should be introduced in create_
pgd_entry macro.
I cannot find the "best" way even though I've tried to figure it out.
I would like to find out the most clear and self-descriptive expression.
Could you give an idea on how to remove both conditional VA logic and #ifdef?
> > + create_pgd_entry x26, x0, x1, x5, x6, x7
> >
>
> So before this patch we have the following created by
> __create_page_tables:
>
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | FIXADDR (pmd or pte) |
> +------------------------+
> | block map (pmd or pte) |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | block map for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
>
>
> After the patch, for 4 levels activated we have:
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | FIXADDR (ptes) |
> +------------------------+
> | block map (ptes) |
> +------------------------+
> | PUDs for swapper |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | block map for idmap |
> +------------------------+
> | PUDs for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
>
> and without 4 levels activated we have:
> +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> | ZERO BYTES |
> +------------------------+
> | FIXADDR (pmd or pte) |
> +------------------------+
> | block map (pmd or pte) |
> +------------------------+
> | PGDs for swapper |
> +========================+ <--- TTBR1 swapper_pg_dir
> | ZERO BYTES |
> +------------------------+
> | block map for idmap |
> +------------------------+
> | PGDs for idmap |
> +------------------------+ <--- TTBR0 idmap_pg_dir
This is definitely helpful to understand head.S.
It would be good to add these figures to Documentation or comments.
Best Regards
Jungseok Lee
On Sun, Apr 27, 2014 at 12:37:35PM +0900, Jungseok Lee wrote:
> On Thursday, April 24, 2014 1:02 AM, Steve Capper wrote:
> > On Fri, Apr 18, 2014 at 04:59:20PM +0900, Jungseok Lee wrote:
[ ... ]
> >
> > This is overly complicated. For <4 levels we set x0 to be:
> > ttbr1 + 2*PAGE_SIZE. For 4-levels, we set x0 to be ttbr1 + PAGE_SIZE, then inside the create_pgd_entry
> > macro, we check the VA for FIXADDR_TOP then add another PAGE_SIZE. This is presumably done so the same
> > PUD is used for the swapper block map and the FIXADDR map.
>
> Is it too complicated to understand the logic?
>
> 1) For <4 levels:
> PAGE_SIZE is added for FIXADDR map and x0 is passed to create pgd entry.
> 2) For =4 levels:
> PAGE_SIZE is added in the create_pgd_entry macro since FIXADDR map info
> is needed to create pud entry.
>
> However, I agree that the code should be revised if other people feel
> like it is a labyrinthine logic.
>
> > If you assume that the PUD always follows the PGD for 4-levels, then you can remove this #ifdef and
> > the conditional VA logic in set_pgd_entry. To make the logic simpler for <4 levels, you could call
> > create_pud_entry in the middle of create_pgd_entry, then put down the actual pgd after.
>
> create_pud_entry should distinguish block map from FIXADDR map although
> PUD always follows the PGD in case of 4 levels. If we would like to avoid
> unnecessary #ifdef, the conditional logic should be introduced in create_
> pgd_entry macro.
>
> I cannot find the "best" way even though I've tried to figure it out.
> I would like to find out the most clear and self-descriptive expression.
>
> Could you give an idea on how to remove both conditional VA logic and #ifdef?
Hello Jungseok,
I had the following logic in my head:
It compiles and runs on the model, but I've not tried it in anger.
=========================================================================
.macro create_pud_entry, pgd, tbl, virt, pud, tmp1, tmp2
#ifdef CONFIG_ARM64_4_LEVELS
add \tbl, \tbl, #PAGE_SIZE // bump tbl 1 page up.
// to make room for pud
add \pud, \pgd, #PAGE_SIZE // pgd points to pud which
// follows pgd
lsr \tmp1, \virt, #PUD_SHIFT
and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
orr \tmp2, \tbl, #3 // PUD entry table type
str \tmp2, [\pud, \tmp1, lsl #3]
#else
mov \pud, \tbl // pgd points to section table
// directly for < 4 levels
#endif
.endm
/*
* Macro to populate the PGD for the corresponding block entry in the next
* level (tbl) for the given virtual address.
*
* Preserves: pgd, virt
* Corrupts: tmp1, tmp2, tmp3
* Returns: tbl -> page where block mappings can be placed
* (changed to make room for pud with 4levels, preserved otherwise)
*/
.macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2, tmp3
create_pud_entry \pgd, \tbl, \virt, \tmp3, \tmp1, \tmp2
lsr \tmp1, \virt, #PGDIR_SHIFT
and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
orr \tmp2, \tmp3, #3 // PGD entry table type
str \tmp2, [\pgd, \tmp1, lsl #3]
.endm
=========================================================================
[Note I've changed the extra argument to create_pgd_entry to be at the end
to make it easier to diff callers]
So essentially, we bump up tbl if we are running with 4 levels of page
table. We put the pgd down after the pud, and this allows us to sneak a
pud in after the pgd if we need to, otherwise it points to the target
for the section mapping.
Does this work for you? (I go cross-eyed with too much assembler, so
could have easily missed something).
>
> > > + create_pgd_entry x26, x0, x1, x5, x6, x7
> > >
> >
> > So before this patch we have the following created by
> > __create_page_tables:
> >
> > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > | FIXADDR (pmd or pte) |
> > +------------------------+
> > | block map (pmd or pte) |
> > +------------------------+
> > | PGDs for swapper |
> > +========================+ <--- TTBR1 swapper_pg_dir
> > | block map for idmap |
> > +------------------------+
> > | PGDs for idmap |
> > +------------------------+ <--- TTBR0 idmap_pg_dir
> >
> >
> > After the patch, for 4 levels activated we have:
> > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > | FIXADDR (ptes) |
> > +------------------------+
> > | block map (ptes) |
> > +------------------------+
> > | PUDs for swapper |
> > +------------------------+
> > | PGDs for swapper |
> > +========================+ <--- TTBR1 swapper_pg_dir
> > | block map for idmap |
> > +------------------------+
> > | PUDs for idmap |
> > +------------------------+
> > | PGDs for idmap |
> > +------------------------+ <--- TTBR0 idmap_pg_dir
> >
> > and without 4 levels activated we have:
> > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > | ZERO BYTES |
> > +------------------------+
> > | FIXADDR (pmd or pte) |
> > +------------------------+
> > | block map (pmd or pte) |
> > +------------------------+
> > | PGDs for swapper |
> > +========================+ <--- TTBR1 swapper_pg_dir
> > | ZERO BYTES |
> > +------------------------+
> > | block map for idmap |
> > +------------------------+
> > | PGDs for idmap |
> > +------------------------+ <--- TTBR0 idmap_pg_dir
>
> This is definitely helpful to understand head.S.
> It would be good to add these figures to Documentation or comments.
Please feel free to grab it if you want... :-).
Otherwise, I can put a patch in for this, as I am working on some logic
to remap the physical memory a PUD blocks for 4K granule.
Cheers,
--
Steve
On Monday, April 28, 2014 10:24 PM, Steve Capper wrote:
> On Sun, Apr 27, 2014 at 12:37:35PM +0900, Jungseok Lee wrote:
> > On Thursday, April 24, 2014 1:02 AM, Steve Capper wrote:
> > > On Fri, Apr 18, 2014 at 04:59:20PM +0900, Jungseok Lee wrote:
>
> [ ... ]
>
> > >
> > > This is overly complicated. For <4 levels we set x0 to be:
> > > ttbr1 + 2*PAGE_SIZE. For 4-levels, we set x0 to be ttbr1 +
> > > PAGE_SIZE, then inside the create_pgd_entry macro, we check the VA
> > > for FIXADDR_TOP then add another PAGE_SIZE. This is presumably done so the same PUD is used for
> the swapper block map and the FIXADDR map.
> >
> > Is it too complicated to understand the logic?
> >
> > 1) For <4 levels:
> > PAGE_SIZE is added for FIXADDR map and x0 is passed to create pgd entry.
> > 2) For =4 levels:
> > PAGE_SIZE is added in the create_pgd_entry macro since FIXADDR map
> > info is needed to create pud entry.
> >
> > However, I agree that the code should be revised if other people feel
> > like it is a labyrinthine logic.
> >
> > > If you assume that the PUD always follows the PGD for 4-levels, then
> > > you can remove this #ifdef and the conditional VA logic in
> > > set_pgd_entry. To make the logic simpler for <4 levels, you could call create_pud_entry in the
> middle of create_pgd_entry, then put down the actual pgd after.
> >
> > create_pud_entry should distinguish block map from FIXADDR map
> > although PUD always follows the PGD in case of 4 levels. If we would
> > like to avoid unnecessary #ifdef, the conditional logic should be
> > introduced in create_ pgd_entry macro.
> >
> > I cannot find the "best" way even though I've tried to figure it out.
> > I would like to find out the most clear and self-descriptive expression.
> >
> > Could you give an idea on how to remove both conditional VA logic and #ifdef?
>
>
> Hello Jungseok,
> I had the following logic in my head:
> It compiles and runs on the model, but I've not tried it in anger.
Hello Steve,
It works well as both host and guest on the model.
> =========================================================================
>
> .macro create_pud_entry, pgd, tbl, virt, pud, tmp1, tmp2 #ifdef CONFIG_ARM64_4_LEVELS
> add \tbl, \tbl, #PAGE_SIZE // bump tbl 1 page up.
> // to make room for pud
> add \pud, \pgd, #PAGE_SIZE // pgd points to pud which
> // follows pgd
> lsr \tmp1, \virt, #PUD_SHIFT
> and \tmp1, \tmp1, #PTRS_PER_PUD - 1 // PUD index
> orr \tmp2, \tbl, #3 // PUD entry table type
> str \tmp2, [\pud, \tmp1, lsl #3]
> #else
> mov \pud, \tbl // pgd points to section table
> // directly for < 4 levels
> #endif
> .endm
>
> /*
> * Macro to populate the PGD for the corresponding block entry in the next
> * level (tbl) for the given virtual address.
> *
> * Preserves: pgd, virt
> * Corrupts: tmp1, tmp2, tmp3
> * Returns: tbl -> page where block mappings can be placed
> * (changed to make room for pud with 4levels, preserved otherwise)
> */
> .macro create_pgd_entry, pgd, tbl, virt, tmp1, tmp2, tmp3
> create_pud_entry \pgd, \tbl, \virt, \tmp3, \tmp1, \tmp2
> lsr \tmp1, \virt, #PGDIR_SHIFT
> and \tmp1, \tmp1, #PTRS_PER_PGD - 1 // PGD index
> orr \tmp2, \tmp3, #3 // PGD entry table type
> str \tmp2, [\pgd, \tmp1, lsl #3]
> .endm
>
> =========================================================================
>
> [Note I've changed the extra argument to create_pgd_entry to be at the end to make it easier to diff
> callers]
>
> So essentially, we bump up tbl if we are running with 4 levels of page table. We put the pgd down
> after the pud, and this allows us to sneak a pud in after the pgd if we need to, otherwise it points
> to the target for the section mapping.
>
> Does this work for you? (I go cross-eyed with too much assembler, so could have easily missed
> something).
It is a better description compared to my logic.
I fully understand your intention now.
It would be good to adopt your code instead of mine.
How about participating as an author of this part if you are okay?
I am posting v4 patches as soon as possible and then
figuring out a way to take you as an author of this head.S.
> >
> > > > + create_pgd_entry x26, x0, x1, x5, x6, x7
> > > >
> > >
> > > So before this patch we have the following created by
> > > __create_page_tables:
> > >
> > > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > > | FIXADDR (pmd or pte) |
> > > +------------------------+
> > > | block map (pmd or pte) |
> > > +------------------------+
> > > | PGDs for swapper |
> > > +========================+ <--- TTBR1 swapper_pg_dir
> > > | block map for idmap |
> > > +------------------------+
> > > | PGDs for idmap |
> > > +------------------------+ <--- TTBR0 idmap_pg_dir
> > >
> > >
> > > After the patch, for 4 levels activated we have:
> > > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > > | FIXADDR (ptes) |
> > > +------------------------+
> > > | block map (ptes) |
> > > +------------------------+
> > > | PUDs for swapper |
> > > +------------------------+
> > > | PGDs for swapper |
> > > +========================+ <--- TTBR1 swapper_pg_dir
> > > | block map for idmap |
> > > +------------------------+
> > > | PUDs for idmap |
> > > +------------------------+
> > > | PGDs for idmap |
> > > +------------------------+ <--- TTBR0 idmap_pg_dir
> > >
> > > and without 4 levels activated we have:
> > > +========================+ <--- TEXT_OFFSET + PHYS_OFFSET
> > > | ZERO BYTES |
> > > +------------------------+
> > > | FIXADDR (pmd or pte) |
> > > +------------------------+
> > > | block map (pmd or pte) |
> > > +------------------------+
> > > | PGDs for swapper |
> > > +========================+ <--- TTBR1 swapper_pg_dir
> > > | ZERO BYTES |
> > > +------------------------+
> > > | block map for idmap |
> > > +------------------------+
> > > | PGDs for idmap |
> > > +------------------------+ <--- TTBR0 idmap_pg_dir
> >
> > This is definitely helpful to understand head.S.
> > It would be good to add these figures to Documentation or comments.
>
> Please feel free to grab it if you want... :-).
> Otherwise, I can put a patch in for this, as I am working on some logic to remap the physical memory a
> PUD blocks for 4K granule.
It is better to write a patch by you, not me.
Then, I will review it.
Best Regards
Jungseok Lee