2017-03-27 16:30:22

by Kirill A. Shutemov

Subject: [PATCH 0/8] x86: 5-level paging enabling for v4.12, Part 3

Here's the third batch of patches in the 5-level paging patchset.

This time we prepare the code to handle the non-folded version of the
additional page table level.

Kirill A. Shutemov (8):
x86/boot: Detect 5-level paging support
x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
x86/mm: Define virtual memory map for 5-level paging
x86/paravirt: Make paravirt code support 5-level paging
x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL
x86/dump_pagetables: Add support 5-level paging
x86/kasan: Extend to support 5-level paging
x86/espfix: Add support 5-level paging

Documentation/x86/x86_64/mm.txt | 33 +++++++++++++++++++--
arch/x86/Kconfig | 1 +
arch/x86/boot/cpucheck.c | 9 ++++++
arch/x86/boot/cpuflags.c | 12 ++++++--
arch/x86/entry/entry_64.S | 7 ++---
arch/x86/include/asm/disabled-features.h | 8 +++++-
arch/x86/include/asm/kasan.h | 9 ++++--
arch/x86/include/asm/page_64_types.h | 10 +++++++
arch/x86/include/asm/paravirt.h | 37 +++++++++++++++++-------
arch/x86/include/asm/paravirt_types.h | 7 ++++-
arch/x86/include/asm/pgalloc.h | 2 ++
arch/x86/include/asm/pgtable_64.h | 11 +++++++
arch/x86/include/asm/pgtable_64_types.h | 26 +++++++++++++++++
arch/x86/include/asm/pgtable_types.h | 10 ++++++-
arch/x86/include/asm/required-features.h | 8 +++++-
arch/x86/include/asm/sparsemem.h | 9 ++++--
arch/x86/kernel/espfix_64.c | 12 ++++----
arch/x86/kernel/paravirt.c | 9 ++++--
arch/x86/mm/dump_pagetables.c | 49 +++++++++++++++++++++++++++-----
arch/x86/mm/kasan_init_64.c | 18 ++++++++++--
arch/x86/mm/pgtable.c | 34 +++++++++++++++++++++-
21 files changed, 274 insertions(+), 47 deletions(-)

--
2.11.0


2017-03-27 16:30:56

by Kirill A. Shutemov

Subject: [PATCH 1/8] x86/boot: Detect 5-level paging support

5-level paging support is required from the hardware when the kernel is
compiled with CONFIG_X86_5LEVEL=y.

We will implement a boot-time switch between 4- and 5-level paging later.
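
For illustration, here is a minimal user-space sketch of the check being
added (assuming a compiler that provides __get_cpuid_count() in <cpuid.h>;
the boot code itself uses the small inline-assembly cpuid helpers touched
below). LA57 is reported in CPUID leaf 7, subleaf 0, ECX bit 16, which is
why the patch fills capability word 16 from that leaf:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID.(EAX=7, ECX=0):ECX[16] == LA57 (5-level paging) */
		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
			puts("CPUID leaf 7 not supported");
			return 1;
		}

		printf("LA57: %s\n", (ecx & (1u << 16)) ? "yes" : "no");
		return 0;
	}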

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/boot/cpucheck.c | 9 +++++++++
arch/x86/boot/cpuflags.c | 12 ++++++++++--
arch/x86/include/asm/disabled-features.h | 8 +++++++-
arch/x86/include/asm/required-features.h | 8 +++++++-
4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
+ 0, /* REQUIRED_MASK8 not implemented in this file */
+ 0, /* REQUIRED_MASK9 not implemented in this file */
+ 0, /* REQUIRED_MASK10 not implemented in this file */
+ 0, /* REQUIRED_MASK11 not implemented in this file */
+ 0, /* REQUIRED_MASK12 not implemented in this file */
+ 0, /* REQUIRED_MASK13 not implemented in this file */
+ 0, /* REQUIRED_MASK14 not implemented in this file */
+ 0, /* REQUIRED_MASK15 not implemented in this file */
+ REQUIRED_MASK16,
};

#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif

-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+ u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
- : "a" (id)
+ : "a" (id), "c" (count)
);
}

+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
void get_cpuflags(void)
{
u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
cpu.model += ((tfms >> 16) & 0xf) << 4;
}

+ if (max_intel_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &ignored, &ignored,
+ &cpu.flags[16], &ignored);
+ }
+
cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
&ignored);

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..fc0960236fc3 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */

+#ifdef CONFIG_X86_5LEVEL
+#define DISABLE_LA57 0
+#else
+#define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -55,7 +61,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fac9a5c0abe9..d91ba04dd007 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,6 +53,12 @@
# define NEED_MOVBE 0
#endif

+#ifdef CONFIG_X86_5LEVEL
+# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#else
+# define NEED_LA57 0
+#endif
+
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -98,7 +104,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 0
+#define REQUIRED_MASK16 (NEED_LA57)
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

--
2.11.0

2017-03-27 16:31:15

by Kirill A. Shutemov

Subject: [PATCH 8/8] x86/espfix: Add support 5-level paging

We don't need extra virtual address space for ESPFIX, so it stays within
one PUD page table for both 4- and 5-level paging.
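
To see why the address-space budget does not change, here is a quick sketch
of the macro arithmetic (constants taken from the existing 4-level code and
from the new defines in patch 5; illustrative only):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define P4D_SHIFT	39	/* what PGDIR_SHIFT is in 4-level mode */
	#define ESPFIX_PAGE_SPACE	(1UL << (P4D_SHIFT - PAGE_SHIFT - 16))

	int main(void)
	{
		/* 1UL << (39 - 12 - 16) == 2048 pages, same value as before */
		printf("%lu espfix pages\n", ESPFIX_PAGE_SPACE);
		return 0;
	}

With 5-level paging PGDIR_SHIFT grows to 48, so the old PGDIR_SHIFT-based
formula would overstate the space a single PUD page table provides; using
P4D_SHIFT keeps the computation, and the CONFIG_NR_CPUS build check, the
same in both paging modes.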

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/kernel/espfix_64.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..8e598a1ad986 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -50,11 +50,11 @@
#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)

/* There is address space for how many espfix pages? */
-#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))

#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-# error "Need more than one PGD for the ESPFIX hack"
+# error "Need more virtual address space for the ESPFIX hack"
#endif

#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@@ -121,11 +121,13 @@ static void init_espfix_random(void)

void __init init_espfix_bsp(void)
{
- pgd_t *pgd_p;
+ pgd_t *pgd;
+ p4d_t *p4d;

/* Install the espfix pud into the kernel page directory */
- pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
- pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);

/* Randomize the locations */
init_espfix_random();
--
2.11.0

2017-03-27 16:30:40

by Kirill A. Shutemov

Subject: [PATCH 4/8] x86/paravirt: Make paravirt code support 5-level paging

Add operations to allocate/release p4ds.

Xen requires more work. We will need to come back to it.
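
As a side note, the new hooks follow the usual paravirt pattern: entries in
the ops table default to a no-op, and a hypervisor backend may override them
later. A stand-alone sketch of that pattern (names are illustrative, not the
real pv_mmu_ops API):

	#include <stdio.h>

	struct mmu_ops {
		void (*alloc_p4d)(unsigned long pfn);
		void (*release_p4d)(unsigned long pfn);
	};

	static void nop(unsigned long pfn) { /* native case: nothing to do */ }

	static struct mmu_ops ops = {
		.alloc_p4d	= nop,
		.release_p4d	= nop,
	};

	static void hv_alloc_p4d(unsigned long pfn)
	{
		printf("tell the hypervisor about a new p4d page, pfn %lu\n", pfn);
	}

	int main(void)
	{
		ops.alloc_p4d(42);		/* no-op on native */
		ops.alloc_p4d = hv_alloc_p4d;	/* a backend (e.g. Xen) overrides it */
		ops.alloc_p4d(42);
		return 0;
	}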

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/paravirt.h | 37 ++++++++++++++++++++++++-----------
arch/x86/include/asm/paravirt_types.h | 7 ++++++-
arch/x86/include/asm/pgalloc.h | 2 ++
arch/x86/kernel/paravirt.c | 9 +++++++--
4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 158d877ce9e9..55fa56fe4e45 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}

+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@@ -582,25 +592,25 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
val);
}

-static inline void p4d_clear(p4d_t *p4dp)
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
{
- set_p4d(p4dp, __p4d(0));
-}
+ p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);

-#if CONFIG_PGTABLE_LEVELS >= 5
+ return (p4d_t) { ret };
+}

-#error FIXME
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+ return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
+}

static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
{
pgdval_t val = native_pgd_val(pgd);

- if (sizeof(pgdval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
- val, (u64)val >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
- val);
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
}

static inline void pgd_clear(pgd_t *pgdp)
@@ -610,6 +620,11 @@ static inline void pgd_clear(pgd_t *pgdp)

#endif /* CONFIG_PGTABLE_LEVELS == 5 */

+static inline void p4d_clear(p4d_t *p4dp)
+{
+ set_p4d(p4dp, __p4d(0));
+}
+
#endif /* CONFIG_PGTABLE_LEVELS == 4 */

#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 93c49cf09b63..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
void (*release_pud)(unsigned long pfn);
+ void (*release_p4d)(unsigned long pfn);

/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -286,7 +288,10 @@ struct pv_mmu_ops {
void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);

#if CONFIG_PGTABLE_LEVELS >= 5
-#error FIXME
+ struct paravirt_callee_save p4d_val;
+ struct paravirt_callee_save make_p4d;
+
+ void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */

#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 2f585054c63c..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif

/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 110daf22f5c7..3586996fc50d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pud = paravirt_nop,
+ .alloc_p4d = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
.release_pud = paravirt_nop,
+ .release_p4d = paravirt_nop,

.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
@@ -437,8 +439,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.set_p4d = native_set_p4d,

#if CONFIG_PGTABLE_LEVELS >= 5
-#error FIXME
-#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+ .p4d_val = PTE_IDENT,
+ .make_p4d = PTE_IDENT,
+
+ .set_pgd = native_set_pgd,
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */

--
2.11.0

2017-03-27 16:31:05

by Kirill A. Shutemov

Subject: [PATCH 3/8] x86/mm: Define virtual memory map for 5-level paging

The first part of the memory map (up to the %esp fixup stacks) simply scales
the existing 4-level paging map up by 9 bits -- the number of bits addressed
by the additional page table level.

The rest of the map is unchanged.
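
A small sketch of where the new boundaries come from (illustration only, not
kernel code): the additional level has 512 entries, so every region in the
first part of the map grows by 2^9, and the user/kernel split follows from
sign-extending bit __VIRTUAL_MASK_SHIFT (47 for 4-level, 56 for 5-level):

	#include <stdio.h>

	static void show(int va_shift)
	{
		unsigned long user_top     = (1UL << va_shift) - 1;
		unsigned long kernel_start = ~0UL << va_shift;

		printf("%d-bit VA: user ends at %016lx, kernel starts at %016lx\n",
		       va_shift + 1, user_top, kernel_start);
	}

	int main(void)
	{
		show(47);	/* 00007fffffffffff / ffff800000000000 */
		show(56);	/* 00ffffffffffffff / ff00000000000000 */
		return 0;
	}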

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
Documentation/x86/x86_64/mm.txt | 33 ++++++++++++++++++++++++++++++---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/kasan.h | 9 ++++++---
arch/x86/include/asm/page_64_types.h | 10 ++++++++++
arch/x86/include/asm/pgtable_64_types.h | 6 ++++++
arch/x86/include/asm/sparsemem.h | 9 +++++++--
6 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ee3f9c30957c..0774d2cb3151 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
Virtual memory map with 4 level page tables:

0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@@ -23,12 +23,39 @@ ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

+Virtual memory map with 5 level page tables:
+
+0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+fffe000000000000 - fffe007fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
+ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
+... unused hole ...
+ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
+ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).

-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
reference.

Current X86-64 implementations support up to 46 bits of address space (64 TB),
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ff5c43af7b4e..6a8535a893e2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -291,6 +291,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
+ default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000

config HAVE_INTEL_TXT
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
* 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
*/
#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
- (0xffff800000000000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))

#ifndef __ASSEMBLY__

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
+#ifdef CONFIG_X86_5LEVEL
+#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
+#else
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
+#endif
+
#ifdef CONFIG_RANDOMIZE_MEMORY
#define __PAGE_OFFSET page_offset_base
#else
@@ -46,8 +51,13 @@
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#ifdef CONFIG_X86_5LEVEL
+#define __PHYSICAL_MASK_SHIFT 52
+#define __VIRTUAL_MASK_SHIFT 56
+#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
+#endif

/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 516593e66bd6..4edc97917382 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -56,9 +56,15 @@ typedef struct { pteval_t pte; } pte_t;

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#ifdef CONFIG_X86_5LEVEL
+#define VMALLOC_SIZE_TB _AC(16384, UL)
+#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+#else
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4517d6b93188..1f5bee2c202f 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,8 +26,13 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
+# ifdef CONFIG_X86_5LEVEL
+# define MAX_PHYSADDR_BITS 52
+# define MAX_PHYSMEM_BITS 52
+# else
+# define MAX_PHYSADDR_BITS 44
+# define MAX_PHYSMEM_BITS 46
+# endif
#endif

#endif /* CONFIG_SPARSEMEM */
--
2.11.0

2017-03-27 16:30:31

by Kirill A. Shutemov

Subject: [PATCH 7/8] x86/kasan: Extend to support 5-level paging

This patch brings support for the non-folded additional page table level.
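
For context, a sketch of the shadow-layout arithmetic the KASAN defines rely
on (the offsets are the KASAN_SHADOW_OFFSET values from patch 3; every shadow
byte covers 8 bytes of kernel address space, shadow(addr) = (addr >> 3) +
offset):

	#include <stdio.h>

	static void show(unsigned long offset, int va_shift)
	{
		unsigned long start = offset + ((-1UL << va_shift) >> 3);
		unsigned long end   = start + (1UL << (va_shift - 3));

		printf("%d-bit VA: shadow %016lx - %016lx\n", va_shift, start, end);
	}

	int main(void)
	{
		show(0xdffffc0000000000UL, 47);	/* 4-level: ffffec... - fffffc... */
		show(0xdff8000000000000UL, 56);	/* 5-level: ffd8...   - fff8...   */
		return 0;
	}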

Signed-off-by: Kirill A. Shutemov <[email protected]>
Cc: Dmitry Vyukov <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
---
arch/x86/mm/kasan_init_64.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 864950994371..0c7d8129bed6 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -56,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
unsigned long end = KASAN_SHADOW_END;

for (i = pgd_index(start); start < end; i++) {
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
- | _KERNPG_TABLE);
+ switch (CONFIG_PGTABLE_LEVELS) {
+ case 4:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
+ _KERNPG_TABLE);
+ break;
+ case 5:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
+ _KERNPG_TABLE);
+ break;
+ default:
+ BUILD_BUG();
+ }
start += PGDIR_SIZE;
}
}
@@ -85,6 +95,7 @@ void __init kasan_early_init(void)
pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+ p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;

for (i = 0; i < PTRS_PER_PTE; i++)
kasan_zero_pte[i] = __pte(pte_val);
@@ -95,6 +106,9 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);

+ for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ kasan_zero_p4d[i] = __p4d(p4d_val);
+
kasan_map_early_shadow(early_level4_pgt);
kasan_map_early_shadow(init_level4_pgt);
}
--
2.11.0

2017-03-27 16:30:47

by Kirill A. Shutemov

Subject: [PATCH 2/8] x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert

We don't need the assert anymore: commit 17be0aec74fb ("x86/asm/entry/64:
Implement better check for canonical addresses") made the canonical address
check generic with respect to the address width.
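
For reference, a C sketch of what the SHL/SAR pair in the entry code does
(illustration only; it relies on arithmetic right shift of signed values,
which holds for GCC on x86-64): shifting left and then right by
64 - (__VIRTUAL_MASK_SHIFT + 1) replaces the upper bits with copies of the
top implemented bit, and an address is canonical exactly when that
round-trip leaves it unchanged -- for any address width.

	#include <stdio.h>

	static int is_canonical(unsigned long addr, int va_shift)
	{
		int s = 64 - (va_shift + 1);

		return (unsigned long)((long)(addr << s) >> s) == addr;
	}

	int main(void)
	{
		printf("%d\n", is_canonical(0x00007fffffffffffUL, 47));	/* 1 */
		printf("%d\n", is_canonical(0x0000800000000000UL, 47));	/* 0 */
		printf("%d\n", is_canonical(0xffff800000000000UL, 47));	/* 1 */
		return 0;
	}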

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/entry/entry_64.S | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..607d72c4a485 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
*
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
+ *
+ * Change top 16 bits to be the sign-extension of 47th bit
*/
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- SYSRET checks need update"
- .endif
-
- /* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

--
2.11.0

2017-03-27 16:31:50

by Kirill A. Shutemov

Subject: [PATCH 5/8] x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL

Extends the pagetable headers to support the new paging mode.
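
With the new defines a 57-bit virtual address is simply split one level
deeper; a small sketch of the decomposition (shift values as introduced in
this patch plus the unchanged lower levels, illustration only, not the
kernel's pgd_index() and friends):

	#include <stdio.h>

	/* 512 entries per table -> 9 index bits per level */
	#define IDX(addr, shift)	(((addr) >> (shift)) & 511)

	int main(void)
	{
		unsigned long addr = 0xff10000012345678UL;

		printf("pgd %lu p4d %lu pud %lu pmd %lu pte %lu offset %lu\n",
		       IDX(addr, 48), IDX(addr, 39), IDX(addr, 30),
		       IDX(addr, 21), IDX(addr, 12), addr & 4095UL);
		return 0;
	}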

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/pgtable_64.h | 11 +++++++++++
arch/x86/include/asm/pgtable_64_types.h | 20 +++++++++++++++++++
arch/x86/include/asm/pgtable_types.h | 10 +++++++++-
arch/x86/mm/pgtable.c | 34 ++++++++++++++++++++++++++++++++-
4 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 0593a1ae7573..12ea31274eb6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,6 +35,13 @@ extern void paging_init(void);
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e) \
+ pr_err("%s:%d: bad p4d %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
@@ -128,7 +135,11 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)

static inline void native_p4d_clear(p4d_t *p4d)
{
+#ifdef CONFIG_X86_5LEVEL
+ native_set_p4d(p4d, native_make_p4d(0));
+#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 4edc97917382..d3df6dc05d19 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -23,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;

#define SHARED_KERNEL_PMD 0

+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 48
+#define PTRS_PER_PGD 512
+
+/*
+ * 4rd level page in 5-level paging case
+ */
+#define P4D_SHIFT 39
+#define PTRS_PER_P4D 512
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#else /* CONFIG_X86_5LEVEL */
+
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512

+#endif /* CONFIG_X86_5LEVEL */
+
/*
* 3rd level page
*/
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4930afe9df0a..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,9 +273,17 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
}

#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;

-#error FIXME
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { val };
+}

+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
#else
#include <asm-generic/pgtable-nop4d.h>

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 38b6daf72deb..d26b066944a5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+ paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- CONFIG_PGTABLE_LEVELS == 4) {
+ CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -582,6 +590,30 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+ return 0;
+}
+#endif
+
/**
* pud_set_huge - setup kernel PUD mapping
*
--
2.11.0

2017-03-27 16:31:39

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCH 6/8] x86/dump_pagetables: Add support 5-level paging

Simple extension to support one more page table level.
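
The interesting part is how the extra level disappears again on 4-level
kernels: when PTRS_PER_P4D == 1 the p4d walker is just a macro that forwards
to the pud walker, so the core loop never needs to know whether the level is
folded. A generic stand-alone sketch of that trick (names illustrative):

	#include <stdio.h>

	#define PTRS_PER_P4D 1	/* pretend the p4d level is folded */

	static void walk_pud_level(unsigned long entry)
	{
		printf("walking puds under entry %#lx\n", entry);
	}

	#if PTRS_PER_P4D > 1
	static void walk_p4d_level(unsigned long entry)
	{
		/* a real loop over PTRS_PER_P4D entries would go here */
	}
	#else
	#define walk_p4d_level(entry)	walk_pud_level(entry)
	#endif

	int main(void)
	{
		walk_p4d_level(0x1000UL);	/* folds straight through to the pud walker */
		return 0;
	}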

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/mm/dump_pagetables.c | 49 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..0effac6989cd 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@@ -347,7 +348,7 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}

-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
unsigned long P)
{
int i;
@@ -355,7 +356,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
pgprotval_t prot;
pud_t *prev_pud = NULL;

- start = (pud_t *) pgd_page_vaddr(addr);
+ start = (pud_t *) p4d_page_vaddr(addr);

for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +378,43 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}

#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ unsigned long P)
+{
+ int i;
+ p4d_t *start;
+ pgprotval_t prot;
+
+ start = (p4d_t *) pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ prot = p4d_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pud_level(m, st, *start,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
@@ -424,7 +459,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
- walk_pud_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else
--
2.11.0

2017-03-28 06:06:16

by Ingo Molnar

Subject: Re: [PATCH 1/8] x86/boot: Detect 5-level paging support


* Kirill A. Shutemov <[email protected]> wrote:

> +#ifdef CONFIG_X86_5LEVEL
> +#define DISABLE_LA57 0
> +#else
> +#define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
> +#endif

> +#ifdef CONFIG_X86_5LEVEL
> +# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
> +#else
> +# define NEED_LA57 0
> +#endif

Please use consistent style.

Thanks,

Ingo

2017-03-28 06:12:28

by Ingo Molnar

Subject: Re: [PATCH 5/8] x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL


* Kirill A. Shutemov <[email protected]> wrote:

> +#ifdef CONFIG_X86_5LEVEL
> +
> +/*
> + * PGDIR_SHIFT determines what a top-level page table entry can map
> + */
> +#define PGDIR_SHIFT 48
> +#define PTRS_PER_PGD 512
> +
> +/*
> + * 4rd level page in 5-level paging case

4th.

> + */
> +#define P4D_SHIFT 39
> +#define PTRS_PER_P4D 512
> +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
> +#define P4D_MASK (~(P4D_SIZE - 1))
> +
> +#else /* CONFIG_X86_5LEVEL */

A single space suffices before the comment.

> +
> /*
> * PGDIR_SHIFT determines what a top-level page table entry can map
> */
> #define PGDIR_SHIFT 39
> #define PTRS_PER_PGD 512
>
> +#endif /* CONFIG_X86_5LEVEL */

Ditto.

> +#ifdef CONFIG_X86_5LEVEL
> +/**
> + * p4d_set_huge - setup kernel P4D mapping
> + *
> + * No 512GB pages yet -- always return 0
> + *
> + * Returns 1 on success and 0 on failure.
> + */
> +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
> +{
> + return 0;
> +}

The last comment line can be deleted, I suppose.

Thanks,

Ingo

2017-03-28 06:21:01

by Ingo Molnar

Subject: Re: [PATCH 6/8] x86/dump_pagetables: Add support 5-level paging


* Kirill A. Shutemov <[email protected]> wrote:

> +#if PTRS_PER_P4D > 1
> +
> +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
> + unsigned long P)

Pretty ugly line break. Either don't break the line, or break it in a more logical
place, like:

static void
walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)

> + start = (p4d_t *) pgd_page_vaddr(addr);

The space between the type cast and the function invocation is not needed.

Thanks,

Ingo

2017-03-28 09:40:22

by Kirill A. Shutemov

Subject: Re: [PATCH 6/8] x86/dump_pagetables: Add support 5-level paging

On Tue, Mar 28, 2017 at 08:12:59AM +0200, Ingo Molnar wrote:
>
> * Kirill A. Shutemov <[email protected]> wrote:
>
> > +#if PTRS_PER_P4D > 1
> > +
> > +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
> > + unsigned long P)
>
> Pretty ugly line break. Either don't break the line, or break it in a more logical
> place, like:
>
> static void
> walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
>
> > + start = (p4d_t *) pgd_page_vaddr(addr);
>
> The space between the type cast and the function invocation is not needed.

Both style issues you have pointed to are inherited from handling of other
page table levels.

Do you want me to adjust them too?

This kind of inconsistency bothers me more than the style issues themselves.

--
Kirill A. Shutemov

2017-03-28 09:40:32

by Ingo Molnar

Subject: Re: [PATCH 6/8] x86/dump_pagetables: Add support 5-level paging


* Kirill A. Shutemov <[email protected]> wrote:

> On Tue, Mar 28, 2017 at 08:12:59AM +0200, Ingo Molnar wrote:
> >
> > * Kirill A. Shutemov <[email protected]> wrote:
> >
> > > +#if PTRS_PER_P4D > 1
> > > +
> > > +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
> > > + unsigned long P)
> >
> > Pretty ugly line break. Either don't break the line, or break it in a more logical
> > place, like:
> >
> > static void
> > walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
> >
> > > + start = (p4d_t *) pgd_page_vaddr(addr);
> >
> > The space between the type cast and the function invocation is not needed.
>
> Both style issues you have pointed to are inherited from handling of other
> page table levels.
>
> Do you want me to adjust them too?

Yes, pre-existing uncleanlinesses are not a reason to replicate them going
forward. Feel free to do it in a separate preparatory patch if the noise
is too large.

Thanks,

Ingo

2017-03-28 10:46:38

by Kirill A. Shutemov

Subject: [PATCHv2 1/8] x86/boot: Detect 5-level paging support

5-level paging support is required from the hardware when the kernel is
compiled with CONFIG_X86_5LEVEL=y.

We will implement a boot-time switch between 4- and 5-level paging later.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/boot/cpucheck.c | 9 +++++++++
arch/x86/boot/cpuflags.c | 12 ++++++++++--
arch/x86/include/asm/disabled-features.h | 8 +++++++-
arch/x86/include/asm/required-features.h | 8 +++++++-
4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
+ 0, /* REQUIRED_MASK8 not implemented in this file */
+ 0, /* REQUIRED_MASK9 not implemented in this file */
+ 0, /* REQUIRED_MASK10 not implemented in this file */
+ 0, /* REQUIRED_MASK11 not implemented in this file */
+ 0, /* REQUIRED_MASK12 not implemented in this file */
+ 0, /* REQUIRED_MASK13 not implemented in this file */
+ 0, /* REQUIRED_MASK14 not implemented in this file */
+ 0, /* REQUIRED_MASK15 not implemented in this file */
+ REQUIRED_MASK16,
};

#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif

-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+ u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
- : "a" (id)
+ : "a" (id), "c" (count)
);
}

+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
void get_cpuflags(void)
{
u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
cpu.model += ((tfms >> 16) & 0xf) << 4;
}

+ if (max_intel_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &ignored, &ignored,
+ &cpu.flags[16], &ignored);
+ }
+
cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
&ignored);

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..5dff775af7cd 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */

+#ifdef CONFIG_X86_5LEVEL
+# define DISABLE_LA57 0
+#else
+# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -55,7 +61,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fac9a5c0abe9..d91ba04dd007 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,6 +53,12 @@
# define NEED_MOVBE 0
#endif

+#ifdef CONFIG_X86_5LEVEL
+# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#else
+# define NEED_LA57 0
+#endif
+
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -98,7 +104,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 0
+#define REQUIRED_MASK16 (NEED_LA57)
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

--
2.11.0

2017-03-28 10:46:56

by Kirill A. Shutemov

Subject: [PATCHv2 5/8] x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL

Extends the pagetable headers to support the new paging mode.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/pgtable_64.h | 11 +++++++++++
arch/x86/include/asm/pgtable_64_types.h | 20 ++++++++++++++++++++
arch/x86/include/asm/pgtable_types.h | 10 +++++++++-
arch/x86/mm/pgtable.c | 32 +++++++++++++++++++++++++++++++-
4 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 0593a1ae7573..12ea31274eb6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,6 +35,13 @@ extern void paging_init(void);
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e) \
+ pr_err("%s:%d: bad p4d %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
@@ -128,7 +135,11 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)

static inline void native_p4d_clear(p4d_t *p4d)
{
+#ifdef CONFIG_X86_5LEVEL
+ native_set_p4d(p4d, native_make_p4d(0));
+#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 4edc97917382..adc3e7b107ee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -23,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;

#define SHARED_KERNEL_PMD 0

+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 48
+#define PTRS_PER_PGD 512
+
+/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT 39
+#define PTRS_PER_P4D 512
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#else /* CONFIG_X86_5LEVEL */
+
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512

+#endif /* CONFIG_X86_5LEVEL */
+
/*
* 3rd level page
*/
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4930afe9df0a..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,9 +273,17 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
}

#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;

-#error FIXME
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { val };
+}

+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
#else
#include <asm-generic/pgtable-nop4d.h>

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 38b6daf72deb..508a708eb9a6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+ paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- CONFIG_PGTABLE_LEVELS == 4) {
+ CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -582,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+ return 0;
+}
+#endif
+
/**
* pud_set_huge - setup kernel PUD mapping
*
--
2.11.0

2017-03-28 10:50:11

by Kirill A. Shutemov

Subject: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging

Simple extension to support one more page table level.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/mm/dump_pagetables.c | 59 +++++++++++++++++++++++++++++++++----------
1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..9f305be71a72 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
}
}

-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- unsigned long P)
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
int i;
pte_t *start;
pgprotval_t prot;

- start = (pte_t *) pmd_page_vaddr(addr);
+ start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,

#if PTRS_PER_PMD > 1

-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
pmd_t *start;
pgprotval_t prot;

- start = (pmd_t *) pud_page_vaddr(addr);
+ start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}

-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
pud_t *start;
pgprotval_t prot;
pud_t *prev_pud = NULL;

- start = (pud_t *) pgd_page_vaddr(addr);
+ start = (pud_t *)p4d_page_vaddr(addr);

for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}

#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+{
+ int i;
+ p4d_t *start;
+ pgprotval_t prot;
+
+ start = (p4d_t *)pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ prot = p4d_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pud_level(m, st, *start,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
- walk_pud_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else
--
2.11.0

2017-03-28 18:55:35

by Borislav Petkov

Subject: Re: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging

On Tue, Mar 28, 2017 at 01:48:06PM +0300, Kirill A. Shutemov wrote:
> Simple extension to support one more page table level.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> arch/x86/mm/dump_pagetables.c | 59 +++++++++++++++++++++++++++++++++----------
> 1 file changed, 45 insertions(+), 14 deletions(-)

Hmm, so without this I get the splat below.

Can we do something about this bisection breakage? I mean, this is the
second explosion caused by 5-level paging that I've triggered. Maybe we should
merge the whole thing into a single big patch when everything is applied
and tested, more or less, so that bisection is fine.

Or someone might have a better idea...

[ 2.801262] BUG: unable to handle kernel paging request at ffffc753f000f000
[ 2.803013] IP: ptdump_walk_pgd_level_core+0x236/0x3a0
[ 2.804472] PGD 0
[ 2.804473] P4D 0
[ 2.805231]
[ 2.805231] Oops: 0000 [#1] PREEMPT SMP
[ 2.805231] Modules linked in:
[ 2.805231] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc4+ #1
[ 2.805231] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014
[ 2.805231] task: ffff88007c1c8040 task.stack: ffffc90000008000
[ 2.805231] RIP: 0010:ptdump_walk_pgd_level_core+0x236/0x3a0
[ 2.805231] RSP: 0018:ffffc9000000be48 EFLAGS: 00010256
[ 2.805231] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[ 2.805231] RDX: ffff880000001000 RSI: ffff880000000000 RDI: ffffc9000000bed0
[ 2.805231] RBP: ffffc9000000bef8 R08: 0000000000000000 R09: 000000000000017f
[ 2.805231] R10: 000000000000001f R11: 0000000000000001 R12: ffffc9000000be90
[ 2.805231] R13: 0000000000000000 R14: ffffc753f000f000 R15: 00000000ffffff00
[ 2.805231] FS: 0000000000000000(0000) GS:ffff88007ed00000(0000) knlGS:0000000000000000
[ 2.805231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2.805231] CR2: ffffc753f000f000 CR3: 0000000001c09000 CR4: 00000000000406e0
[ 2.805231] Call Trace:
[ 2.805231] ? 0xffffffff81000000
[ 2.805231] ptdump_walk_pgd_level_checkwx+0x17/0x20
[ 2.805231] mark_rodata_ro+0xec/0x100
[ 2.805231] ? rest_init+0x90/0x90
[ 2.805231] kernel_init+0x2a/0x100
[ 2.805231] ret_from_fork+0x2e/0x40
[ 2.805231] Code: 00 88 ff ff 48 8b 5d 88 4c 8d 34 10 48 ba 00 10 00 00 00 88 ff ff 48 01 d0 48 89 85 70 ff ff ff 48 89 d8 48 c1 f8 10 48 89 45 b0 <49> 8b 06 48 a9 9f ff ff ff 74 7c 48 89 c1 48 be ff 0f 00 00 00
[ 2.805231] RIP: ptdump_walk_pgd_level_core+0x236/0x3a0 RSP: ffffc9000000be48
[ 2.805231] CR2: ffffc753f000f000
[ 2.805231] ---[ end trace 3ec6e2c757df799d ]---
[ 2.805231] Kernel panic - not syncing: Fatal exception
[ 2.805231] Kernel Offset: disabled
[ 2.805231] ---[ end Kernel panic - not syncing: Fatal exception

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2017-03-28 21:15:13

by Kirill A. Shutemov

Subject: Re: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging

On Tue, Mar 28, 2017 at 08:55:22PM +0200, Borislav Petkov wrote:
> On Tue, Mar 28, 2017 at 01:48:06PM +0300, Kirill A. Shutemov wrote:
> > Simple extension to support one more page table level.
> >
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > ---
> > arch/x86/mm/dump_pagetables.c | 59 +++++++++++++++++++++++++++++++++----------
> > 1 file changed, 45 insertions(+), 14 deletions(-)
>
> Hmm, so without this I get the splat below.

On current tip/master?

> Can we do something about this bisection breakage? I mean, this is the
> second explosion caused by 5-level paging that I've triggered. Maybe we should
> merge the whole thing into a single big patch when everything is applied
> and tested, more or less, so that bisection is fine.
>
> Or someone might have a better idea...

I'm not sure that collapsing history into one commit to fix bisectability is
any better than having broken bisectability.

I'll try to look more into this issue tomorrow.

Sorry for this.

--
Kirill A. Shutemov

2017-03-28 21:39:18

by Borislav Petkov

Subject: Re: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging

On Wed, Mar 29, 2017 at 12:15:07AM +0300, Kirill A. Shutemov wrote:
> On current tip/master?

tip/master from Monday:

commit d3b6ed97fbc63219e262faca86da2fe62885eff2 (refs/remotes/tip/master)
Merge: 399c980bd22b f2a6a7050109
Author: Ingo Molnar <[email protected]>
Date: Mon Mar 27 10:48:30 2017 +0200

Merge branch 'x86/mm'

> I'm not sure that collapsing history into one commit to fix bisectability is
> any better than having broken bisectability.

Of course it is better. How do you tell everyone who bisects in the
future to jump over those commits?

So perhaps not a single commit, but at least meld together those commits
which change page table walking, like the current example, and cause breakage.

Thanks.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2017-03-28 22:28:28

by H. Peter Anvin

Subject: Re: [PATCH 3/8] x86/mm: Define virtual memory map for 5-level paging

On 03/27/17 09:29, Kirill A. Shutemov wrote:
> +fffe000000000000 - fffe007fffffffff (=39 bits) %esp fixup stacks

Why move this?

-hpa


2017-03-28 22:47:45

by Kirill A. Shutemov

Subject: Re: [PATCH 3/8] x86/mm: Define virtual memory map for 5-level paging

On Tue, Mar 28, 2017 at 03:21:39PM -0700, H. Peter Anvin wrote:
> On 03/27/17 09:29, Kirill A. Shutemov wrote:
> > +fffe000000000000 - fffe007fffffffff (=39 bits) %esp fixup stacks
>
> Why move this?

You're right. There's no reason to.

It's an accident due to ESPFIX_BASE_ADDR being defined using PGDIR_SHIFT.
We should use P4D_SHIFT instead to produce a consistent result across
paging modes.

I'll update the patch tomorrow. Thanks for noticing this.
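
For the record, a sketch of the arithmetic behind the accidental move (the
slot index -2 is inferred from the documented maps, so treat the exact
constant as illustrative):

	#include <stdio.h>

	int main(void)
	{
		/* base = -2UL << shift, i.e. two slots below the top of the table */
		printf("%016lx\n", -2UL << 48);	/* fffe000000000000: v1, PGDIR_SHIFT = 48 */
		printf("%016lx\n", -2UL << 39);	/* ffffff0000000000: 4-level / v2, P4D_SHIFT = 39 */
		return 0;
	}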

--
Kirill A. Shutemov

2017-03-29 13:20:27

by Kirill A. Shutemov

Subject: [PATCHv2 3/8] x86/mm: Define virtual memory map for 5-level paging

The first part of the memory map (up to the %esp fixup stacks) simply scales
the existing 4-level paging map up by 9 bits -- the number of bits addressed
by the additional page table level.

The rest of the map is unchanged.

Signed-off-by: Kirill A. Shutemov <[email protected]>

---
v2:
- Document the %esp fixup stacks for 5-level paging at the same place as for
4-level mode (the actual change is in v2 of 8/8).
---
Documentation/x86/x86_64/mm.txt | 33 ++++++++++++++++++++++++++++++---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/kasan.h | 9 ++++++---
arch/x86/include/asm/page_64_types.h | 10 ++++++++++
arch/x86/include/asm/pgtable_64_types.h | 6 ++++++
arch/x86/include/asm/sparsemem.h | 9 +++++++--
6 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ee3f9c30957c..b0798e281aa6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
Virtual memory map with 4 level page tables:

0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@@ -23,12 +23,39 @@ ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

+Virtual memory map with 5 level page tables:
+
+0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
+ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
+... unused hole ...
+ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
+ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).

-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
reference.

Current X86-64 implementations support up to 46 bits of address space (64 TB),
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ff5c43af7b4e..6a8535a893e2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -291,6 +291,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
+ default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000

config HAVE_INTEL_TXT
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
* 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
*/
#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
- (0xffff800000000000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))

#ifndef __ASSEMBLY__

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
+#ifdef CONFIG_X86_5LEVEL
+#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
+#else
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
+#endif
+
#ifdef CONFIG_RANDOMIZE_MEMORY
#define __PAGE_OFFSET page_offset_base
#else
@@ -46,8 +51,13 @@
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#ifdef CONFIG_X86_5LEVEL
+#define __PHYSICAL_MASK_SHIFT 52
+#define __VIRTUAL_MASK_SHIFT 56
+#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
+#endif

/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 516593e66bd6..4edc97917382 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -56,9 +56,15 @@ typedef struct { pteval_t pte; } pte_t;

/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#ifdef CONFIG_X86_5LEVEL
+#define VMALLOC_SIZE_TB _AC(16384, UL)
+#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+#else
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4517d6b93188..1f5bee2c202f 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,8 +26,13 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
+# ifdef CONFIG_X86_5LEVEL
+# define MAX_PHYSADDR_BITS 52
+# define MAX_PHYSMEM_BITS 52
+# else
+# define MAX_PHYSADDR_BITS 44
+# define MAX_PHYSMEM_BITS 46
+# endif
#endif

#endif /* CONFIG_SPARSEMEM */
--
2.11.0
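
For anyone double-checking the KASAN hunks above: a minimal userspace sketch,
assuming the KASAN_SHADOW_OFFSET values from the Kconfig hunk and the
__VIRTUAL_MASK_SHIFT values from page_64_types.h, reproduces the shadow
ranges listed in mm.txt for both paging modes.

/* Sketch only: recompute the KASAN shadow window from the constants above
 * (KASAN_SHADOW_OFFSET from Kconfig, __VIRTUAL_MASK_SHIFT from
 * page_64_types.h) and compare with the ranges listed in mm.txt. */
#include <stdio.h>

static void kasan_window(unsigned long long offset, int va_shift)
{
        unsigned long long start = offset + ((~0ULL << va_shift) >> 3);
        unsigned long long end   = start + (1ULL << (va_shift - 3));

        printf("%d-bit VA: %016llx - %016llx\n", va_shift, start, end - 1);
}

int main(void)
{
        kasan_window(0xdffffc0000000000ULL, 47);  /* 4-level */
        kasan_window(0xdff8000000000000ULL, 56);  /* 5-level: ffd8... - fff7... */
        return 0;
}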

2017-03-29 13:22:54

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv2 8/8] x86/espfix: Add support 5-level paging

We don't need extra virtual address space for ESPFIX, so it stays within
one PUD page table for both 4- and 5-level paging.

Redefining ESPFIX_BASE_ADDR using P4D_SHIFT instead of PGDIR_SHIFT makes
it stay in the same place regardless of paging mode.
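
A quick way to see this (standalone sketch, not kernel code; P4D_SHIFT is 39
in both modes, while PGDIR_SHIFT is 39 for 4-level and 48 for 5-level):

/* Sketch only: ESPFIX_BASE_ADDR derived from P4D_SHIFT vs PGDIR_SHIFT. */
#include <stdio.h>

int main(void)
{
        unsigned long long entry = -2ULL;       /* ESPFIX_PGD_ENTRY */

        printf("P4D_SHIFT (39), any mode:  %016llx\n", entry << 39);
        printf("PGDIR_SHIFT (48), 5-level: %016llx\n", entry << 48);
        return 0;
}

The first value matches the ffffff0000000000 %esp fixup stacks entry in
mm.txt for both paging modes; keying off PGDIR_SHIFT would move the area to
fffe000000000000 under 5-level paging.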

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
v2:
- make ESPFIX_BASE_ADDR the same for both paging modes.
---
arch/x86/include/asm/pgtable_64_types.h | 2 +-
arch/x86/kernel/espfix_64.c | 12 +++++++-----
2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index adc3e7b107ee..06470da156ba 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -98,7 +98,7 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..8e598a1ad986 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -50,11 +50,11 @@
#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)

/* There is address space for how many espfix pages? */
-#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))

#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-# error "Need more than one PGD for the ESPFIX hack"
+# error "Need more virtual address space for the ESPFIX hack"
#endif

#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@@ -121,11 +121,13 @@ static void init_espfix_random(void)

void __init init_espfix_bsp(void)
{
- pgd_t *pgd_p;
+ pgd_t *pgd;
+ p4d_t *p4d;

/* Install the espfix pud into the kernel page directory */
- pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
- pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);

/* Randomize the locations */
init_espfix_random();
--
2.11.0
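
As a rough capacity check for the P4D_SHIFT-based ESPFIX_PAGE_SPACE above
(standalone sketch; the 64-byte ESPFIX_STACK_SIZE is an assumption for
illustration, it is not shown in the hunk):

/* Sketch only: how many CPUs fit in one P4D entry worth of espfix space.
 * ESPFIX_STACK_SIZE = 64 is assumed; the other macros mirror espfix_64.c. */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define P4D_SHIFT               39
#define ESPFIX_STACK_SIZE       64UL                    /* assumed */
#define ESPFIX_STACKS_PER_PAGE  (PAGE_SIZE / ESPFIX_STACK_SIZE)
#define ESPFIX_PAGE_SPACE       (1UL << (P4D_SHIFT - PAGE_SHIFT - 16))
#define ESPFIX_MAX_CPUS         (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)

int main(void)
{
        printf("espfix pages: %lu, max CPUs: %lu\n",
               ESPFIX_PAGE_SPACE, ESPFIX_MAX_CPUS);
        return 0;
}

With these assumptions one P4D entry covers 2048 espfix pages, or 131072
CPUs, which is well above typical CONFIG_NR_CPUS values, so the #error only
trips if the reserved address space ever shrinks.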

2017-03-29 15:25:15

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging

On Wed, Mar 29, 2017 at 12:15:07AM +0300, Kirill A. Shutemov wrote:
> I'll try to look more into this issue tomorrow.

Putting this commit before f2a6a7050109 ("x86: Convert the rest of
the code to support p4d_t") seems to fix the issue.

--
Kirill A. Shutemov

2017-03-30 06:22:57

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCHv2 6/8] x86/dump_pagetables: Add support 5-level paging


* Kirill A. Shutemov <[email protected]> wrote:

> On Wed, Mar 29, 2017 at 12:15:07AM +0300, Kirill A. Shutemov wrote:
> > I'll try to look more into this issue tomorrow.
>
> Putting this commit before f2a6a7050109 ("x86: Convert the rest of
> the code to support p4d_t") seems to fix the issue.

Ok, I've applied this patch standalone to make tip:x86/mm boot again.

Since half of the patches in this series got iterated, please send out a clean v3
series against the tip:x86/mm tree that I'm going to push out later today.

Thanks,

Ingo

Subject: [tip:x86/mm] x86/dump_pagetables: Add support for 5-level paging

Commit-ID: fdd3d8ce0ea62c32b039af45cc5538b728e366d9
Gitweb: http://git.kernel.org/tip/fdd3d8ce0ea62c32b039af45cc5538b728e366d9
Author: Kirill A. Shutemov <[email protected]>
AuthorDate: Tue, 28 Mar 2017 13:48:06 +0300
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 30 Mar 2017 08:20:17 +0200

x86/dump_pagetables: Add support for 5-level paging

Simple extension to support one more page table level.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/mm/dump_pagetables.c | 59 +++++++++++++++++++++++++++++++++----------
1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee..9f305be 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
}
}

-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- unsigned long P)
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
int i;
pte_t *start;
pgprotval_t prot;

- start = (pte_t *) pmd_page_vaddr(addr);
+ start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,

#if PTRS_PER_PMD > 1

-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
pmd_t *start;
pgprotval_t prot;

- start = (pmd_t *) pud_page_vaddr(addr);
+ start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}

-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
pud_t *start;
pgprotval_t prot;
pud_t *prev_pud = NULL;

- start = (pud_t *) pgd_page_vaddr(addr);
+ start = (pud_t *)p4d_page_vaddr(addr);

for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}

#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+{
+ int i;
+ p4d_t *start;
+ pgprotval_t prot;
+
+ start = (p4d_t *)pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ prot = p4d_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pud_level(m, st, *start,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
- walk_pud_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else