2019-04-17 15:44:07

by Brijesh Singh

Subject: [PATCH v3] x86: mm: Do not use set_{pud,pmd}_safe when splitting the large page

Commit 0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init()
PTE population") triggers the warning below in SEV guests.

WARNING: CPU: 0 PID: 0 at arch/x86/include/asm/pgalloc.h:87 phys_pmd_init+0x30d/0x386
Call Trace:
kernel_physical_mapping_init+0xce/0x259
early_set_memory_enc_dec+0x10f/0x160
kvm_smp_prepare_boot_cpu+0x71/0x9d
start_kernel+0x1c9/0x50b
secondary_startup_64+0xa4/0xb0

An SEV guest calls kernel_physical_mapping_init() to clear the encryption
mask from an existing mapping. While clearing the encryption mask,
kernel_physical_mapping_init() splits the large pages into smaller ones.
To split a page, kernel_physical_mapping_init() allocates a new page table
page and updates the existing entry. The set_{pud,pmd}_safe() helpers
trigger a warning when an entry that is already present is updated.
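For reference, the _safe() variants introduced by that commit warn when a
present entry would be overwritten. Paraphrased (the exact definitions live
in the kernel's pgtable/pgalloc headers), they behave roughly like:

	#define set_pmd_safe(pmdp, pmd)						\
	({									\
		/* warn if a present entry is being changed to something else */ \
		WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd));	\
		set_pmd(pmdp, pmd);						\
	})

Splitting a large page necessarily rewrites a present PMD/PUD entry, so this
check fires even though the rewrite is intentional.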

Add a new kernel_physical_mapping_change() which uses the non-safe variants
of set_{pmd,pud,p4d}() and {pmd,pud,p4d}_populate() when updating the entry.
Since kernel_physical_mapping_change() may replace an existing entry with a
new one, the caller is responsible for flushing the TLB afterwards. Update
early_set_memory_enc_dec() to use kernel_physical_mapping_change() when it
clears the memory encryption mask from the page table entry.
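With that, the expected pattern on the SEV side becomes the following
(a minimal sketch; the loop body and the final flush are abbreviated from
early_set_memory_enc_dec()):

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		...
		/* may rewrite present entries, does not flush the TLB itself */
		kernel_physical_mapping_change(__pa(vaddr & pmask),
					       __pa((vaddr_end & pmask) + psize),
					       split_page_size_mask);
	}

	/* the caller flushes once, after all entries have been changed */
	__flush_tlb_all();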

Signed-off-by: Brijesh Singh <[email protected]>
Fixes: 0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init() ...")
Cc: Peter Zijlstra <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tom Lendacky <[email protected]>
---

Changes since v2:
- rename variable safe->init
- rename __set_* -> set_*_init()
- rename __*_populate -> *_populate_init()


Changes since v1:
- add kernel_physical_mapping_change() which uses non-safe variants of
set_{pmd,pud,pte}.

arch/x86/mm/init_64.c | 137 ++++++++++++++++++++++++++++----------
arch/x86/mm/mem_encrypt.c | 10 ++-
arch/x86/mm/mm_internal.h | 3 +
3 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..70065726c39a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -58,6 +58,37 @@

#include "ident_map.c"

+#define DEFINE_POPULATE(fname, type1, type2, init) \
+static inline void fname##_init(struct mm_struct *mm, \
+ type1##_t *arg1, type2##_t *arg2, bool init) \
+{ \
+ if (init) \
+ fname##_safe(mm, arg1, arg2); \
+ else \
+ fname(mm, arg1, arg2); \
+}
+
+DEFINE_POPULATE(p4d_populate, p4d, pud, init)
+DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
+DEFINE_POPULATE(pud_populate, pud, pmd, init)
+DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
+
+#define DEFINE_ENTRY(type1, type2, init) \
+static inline void set_##type1##_init(type1##_t *arg1, \
+ type2##_t arg2, bool init) \
+{ \
+ if (init) \
+ set_##type1##_safe(arg1, arg2); \
+ else \
+ set_##type1(arg1, arg2); \
+}
+
+DEFINE_ENTRY(p4d, p4d, init)
+DEFINE_ENTRY(pud, pud, init)
+DEFINE_ENTRY(pmd, pmd, init)
+DEFINE_ENTRY(pte, pte, init)
+
+
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
@@ -414,7 +445,7 @@ void __init cleanup_highmap(void)
*/
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
- pgprot_t prot)
+ pgprot_t prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pte_safe(pte, __pte(0));
+ set_pte_init(pte, __pte(0), init);
continue;
}

@@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
- set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
}

@@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
*/
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, pgprot_t prot)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pmd_safe(pmd, __pmd(0));
+ set_pmd_init(pmd, __pmd(0), init);
continue;
}

@@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
paddr_last = phys_pte_init(pte, paddr,
- paddr_end, prot);
+ paddr_end, prot,
+ init);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte_safe((pte_t *)pmd,
- pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ set_pte_init((pte_t *)pmd,
+ pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)),
+ init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
continue;
}

pte = alloc_low_page();
- paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
+ paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);

spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel_safe(&init_mm, pmd, pte);
+ pmd_populate_kernel_init(&init_mm, pmd, pte, init);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
*/
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pud_safe(pud, __pud(0));
+ set_pud_init(pud, __pud(0), init);
continue;
}

@@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
paddr_last = phys_pmd_init(pmd, paddr,
paddr_end,
page_size_mask,
- prot);
+ prot, init);
continue;
}
/*
@@ -610,9 +643,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte_safe((pte_t *)pud,
- pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE));
+ set_pte_init((pte_t *)pud,
+ pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE), init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
continue;
@@ -620,10 +653,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

pmd = alloc_low_page();
paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
- page_size_mask, prot);
+ page_size_mask, prot, init);

spin_lock(&init_mm.page_table_lock);
- pud_populate_safe(&init_mm, pud, pmd);
+ pud_populate_init(&init_mm, pud, pmd, init);
spin_unlock(&init_mm.page_table_lock);
}

@@ -634,14 +667,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, bool init)
{
unsigned long paddr_next, paddr_last = paddr_end;
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);

if (!pgtable_l5_enabled())
- return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
+ page_size_mask, init);

for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d;
@@ -657,7 +691,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_p4d_safe(p4d, __p4d(0));
+ set_p4d_init(p4d, __p4d(0), init);
continue;
}

@@ -665,31 +699,27 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr,
paddr_end,
- page_size_mask);
+ page_size_mask, init);
continue;
}

pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, paddr_end,
- page_size_mask);
+ page_size_mask, init);

spin_lock(&init_mm.page_table_lock);
- p4d_populate_safe(&init_mm, p4d, pud);
+ p4d_populate_init(&init_mm, p4d, pud, init);
spin_unlock(&init_mm.page_table_lock);
}

return paddr_last;
}

-/*
- * Create page table mapping for the physical memory for specific physical
- * addresses. The virtual and physical addresses have to be aligned on PMD level
- * down. It returns the last physical address mapped.
- */
-unsigned long __meminit
-kernel_physical_mapping_init(unsigned long paddr_start,
+static unsigned long __meminit
+__kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask,
+ bool init)
{
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start,
p4d = (p4d_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
- page_size_mask);
+ page_size_mask,
+ init);
continue;
}

p4d = alloc_low_page();
paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
- page_size_mask);
+ page_size_mask, init);

spin_lock(&init_mm.page_table_lock);
if (pgtable_l5_enabled())
- pgd_populate_safe(&init_mm, pgd, p4d);
+ pgd_populate_init(&init_mm, pgd, p4d, init);
else
- p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+ p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
+ (pud_t *) p4d, init);
+
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -732,6 +765,36 @@ kernel_physical_mapping_init(unsigned long paddr_start,
return paddr_last;
}

+
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ return __kernel_physical_mapping_init(paddr_start, paddr_end,
+ page_size_mask, true);
+}
+
+/*
+ * This function is similar to the kernel_physical_mapping_init() with exception
+ * that it uses set_{pud,pmd} instead of the set_{pud,pte}_safe when updating
+ * the mapping. The caller is responsible to flush the TLBs after the function
+ * returns.
+ */
+unsigned long __meminit
+kernel_physical_mapping_change(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ return __kernel_physical_mapping_init(paddr_start, paddr_end,
+ page_size_mask, false);
+}
+
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 385afa2b9e17..51f50a7a07ef 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr,
else
split_page_size_mask = 1 << PG_LEVEL_2M;

- kernel_physical_mapping_init(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
}

ret = 0;
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 319bde386d5f..eeae142062ed 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void);
unsigned long kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask);
+unsigned long kernel_physical_mapping_change(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
void zone_sizes_init(void);

extern int after_bootmem;
--
2.17.1


2019-04-17 17:25:44

by Dave Hansen

Subject: Re: [PATCH v3] x86: mm: Do not use set_{pud,pmd}_safe when splitting the large page

On 4/17/19 8:41 AM, Singh, Brijesh wrote:
> Changes since v2:
> - rename variable safe->init
> - rename __set_* -> set_*_init()
> - rename __*_populate -> *_populate_init()

"init" in this context really means "setting non-present PTEs is OK, but
changing present ones is not". It still doesn't do a great job of
conveying that because both the init=0 and init=1 variants are used
during init-time.
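Spelled out for one level, the new helper from the patch boils down to this
(paraphrased):

	static inline void set_pmd_init(pmd_t *pmdp, pmd_t pmd, bool init)
	{
		if (init)
			set_pmd_safe(pmdp, pmd);  /* warns if a present entry is changed */
		else
			set_pmd(pmdp, pmd);       /* silently replaces a present entry */
	}

i.e. "init" only selects whether overwriting a present entry is treated as a bug.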

> +/*
> + * Create page table mapping for the physical memory for specific physical
> + * addresses. The virtual and physical addresses have to be aligned on PMD level
> + * down. It returns the last physical address mapped.
> + */
> +unsigned long __meminit
> +kernel_physical_mapping_init(unsigned long paddr_start,
> + unsigned long paddr_end,
> + unsigned long page_size_mask)
> +{
> + return __kernel_physical_mapping_init(paddr_start, paddr_end,
> + page_size_mask, true);
> +}

The comment here is missing one key point:
kernel_physical_mapping_init() can only be used to populate non-present
entries. It's kinda implied by "Create...", but I really think we
need to be explicit.

Anyway, it's better than it was, and it does fix a bug, so:

Reviewed-by: Dave Hansen <[email protected]>

But please flesh out that comment if you do another version for some reason.

2019-05-03 15:59:49

by Brijesh Singh

Subject: Re: [PATCH v3] x86: mm: Do not use set_{pud,pmd}_safe when splitting the large page

Hi Boris,

On 4/17/19 12:24 PM, Dave Hansen wrote:
...

>
> The comment here is missing one key point:
> kernel_physical_mapping_init() can only be used to populate non-present
> entries. It kinda infers that from "Create...", but I really think we
> need to be explicit.
>
> Anyway, it's better than it was, and it does fix a bug, so:
>
> Reviewed-by: Dave Hansen <[email protected]>
>
> But please flesh out that comment if you do another version for some reason.
>

Can you please pick up this patch? Or do you want me to resend it?
It fixes a bug seen in SEV guests.

-Brijesh

Subject: [tip:x86/urgent] x86/mm: Do not use set_{pud, pmd}_safe() when splitting a large page

Commit-ID: eccd906484d1cd4b5da00f093d678badb6f48f28
Gitweb: https://git.kernel.org/tip/eccd906484d1cd4b5da00f093d678badb6f48f28
Author: Brijesh Singh <[email protected]>
AuthorDate: Wed, 17 Apr 2019 15:41:17 +0000
Committer: Borislav Petkov <[email protected]>
CommitDate: Wed, 8 May 2019 19:08:35 +0200

x86/mm: Do not use set_{pud, pmd}_safe() when splitting a large page

The commit

0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init() PTE population")

triggers this warning in SEV guests:

WARNING: CPU: 0 PID: 0 at arch/x86/include/asm/pgalloc.h:87 phys_pmd_init+0x30d/0x386
Call Trace:
kernel_physical_mapping_init+0xce/0x259
early_set_memory_enc_dec+0x10f/0x160
kvm_smp_prepare_boot_cpu+0x71/0x9d
start_kernel+0x1c9/0x50b
secondary_startup_64+0xa4/0xb0

A SEV guest calls kernel_physical_mapping_init() to clear the encryption
mask from an existing mapping. While doing so, it also splits large
pages into smaller ones.

To split a page, kernel_physical_mapping_init() allocates a new page and
updates the existing entry. The set_{pud,pmd}_safe() helpers trigger a
warning when updating an entry with a page in the present state.

Add a new kernel_physical_mapping_change() helper which uses the
non-safe variants of set_{pmd,pud,p4d}() and {pmd,pud,p4d}_populate()
routines when updating the entry.

Since kernel_physical_mapping_change() may replace an existing
entry with a new entry, the caller is responsible to flush
the TLB at the end. Change early_set_memory_enc_dec() to use
kernel_physical_mapping_change() when it wants to clear the memory
encryption mask from the page table entry.

[ bp:
- massage commit message.
- flesh out comment according to dhansen's request.
- align function arguments at opening brace. ]

Fixes: 0a9fe8ca844d ("x86/mm: Validate kernel_physical_mapping_init() PTE population")
Signed-off-by: Brijesh Singh <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Reviewed-by: Dave Hansen <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "Kirill A . Shutemov" <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Thomas Lendacky <[email protected]>
Cc: x86-ml <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/mm/init_64.c | 144 +++++++++++++++++++++++++++++++++-------------
arch/x86/mm/mem_encrypt.c | 10 +++-
arch/x86/mm/mm_internal.h | 3 +
3 files changed, 114 insertions(+), 43 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..5cd125bd2a85 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -58,6 +58,37 @@

#include "ident_map.c"

+#define DEFINE_POPULATE(fname, type1, type2, init) \
+static inline void fname##_init(struct mm_struct *mm, \
+ type1##_t *arg1, type2##_t *arg2, bool init) \
+{ \
+ if (init) \
+ fname##_safe(mm, arg1, arg2); \
+ else \
+ fname(mm, arg1, arg2); \
+}
+
+DEFINE_POPULATE(p4d_populate, p4d, pud, init)
+DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
+DEFINE_POPULATE(pud_populate, pud, pmd, init)
+DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
+
+#define DEFINE_ENTRY(type1, type2, init) \
+static inline void set_##type1##_init(type1##_t *arg1, \
+ type2##_t arg2, bool init) \
+{ \
+ if (init) \
+ set_##type1##_safe(arg1, arg2); \
+ else \
+ set_##type1(arg1, arg2); \
+}
+
+DEFINE_ENTRY(p4d, p4d, init)
+DEFINE_ENTRY(pud, pud, init)
+DEFINE_ENTRY(pmd, pmd, init)
+DEFINE_ENTRY(pte, pte, init)
+
+
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
@@ -414,7 +445,7 @@ void __init cleanup_highmap(void)
*/
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
- pgprot_t prot)
+ pgprot_t prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pte_safe(pte, __pte(0));
+ set_pte_init(pte, __pte(0), init);
continue;
}

@@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
- set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
}

@@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
*/
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, pgprot_t prot)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pmd_safe(pmd, __pmd(0));
+ set_pmd_init(pmd, __pmd(0), init);
continue;
}

@@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
paddr_last = phys_pte_init(pte, paddr,
- paddr_end, prot);
+ paddr_end, prot,
+ init);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte_safe((pte_t *)pmd,
- pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ set_pte_init((pte_t *)pmd,
+ pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)),
+ init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
continue;
}

pte = alloc_low_page();
- paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
+ paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);

spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel_safe(&init_mm, pmd, pte);
+ pmd_populate_kernel_init(&init_mm, pmd, pte, init);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
*/
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pud_safe(pud, __pud(0));
+ set_pud_init(pud, __pud(0), init);
continue;
}

@@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
paddr_last = phys_pmd_init(pmd, paddr,
paddr_end,
page_size_mask,
- prot);
+ prot, init);
continue;
}
/*
@@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte_safe((pte_t *)pud,
- pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE));
+ set_pte_init((pte_t *)pud,
+ pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE),
+ init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
continue;
@@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

pmd = alloc_low_page();
paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
- page_size_mask, prot);
+ page_size_mask, prot, init);

spin_lock(&init_mm.page_table_lock);
- pud_populate_safe(&init_mm, pud, pmd);
+ pud_populate_init(&init_mm, pud, pmd, init);
spin_unlock(&init_mm.page_table_lock);
}

@@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,

static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, bool init)
{
unsigned long paddr_next, paddr_last = paddr_end;
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);

if (!pgtable_l5_enabled())
- return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
+ page_size_mask, init);

for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
p4d_t *p4d;
@@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_p4d_safe(p4d, __p4d(0));
+ set_p4d_init(p4d, __p4d(0), init);
continue;
}

if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
- paddr_last = phys_pud_init(pud, paddr,
- paddr_end,
- page_size_mask);
+ paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ page_size_mask, init);
continue;
}

pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, paddr_end,
- page_size_mask);
+ page_size_mask, init);

spin_lock(&init_mm.page_table_lock);
- p4d_populate_safe(&init_mm, p4d, pud);
+ p4d_populate_init(&init_mm, p4d, pud, init);
spin_unlock(&init_mm.page_table_lock);
}

return paddr_last;
}

-/*
- * Create page table mapping for the physical memory for specific physical
- * addresses. The virtual and physical addresses have to be aligned on PMD level
- * down. It returns the last physical address mapped.
- */
-unsigned long __meminit
-kernel_physical_mapping_init(unsigned long paddr_start,
- unsigned long paddr_end,
- unsigned long page_size_mask)
+static unsigned long __meminit
+__kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask,
+ bool init)
{
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start,
p4d = (p4d_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
- page_size_mask);
+ page_size_mask,
+ init);
continue;
}

p4d = alloc_low_page();
paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
- page_size_mask);
+ page_size_mask, init);

spin_lock(&init_mm.page_table_lock);
if (pgtable_l5_enabled())
- pgd_populate_safe(&init_mm, pgd, p4d);
+ pgd_populate_init(&init_mm, pgd, p4d, init);
else
- p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+ p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
+ (pud_t *) p4d, init);
+
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start,
return paddr_last;
}

+
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. Note that it can only be used to populate non-present entries.
+ * The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ return __kernel_physical_mapping_init(paddr_start, paddr_end,
+ page_size_mask, true);
+}
+
+/*
+ * This function is similar to kernel_physical_mapping_init() above with the
+ * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe()
+ * when updating the mapping. The caller is responsible to flush the TLBs after
+ * the function returns.
+ */
+unsigned long __meminit
+kernel_physical_mapping_change(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ return __kernel_physical_mapping_init(paddr_start, paddr_end,
+ page_size_mask, false);
+}
+
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 385afa2b9e17..51f50a7a07ef 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr,
else
split_page_size_mask = 1 << PG_LEVEL_2M;

- kernel_physical_mapping_init(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
}

ret = 0;
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 319bde386d5f..eeae142062ed 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void);
unsigned long kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask);
+unsigned long kernel_physical_mapping_change(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
void zone_sizes_init(void);

extern int after_bootmem;