From: Joerg Roedel <[email protected]>
This function is needed to map the GHCB for SEV-ES guests. The GHCB is
used for communication with the hypervisor, so its content must not be
encrypted.
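For illustration, a rough sketch of how a later SEV-ES patch might use
the helper to share the GHCB page with the hypervisor (the GHCB variable
and the error path below are hypothetical; only set_page_decrypted() is
introduced by this patch):

	/* Hypothetical page-aligned GHCB backing storage */
	static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);

	if (set_page_decrypted((unsigned long)&boot_ghcb_page))
		/* GHCB stays encrypted - no way to talk to the hypervisor */
		error("SEV-ES: failed to map GHCB unencrypted");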
Signed-off-by: Joerg Roedel <[email protected]>
---
arch/x86/boot/compressed/ident_map_64.c | 125 ++++++++++++++++++++++++
arch/x86/boot/compressed/misc.h | 1 +
2 files changed, 126 insertions(+)
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index feb180cced28..04a5ff4bda66 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -26,6 +26,7 @@
#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/trap_defs.h>
+#include <asm/cmpxchg.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
@@ -157,6 +158,130 @@ void initialize_identity_maps(void)
write_cr3(top_level_pgt);
}
+static pte_t *split_large_pmd(struct x86_mapping_info *info,
+ pmd_t *pmdp, unsigned long __address)
+{
+ unsigned long page_flags;
+ unsigned long address;
+ pte_t *pte;
+ pmd_t pmd;
+ int i;
+
+ pte = (pte_t *)info->alloc_pgt_page(info->context);
+ if (!pte)
+ return NULL;
+
+ address = __address & PMD_MASK;
+ /* No large page - clear PSE flag */
+ page_flags = info->page_flag & ~_PAGE_PSE;
+
+ /* Populate the PTEs */
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ set_pte(&pte[i], __pte(address | page_flags));
+ address += PAGE_SIZE;
+ }
+
+ /*
+ * Ideally we would clear the large PMD first and do a TLB
+ * flush before writing the new PMD. But the 2M range of the
+ * PMD might contain the code we are executing and/or the
+ * stack we are on, so we can't do that. It is safe anyway,
+ * because we only go from large to small mappings and we are
+ * the only user of the page-table, so there is no chance of
+ * a TLB multihit.
+ */
+ pmd = __pmd((unsigned long)pte | info->kernpg_flag);
+ set_pmd(pmdp, pmd);
+ /* Flush TLB to establish the new PMD */
+ write_cr3(top_level_pgt);
+
+ return pte + pte_index(__address);
+}
+
+static void clflush_page(unsigned long address)
+{
+ unsigned int flush_size;
+ char *cl, *start, *end;
+
+ /*
+ * Hardcode cl-size to 64 - CPUID can't be used here because that might
+ * cause another #VC exception and the GHCB is not ready to use yet.
+ */
+ flush_size = 64;
+ start = (char *)(address & PAGE_MASK);
+ end = start + PAGE_SIZE;
+
+ /*
+ * First make sure there are no pending writes on the cache-lines to
+ * flush.
+ */
+ asm volatile("mfence" : : : "memory");
+
+ for (cl = start; cl != end; cl += flush_size)
+ clflush(cl);
+}
+
+static int __set_page_decrypted(struct x86_mapping_info *info,
+ unsigned long address)
+{
+ unsigned long scratch, *target;
+ pgd_t *pgdp = (pgd_t *)top_level_pgt;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep, pte;
+
+ /*
+ * First make sure there is a PMD mapping for 'address'.
+ * It should already exist, but keep things generic.
+ *
+ * To map the page, just read from it and fault it in if there is no
+ * mapping yet. add_identity_map() can't be called here because that
+ * would unconditionally map the address on PMD level, destroying any
+ * PTE-level mappings that might already exist. Also do something
+ * useless with 'scratch' so the access won't be optimized away.
+ */
+ target = (unsigned long *)address;
+ scratch = *target;
+ arch_cmpxchg(target, scratch, scratch);
+
+ /*
+ * The page is mapped at least with PMD size - so skip checks and walk
+ * directly to the PMD.
+ */
+ p4dp = p4d_offset(pgdp, address);
+ pudp = pud_offset(p4dp, address);
+ pmdp = pmd_offset(pudp, address);
+
+ if (pmd_large(*pmdp))
+ ptep = split_large_pmd(info, pmdp, address);
+ else
+ ptep = pte_offset_kernel(pmdp, address);
+
+ if (!ptep)
+ return -ENOMEM;
+
+ /* Clear encryption flag and write new pte */
+ pte = pte_clear_flags(*ptep, _PAGE_ENC);
+ set_pte(ptep, pte);
+
+ /* Flush TLB to map the page unencrypted */
+ write_cr3(top_level_pgt);
+
+ /*
+ * Changing encryption attributes of a page requires flushing it from
+ * the caches.
+ */
+ clflush_page(address);
+
+ return 0;
+}
+
+int set_page_decrypted(unsigned long address)
+{
+ return __set_page_decrypted(&mapping_info, address);
+}
+
static void pf_error(unsigned long error_code, unsigned long address,
struct pt_regs *regs)
{
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 0e3508c5c15c..42f68a858a35 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -98,6 +98,7 @@ static inline void choose_random_location(unsigned long input,
#endif
#ifdef CONFIG_X86_64
+extern int set_page_decrypted(unsigned long address);
extern unsigned char _pgtable[];
#endif
--
2.17.1
On Thu, 19 Mar 2020, Joerg Roedel wrote:
> + /* Clear encryption flag and write new pte */
> + pte = pte_clear_flags(*ptep, _PAGE_ENC);
> + set_pte(ptep, pte);
> +
> + /* Flush TLB to map the page unencrypted */
> + write_cr3(top_level_pgt);
> +
Is there a guarantee that this flushes the TLB if cr3 == top_level_pgt
already, without an invlpg?
On 3/20/20 1:53 PM, David Rientjes wrote:
>> +
>> + /* Clear encryption flag and write new pte */
>> + pte = pte_clear_flags(*ptep, _PAGE_ENC);
>> + set_pte(ptep, pte);
>> +
>> + /* Flush TLB to map the page unencrypted */
>> + write_cr3(top_level_pgt);
>> +
> Is there a guarantee that this flushes the TLB if cr3 == top_level_pgt
> already, without an invlpg?
Ahh, good catch.
It *never* flushes global pages. For a generic function like this, that
seems pretty dangerous because the PTEs it goes after could quite easily
be Global. It's also not _obviously_ correct if PCIDs are in play
(which I don't think they are on AMD).
A flush_tlb_global() is probably more appropriate. Better yet, is there
a reason not to use flush_tlb_kernel_range()? I don't think it's
necessary to whack the entire TLB for one PTE set.
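For a single PTE, such a flush boils down to one INVLPG. A minimal
sketch of what a targeted flush could look like here (the helper name
is made up; INVLPG invalidates the TLB entry for one address, global
or not, without a full CR3 reload):

	static inline void flush_tlb_one_page(unsigned long address)
	{
		asm volatile("invlpg (%0)" :: "r" (address) : "memory");
	}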
On Fri, Mar 20, 2020 at 02:02:13PM -0700, Dave Hansen wrote:
> It *never* flushes global pages. For a generic function like this, that
> seems pretty dangerous because the PTEs it goes after could quite easily
> be Global. It's also not _obviously_ correct if PCIDs are in play
> (which I don't think they are on AMD).
>
> A flush_tlb_global() is probably more appropriate. Better yet, is there
> a reason not to use flush_tlb_kernel_range()? I don't think it's
> necessary to whack the entire TLB for one PTE set.
This code runs before the actual kernel image is decompressed, so there
is no PCID and no global pages (I think CR4.PGE is still 0). So a
cr3-write is enough to flush the TLB. Also the TLB-flush helpers of the
running kernel are not available here.
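For reference, the flush used here is nothing more than a CR3 reload;
a sketch of the underlying operation (write_cr3() in the decompressor
wraps essentially this):

	/*
	 * Reloading CR3 flushes all non-global TLB entries. With
	 * CR4.PCIDE=0 and CR4.PGE=0, as early in boot, that is all of them.
	 */
	static inline void boot_flush_tlb(unsigned long pgt)
	{
		asm volatile("mov %0, %%cr3" :: "r" (pgt) : "memory");
	}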
Regards,
Joerg
On 3/20/20 3:12 PM, Joerg Roedel wrote:
> On Fri, Mar 20, 2020 at 02:02:13PM -0700, Dave Hansen wrote:
>> It *never* flushes global pages. For a generic function like this, that
>> seems pretty dangerous because the PTEs it goes after could quite easily
>> be Global. It's also not _obviously_ correct if PCIDs are in play
>> (which I don't think they are on AMD).
>>
>> A flush_tlb_global() is probably more appropriate. Better yet, is there
>> a reason not to use flush_tlb_kernel_range()? I don't think it's
>> necessary to whack the entire TLB for one PTE set.
>
> This code runs before the actual kernel image is decompressed, so there
> is no PCID and no global pages (I think CR4.PGE is still 0). So a
> cr3-write is enough to flush the TLB. Also the TLB-flush helpers of the
> running kernel are not available here.
Geez, I always forget about the compressed code. :) Good point about PCIDs.
In any case, I thought this all came through initialize_identity_maps(),
which does, for instance:
mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
Where:
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
That looks like it has the Global bit set. Does that not apply here
somehow?
On Fri, Mar 20, 2020 at 03:26:09PM -0700, Dave Hansen wrote:
> In any case, I thought this all came through initialize_identity_maps(),
> which does, for instance:
>
> mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
>
> Where:
>
> #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
>
> That looks like it has the Global bit set. Does that not apply here
> somehow?
No, the value of %cr4 at boot is 0x00000020, so PGE is not set and
global pages are not enabled. It wouldn't make sense anyhow, as global
pages are only useful when there is more than one address space, which
is not the case that early in boot.
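A sketch of how that could be double-checked in the boot stub itself
(the helper name is made up; architecturally PAE is CR4 bit 5 and PGE
is CR4 bit 7, so 0x00000020 means PAE-only):

	static inline bool boot_global_pages_enabled(void)
	{
		unsigned long cr4;

		asm volatile("mov %%cr4, %0" : "=r" (cr4));
		return cr4 & (1UL << 7);	/* CR4.PGE */
	}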
Regards,
Joerg