The following commit:
  368a540e0232 ("x86/kvmclock: Remove memblock dependency")
  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=368a540e0232ad446931f5a4e8a5e06f69f21343
introduced an SEV guest regression.
The guest physical addresses holding the wall_clock and hv_clock_boot
variables are shared with the hypervisor and must be mapped with C=0
when SEV is active. To clear the C-bit we use
kernel_physical_mapping_init() to split the large pages. The above
commit moved the kvmclock initialization very early, and
kernel_physical_mapping_init() now fails to allocate memory while
splitting the large page.
To solve it, we add a special .data..decrypted section that can be used
to hold the shared variables. Early boot code maps this section with
C=0. The section is PMD-aligned and PMD-sized to avoid the need to
split the pages. Callers can use the __decrypted attribute to place
variables in the .data..decrypted section.
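For example, with this series the kvmclock variables that need to be
shared with the hypervisor are simply declared as:

  static struct pvclock_vsyscall_time_info
          hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
  static struct pvclock_wall_clock wall_clock __decrypted;

and the linker places them in the .data..decrypted section.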
Cc: Tom Lendacky <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: "Radim Krčmář" <[email protected]>
Changes since v2:
- commit message and code comment improvements (based on Boris feedback)
- move the sme_populate_pgd() fixes into a new patch.
- drop stable Cc - will submit to stable after patch is upstreamed.
Changes since v1:
- move the logic to re-arrange mapping in new patch
- move the definition of __start_data_* in mem_encrypt.h
- map the workarea buffer as encrypted when SEV is enabled
- enhance sme_populate_pgd() to update the pte/pmd flags when a mapping exists
Brijesh Singh (4):
x86/mm: Restructure sme_encrypt_kernel()
x86/mm: fix sme_populate_pgd() to update page flags
x86/mm: add .data..decrypted section to hold shared variables
x86/kvm: use __decrypted attribute in shared variables
arch/x86/include/asm/mem_encrypt.h | 6 +
arch/x86/kernel/head64.c | 11 ++
arch/x86/kernel/kvmclock.c | 30 ++++-
arch/x86/kernel/vmlinux.lds.S | 17 +++
arch/x86/mm/mem_encrypt_identity.c | 232 +++++++++++++++++++++++++++----------
5 files changed, 229 insertions(+), 67 deletions(-)
--
2.7.4
Re-arrange sme_encrypt_kernel() by moving the workarea map/unmap
logic into a separate static function. There are no functional changes
in this patch. The restructuring will allow us to expand
sme_encrypt_kernel() in the future.
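In outline, the result is:

  void __init sme_encrypt_kernel(struct boot_params *bp)
  {
          struct sme_populate_pgd_data ppd;
          struct sme_workarea_data wa;

          if (!sme_active())
                  return;

          build_workarea_map(bp, &wa, &ppd);
          /* ... encrypt kernel and initrd via sme_encrypt_execute() ... */
          teardown_workarea_map(&wa, &ppd);
  }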
Signed-off-by: Brijesh Singh <[email protected]>
Cc: Tom Lendacky <[email protected]>
Cc: [email protected]
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: [email protected]
Cc: "Radim Krčmář" <[email protected]>
---
arch/x86/mm/mem_encrypt_identity.c | 160 ++++++++++++++++++++++++-------------
1 file changed, 104 insertions(+), 56 deletions(-)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 7ae3686..92265d3 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -72,6 +72,22 @@ struct sme_populate_pgd_data {
unsigned long vaddr_end;
};
+struct sme_workarea_data {
+ unsigned long kernel_start;
+ unsigned long kernel_end;
+ unsigned long kernel_len;
+
+ unsigned long initrd_start;
+ unsigned long initrd_end;
+ unsigned long initrd_len;
+
+ unsigned long workarea_start;
+ unsigned long workarea_end;
+ unsigned long workarea_len;
+
+ unsigned long decrypted_base;
+};
+
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
static char sme_cmdline_off[] __initdata = "off";
@@ -266,19 +282,17 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return entries + tables;
}
-void __init sme_encrypt_kernel(struct boot_params *bp)
+static void __init build_workarea_map(struct boot_params *bp,
+ struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
unsigned long kernel_start, kernel_end, kernel_len;
unsigned long initrd_start, initrd_end, initrd_len;
- struct sme_populate_pgd_data ppd;
unsigned long pgtable_area_len;
unsigned long decrypted_base;
- if (!sme_active())
- return;
-
/*
* Prepare for encrypting the kernel and initrd by building new
* pagetables with the necessary attributes needed to encrypt the
@@ -358,17 +372,17 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* pagetables and when the new encrypted and decrypted kernel
* mappings are populated.
*/
- ppd.pgtable_area = (void *)execute_end;
+ ppd->pgtable_area = (void *)execute_end;
/*
* Make sure the current pagetable structure has entries for
* addressing the workarea.
*/
- ppd.pgd = (pgd_t *)native_read_cr3_pa();
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
+ ppd->pgd = (pgd_t *)native_read_cr3_pa();
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start;
+ ppd->vaddr_end = workarea_end;
+ sme_map_range_decrypted(ppd);
/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
@@ -379,9 +393,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* then be populated with new PUDs and PMDs as the encrypted and
* decrypted kernel mappings are created.
*/
- ppd.pgd = ppd.pgtable_area;
- memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
- ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+ ppd->pgd = ppd->pgtable_area;
+ memset(ppd->pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd->pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
/*
* A different PGD index/entry must be used to get different
@@ -399,75 +413,109 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
decrypted_base <<= PGDIR_SHIFT;
/* Add encrypted kernel (identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start;
- ppd.vaddr_end = kernel_end;
- sme_map_range_encrypted(&ppd);
+ ppd->paddr = kernel_start;
+ ppd->vaddr = kernel_start;
+ ppd->vaddr_end = kernel_end;
+ sme_map_range_encrypted(ppd);
/* Add decrypted, write-protected kernel (non-identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
+ ppd->paddr = kernel_start;
+ ppd->vaddr = kernel_start + decrypted_base;
+ ppd->vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(ppd);
if (initrd_len) {
/* Add encrypted initrd (identity) mappings */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start;
- ppd.vaddr_end = initrd_end;
- sme_map_range_encrypted(&ppd);
+ ppd->paddr = initrd_start;
+ ppd->vaddr = initrd_start;
+ ppd->vaddr_end = initrd_end;
+ sme_map_range_encrypted(ppd);
/*
* Add decrypted, write-protected initrd (non-identity) mappings
*/
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
+ ppd->paddr = initrd_start;
+ ppd->vaddr = initrd_start + decrypted_base;
+ ppd->vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(ppd);
}
/* Add decrypted workarea mappings to both kernel mappings */
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start;
+ ppd->vaddr_end = workarea_end;
+ sme_map_range_decrypted(ppd);
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_map_range_decrypted(&ppd);
+ ppd->paddr = workarea_start;
+ ppd->vaddr = workarea_start + decrypted_base;
+ ppd->vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(ppd);
- /* Perform the encryption */
- sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
- kernel_len, workarea_start, (unsigned long)ppd.pgd);
+ wa->kernel_start = kernel_start;
+ wa->kernel_end = kernel_end;
+ wa->kernel_len = kernel_len;
- if (initrd_len)
- sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
- initrd_len, workarea_start,
- (unsigned long)ppd.pgd);
+ wa->initrd_start = initrd_start;
+ wa->initrd_end = initrd_end;
+ wa->initrd_len = initrd_len;
+
+ wa->workarea_start = workarea_start;
+ wa->workarea_end = workarea_end;
+ wa->workarea_len = workarea_len;
+
+ wa->decrypted_base = decrypted_base;
+}
+static void __init teardown_workarea_map(struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
+{
/*
* At this point we are running encrypted. Remove the mappings for
* the decrypted areas - all that is needed for this is to remove
* the PGD entry/entries.
*/
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- if (initrd_len) {
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_clear_pgd(&ppd);
+ ppd->vaddr = wa->kernel_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->kernel_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
+
+ if (wa->initrd_len) {
+ ppd->vaddr = wa->initrd_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->initrd_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
}
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_clear_pgd(&ppd);
+ ppd->vaddr = wa->workarea_start + wa->decrypted_base;
+ ppd->vaddr_end = wa->workarea_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
}
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+ struct sme_populate_pgd_data ppd;
+ struct sme_workarea_data wa;
+
+ if (!sme_active())
+ return;
+
+ build_workarea_map(bp, &wa, &ppd);
+
+ /* When SEV is active, encrypt kernel and initrd */
+ sme_encrypt_execute(wa.kernel_start,
+ wa.kernel_start + wa.decrypted_base,
+ wa.kernel_len, wa.workarea_start,
+ (unsigned long)ppd.pgd);
+
+ if (wa.initrd_len)
+ sme_encrypt_execute(wa.initrd_start,
+ wa.initrd_start + wa.decrypted_base,
+ wa.initrd_len, wa.workarea_start,
+ (unsigned long)ppd.pgd);
+
+ teardown_workarea_map(&wa, &ppd);
+}
+
void __init sme_enable(struct boot_params *bp)
{
const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
--
2.7.4
The following commit:
368a540e0232 (x86/kvmclock: Remove memblock dependency)
caused SEV guest regression. When SEV is active, we map the shared
variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
the guest and the hypervisor is able to access the data. To map the
variables we use kernel_physical_mapping_init() to split the large pages,
but this routine fails to allocate a new page. Before the above commit,
kvmclock initialization was called after memory allocator was available
but now its called early during boot.
Recently we added a special .data..decrypted section to hold the shared
variables. This section is mapped with C=0 early during boot. Use
__decrypted attribute to put the wall_clock and hv_clock_boot in
.data..decrypted section so that they are mapped with C=0.
Signed-off-by: Brijesh Singh <[email protected]>
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: Tom Lendacky <[email protected]>
Cc: [email protected]
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: [email protected]
Cc: "Radim Krčmář" <[email protected]>
---
arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e67646..08f5f8a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/set_memory.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
@@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
static struct pvclock_vsyscall_time_info
- hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
@@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return 0;
/* Use the static page for the first CPUs, allocate otherwise */
- if (cpu < HVC_BOOT_ARRAY_SIZE)
+ if (cpu < HVC_BOOT_ARRAY_SIZE) {
p = &hv_clock_boot[cpu];
- else
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ } else {
+ int rc;
+ unsigned int sz = sizeof(*p);
+
+ if (sev_active())
+ sz = PAGE_ALIGN(sz);
+
+ p = kzalloc(sz, GFP_KERNEL);
+
+ /*
+ * The physical address of per-cpu variable will be shared with
+ * the hypervisor. Let's clear the C-bit before we assign the
+ * memory to per_cpu variable.
+ */
+ if (p && sev_active()) {
+ rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
+ if (rc)
+ return rc;
+ memset(p, 0, sz);
+ }
+ }
per_cpu(hv_clock_per_cpu, cpu) = p;
return p ? 0 : -ENOMEM;
--
2.7.4
kvmclock defines a few static variables which are shared with the
hypervisor during the kvmclock initialization.
When SEV is active, memory is encrypted with a guest-specific key, and
if the guest OS wants to share a memory region with the hypervisor then
it must clear the C-bit before sharing it. Currently, we use
kernel_physical_mapping_init() to split the large pages before clearing
the C-bit on shared pages. But it fails when called from the kvmclock
initialization (mainly because the memblock allocator is not ready that
early during boot).
Add a __decrypted section attribute which can be used when defining
such shared variables. The so-defined variables will be placed in the
.data..decrypted section. This section is mapped with C=0 early during
boot, and we also ensure that the initialized values are updated to
match with C=0 (i.e., we perform an in-place decryption). The
.data..decrypted section is PMD-aligned and sized so that we avoid the
need to split the large pages when mapping the section.
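The attribute itself is just a section annotation:

  /* Place a variable in the PMD-aligned .data..decrypted section. */
  #define __decrypted __attribute__((__section__(".data..decrypted")))

so marking a variable as shared reduces to tagging its definition with
__decrypted.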
sme_encrypt_kernel() is used to perform the in-place encryption of the
Linux kernel and initrd when SME is active. The routine is enhanced
here to also decrypt the .data..decrypted section, for both the SME
and SEV cases.
Signed-off-by: Brijesh Singh <[email protected]>
Cc: Tom Lendacky <[email protected]>
Cc: [email protected]
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: [email protected]
Cc: "Radim Krčmář" <[email protected]>
---
arch/x86/include/asm/mem_encrypt.h | 6 +++
arch/x86/kernel/head64.c | 11 +++++
arch/x86/kernel/vmlinux.lds.S | 17 +++++++
arch/x86/mm/mem_encrypt_identity.c | 94 ++++++++++++++++++++++++++++++++------
4 files changed, 113 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c064383..802b2eb 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -52,6 +52,8 @@ void __init mem_encrypt_init(void);
bool sme_active(void);
bool sev_active(void);
+#define __decrypted __attribute__((__section__(".data..decrypted")))
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
@@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+#define __decrypted
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
@@ -88,6 +92,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
#define __sme_pa(x) (__pa(x) | sme_me_mask)
#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+extern char __start_data_decrypted[], __end_data_decrypted[];
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379..af39d68 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
+ unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -234,6 +235,16 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
+ /* Clear the memory encryption mask from the .data..decrypted section. */
+ if (mem_encrypt_active()) {
+ vaddr = (unsigned long)__start_data_decrypted;
+ vaddr_end = (unsigned long)__end_data_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
/*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a4..78d3169 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -89,6 +89,21 @@ PHDRS {
note PT_NOTE FLAGS(0); /* ___ */
}
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define DATA_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_data_decrypted = .; \
+ *(.data..decrypted); \
+ . = ALIGN(PMD_SIZE); \
+ __end_data_decrypted = .; \
+
SECTIONS
{
#ifdef CONFIG_X86_32
@@ -171,6 +186,8 @@ SECTIONS
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
+ DATA_DECRYPTED
+
/* End of data section */
_edata = .;
} :data
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 7659e65..08e70ba 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -51,6 +51,8 @@
(_PAGE_PAT | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+#define PMD_FLAGS_ENC_WP ((PMD_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
@@ -59,6 +61,8 @@
(_PAGE_PAT | _PAGE_PWT))
#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+#define PTE_FLAGS_ENC_WP ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
struct sme_populate_pgd_data {
void *pgtable_area;
@@ -231,6 +235,11 @@ static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
+static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC_WP, PTE_FLAGS_ENC_WP);
+}
+
static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
@@ -378,7 +387,10 @@ static void __init build_workarea_map(struct boot_params *bp,
ppd->paddr = workarea_start;
ppd->vaddr = workarea_start;
ppd->vaddr_end = workarea_end;
- sme_map_range_decrypted(ppd);
+ if (sev_active())
+ sme_map_range_encrypted(ppd);
+ else
+ sme_map_range_decrypted(ppd);
/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
@@ -435,16 +447,27 @@ static void __init build_workarea_map(struct boot_params *bp,
sme_map_range_decrypted_wp(ppd);
}
- /* Add decrypted workarea mappings to both kernel mappings */
+ /*
+ * When SEV is active, kernel is already encrypted hence mapping
+ * the initial workarea_start as encrypted. When SME is active,
+ * the kernel is not encrypted hence add decrypted workarea
+ * mappings to both kernel mappings.
+ */
ppd->paddr = workarea_start;
ppd->vaddr = workarea_start;
ppd->vaddr_end = workarea_end;
- sme_map_range_decrypted(ppd);
+ if (sev_active())
+ sme_map_range_encrypted(ppd);
+ else
+ sme_map_range_decrypted(ppd);
ppd->paddr = workarea_start;
ppd->vaddr = workarea_start + decrypted_base;
ppd->vaddr_end = workarea_end + decrypted_base;
- sme_map_range_decrypted(ppd);
+ if (sev_active())
+ sme_map_range_encrypted(ppd);
+ else
+ sme_map_range_decrypted(ppd);
wa->kernel_start = kernel_start;
wa->kernel_end = kernel_end;
@@ -487,28 +510,69 @@ static void __init teardown_workarea_map(struct sme_workarea_data *wa,
native_write_cr3(__native_read_cr3());
}
+static void __init decrypt_shared_data(struct sme_workarea_data *wa,
+ struct sme_populate_pgd_data *ppd)
+{
+ unsigned long decrypted_start, decrypted_end, decrypted_len;
+
+ /* Physical addresses of decrypted data section */
+ decrypted_start = __pa_symbol(__start_data_decrypted);
+ decrypted_end = ALIGN(__pa_symbol(__end_data_decrypted), PMD_PAGE_SIZE);
+ decrypted_len = decrypted_end - decrypted_start;
+
+ if (!decrypted_len)
+ return;
+
+ /* Add decrypted mapping for the section (identity) */
+ ppd->paddr = decrypted_start;
+ ppd->vaddr = decrypted_start;
+ ppd->vaddr_end = decrypted_end;
+ sme_map_range_decrypted(ppd);
+
+ /* Add encrypted-wp mapping for the section (non-identity) */
+ ppd->paddr = decrypted_start;
+ ppd->vaddr = decrypted_start + wa->decrypted_base;
+ ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+ sme_map_range_encrypted_wp(ppd);
+
+ /* Perform in-place decryption */
+ sme_encrypt_execute(decrypted_start,
+ decrypted_start + wa->decrypted_base,
+ decrypted_len, wa->workarea_start,
+ (unsigned long)ppd->pgd);
+
+ ppd->vaddr = decrypted_start + wa->decrypted_base;
+ ppd->vaddr_end = decrypted_end + wa->decrypted_base;
+ sme_clear_pgd(ppd);
+}
+
void __init sme_encrypt_kernel(struct boot_params *bp)
{
struct sme_populate_pgd_data ppd;
struct sme_workarea_data wa;
- if (!sme_active())
+ if (!mem_encrypt_active())
return;
build_workarea_map(bp, &wa, &ppd);
- /* When SEV is active, encrypt kernel and initrd */
- sme_encrypt_execute(wa.kernel_start,
- wa.kernel_start + wa.decrypted_base,
- wa.kernel_len, wa.workarea_start,
- (unsigned long)ppd.pgd);
-
- if (wa.initrd_len)
- sme_encrypt_execute(wa.initrd_start,
- wa.initrd_start + wa.decrypted_base,
- wa.initrd_len, wa.workarea_start,
+ /* When SME is active, encrypt kernel and initrd */
+ if (sme_active()) {
+ sme_encrypt_execute(wa.kernel_start,
+ wa.kernel_start + wa.decrypted_base,
+ wa.kernel_len, wa.workarea_start,
(unsigned long)ppd.pgd);
+ if (wa.initrd_len)
+ sme_encrypt_execute(wa.initrd_start,
+ wa.initrd_start + wa.decrypted_base,
+ wa.initrd_len, wa.workarea_start,
+ (unsigned long)ppd.pgd);
+ }
+
+ /* Decrypt the contents of .data..decrypted section */
+ decrypt_shared_data(&wa, &ppd);
+
teardown_workarea_map(&wa, &ppd);
}
--
2.7.4
Fix sme_populate_pgd() to update the page flags if the PMD/PTE entry
already exists. This is needed because a later patch will map an
already-mapped range a second time with different flags, and the
current early return on an existing entry would leave the old flags
in place.
Signed-off-by: Brijesh Singh <[email protected]>
Cc: Tom Lendacky <[email protected]>
Cc: [email protected]
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: [email protected]
Cc: Paolo Bonzini <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: [email protected]
Cc: "Radim Krčmář" <[email protected]>
---
arch/x86/mm/mem_encrypt_identity.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 92265d3..7659e65 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -154,9 +154,6 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
return;
pmd = pmd_offset(pud, ppd->vaddr);
- if (pmd_large(*pmd))
- return;
-
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
}
@@ -182,8 +179,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
return;
pte = pte_offset_map(pmd, ppd->vaddr);
- if (pte_none(*pte))
- set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
}
static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
--
2.7.4
On 08/29/2018 01:23 PM, Brijesh Singh wrote:
> The following commit:
>   368a540e0232 ("x86/kvmclock: Remove memblock dependency")
>   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=368a540e0232ad446931f5a4e8a5e06f69f21343
> introduced an SEV guest regression.
>
> The guest physical addresses holding the wall_clock and hv_clock_boot
> variables are shared with the hypervisor and must be mapped with C=0
> when SEV is active. To clear the C-bit we use
> kernel_physical_mapping_init() to split the large pages. The above
> commit moved the kvmclock initialization very early, and
> kernel_physical_mapping_init() now fails to allocate memory while
> splitting the large page.
>
> To solve it, we add a special .data..decrypted section that can be used
> to hold the shared variables. Early boot code maps this section with
> C=0. The section is PMD-aligned and PMD-sized to avoid the need to
> split the pages. Callers can use the __decrypted attribute to place
> variables in the .data..decrypted section.
>
> Cc: Tom Lendacky <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Paolo Bonzini <[email protected]>
> Cc: Sean Christopherson <[email protected]>
> Cc: "Radim Krčmář" <[email protected]>
>
Reviewed-by: Tom Lendacky <[email protected]>
> Changes since v2:
> - commit message and code comment improvements (based on Boris feedback)
> - move the sme_populate_pgd() fixes into a new patch.
> - drop stable Cc - will submit to stable after patch is upstreamed.
>
> Changes since v1:
> - move the logic to re-arrange mapping in new patch
> - move the definition of __start_data_* in mem_encrypt.h
> - map the workarea buffer as encrypted when SEV is enabled
> - enhance sme_populate_pgd() to update the pte/pmd flags when a mapping exists
>
> Brijesh Singh (4):
> x86/mm: Restructure sme_encrypt_kernel()
> x86/mm: fix sme_populate_pgd() to update page flags
> x86/mm: add .data..decrypted section to hold shared variables
> x86/kvm: use __decrypted attribute in shared variables
>
> arch/x86/include/asm/mem_encrypt.h | 6 +
> arch/x86/kernel/head64.c | 11 ++
> arch/x86/kernel/kvmclock.c | 30 ++++-
> arch/x86/kernel/vmlinux.lds.S | 17 +++
> arch/x86/mm/mem_encrypt_identity.c | 232 +++++++++++++++++++++++++++----------
> 5 files changed, 229 insertions(+), 67 deletions(-)
>
On Wed, Aug 29, 2018 at 01:24:00PM -0500, Brijesh Singh wrote:
> The following commit:
>
> 368a540e0232 (x86/kvmclock: Remove memblock dependency)
Checkpatch prefers:
Commit 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
That'll also save three lines in the commit message.
> caused SEV guest regression. When SEV is active, we map the shared
> variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
> the guest and the hypervisor is able to access the data. To map the
Nit: s/is/are
> variables we use kernel_physical_mapping_init() to split the large pages,
> but this routine fails to allocate a new page. Before the above commit,
> kvmclock initialization was called after memory allocator was available
> but now its called early during boot.
What about something like this to make the issue a bit clearer:
variables we use kernel_physical_mapping_init() to split the large pages,
but splitting large pages requires allocating a new PMD, which fails now
that kvmclock initialization is called early during boot.
> Recently we added a special .data..decrypted section to hold the shared
> variables. This section is mapped with C=0 early during boot. Use
> __decrypted attribute to put the wall_clock and hv_clock_boot in
> .data..decrypted section so that they are mapped with C=0.
>
> Signed-off-by: Brijesh Singh <[email protected]>
> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
> Cc: Tom Lendacky <[email protected]>
> Cc: [email protected]
> Cc: Thomas Gleixner <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: "H. Peter Anvin" <[email protected]>
> Cc: [email protected]
> Cc: Paolo Bonzini <[email protected]>
> Cc: Sean Christopherson <[email protected]>
> Cc: [email protected]
> Cc: "Radim Krčmář" <[email protected]>
> ---
> arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
> 1 file changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1e67646..08f5f8a 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -28,6 +28,7 @@
> #include <linux/sched/clock.h>
> #include <linux/mm.h>
> #include <linux/slab.h>
> +#include <linux/set_memory.h>
>
> #include <asm/hypervisor.h>
> #include <asm/mem_encrypt.h>
> @@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
> (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
>
> static struct pvclock_vsyscall_time_info
> - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
> -static struct pvclock_wall_clock wall_clock;
> + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
> +static struct pvclock_wall_clock wall_clock __decrypted;
> static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>
> static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
> @@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
> return 0;
>
> /* Use the static page for the first CPUs, allocate otherwise */
> - if (cpu < HVC_BOOT_ARRAY_SIZE)
> + if (cpu < HVC_BOOT_ARRAY_SIZE) {
> p = &hv_clock_boot[cpu];
> - else
> - p = kzalloc(sizeof(*p), GFP_KERNEL);
> + } else {
> + int rc;
> + unsigned int sz = sizeof(*p);
> +
> + if (sev_active())
> + sz = PAGE_ALIGN(sz);
This is a definite downside to the section approach. Unless I missed
something, the section padding goes to waste since we don't have a
mechanism in place to allocate into that section, e.g. as is we're
burning nearly 2mb of data since we're only using 4k of the 2mb page.
And every decrypted allocation can potentially fracture a large page
since the allocator is unaware of the decrypted requirement. Might
not be an issue for kvmclock since it's a one-time allocation, but
we could suffer death by a thousand cuts if there are scenarios where
a decrypted allocation isn't persistent (VirtIO queues maybe?).
Duplicating the full kernel tables for C=0 accesses doesn't suffer
from these issues. And I think potential corruption issues due to
mis-{aligned,sized} objects can be detected through static analysis,
build assertions and/or runtime checks.
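For example (purely illustrative, and assuming the duplicate-tables
approach), a check along these lines could catch a mis-sized object
at build time:

  /*
   * Hypothetical build-time check (not part of this series): an
   * object accessed through a C=0 alias must be page sized, so the
   * alias cannot expose unrelated encrypted data in the same page.
   */
  BUILD_BUG_ON(sizeof(hv_clock_boot) % PAGE_SIZE);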
> + p = kzalloc(sz, GFP_KERNEL);
For the SEV case, can't we do a straight kmalloc() since we zero
out the page after decrypting it?
> +
> + /*
> + * The physical address of per-cpu variable will be shared with
> + * the hypervisor. Let's clear the C-bit before we assign the
> + * memory to per_cpu variable.
> + */
> + if (p && sev_active()) {
> + rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
> + if (rc)
> + return rc;
> + memset(p, 0, sz);
> + }
> + }
>
> per_cpu(hv_clock_per_cpu, cpu) = p;
> return p ? 0 : -ENOMEM;
> --
> 2.7.4
>
On 08/29/2018 02:56 PM, Sean Christopherson wrote:
> On Wed, Aug 29, 2018 at 01:24:00PM -0500, Brijesh Singh wrote:
>> The following commit:
>>
>> 368a540e0232 (x86/kvmclock: Remove memblock dependency)
>
> Checkpatch prefers:
>
> Commit 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
>
> That'll also save three lines in the commit message.
Noted.
>
>> caused SEV guest regression. When SEV is active, we map the shared
>> variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
>> the guest and the hypervisor is able to access the data. To map the
>
> Nit: s/is/are
Noted.
>
>> variables we use kernel_physical_mapping_init() to split the large pages,
>> but this routine fails to allocate a new page. Before the above commit,
>> kvmclock initialization was called after memory allocator was available
>> but now its called early during boot.
>
> What about something like this to make the issue a bit clearer:
>
> variables we use kernel_physical_mapping_init() to split the large pages,
> but splitting large pages requires allocating a new PMD, which fails now
> that kvmclock initialization is called early during boot.
>
Much better.
>> Recently we added a special .data..decrypted section to hold the shared
>> variables. This section is mapped with C=0 early during boot. Use
>> __decrypted attribute to put the wall_clock and hv_clock_boot in
>> .data..decrypted section so that they are mapped with C=0.
>>
>> Signed-off-by: Brijesh Singh <[email protected]>
>> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
>> Cc: Tom Lendacky <[email protected]>
>> Cc: [email protected]
>> Cc: Thomas Gleixner <[email protected]>
>> Cc: Borislav Petkov <[email protected]>
>> Cc: "H. Peter Anvin" <[email protected]>
>> Cc: [email protected]
>> Cc: Paolo Bonzini <[email protected]>
>> Cc: Sean Christopherson <[email protected]>
>> Cc: [email protected]
>> Cc: "Radim Krčmář" <[email protected]>
>> ---
>> arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
>> 1 file changed, 25 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index 1e67646..08f5f8a 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -28,6 +28,7 @@
>> #include <linux/sched/clock.h>
>> #include <linux/mm.h>
>> #include <linux/slab.h>
>> +#include <linux/set_memory.h>
>>
>> #include <asm/hypervisor.h>
>> #include <asm/mem_encrypt.h>
>> @@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
>> (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
>>
>> static struct pvclock_vsyscall_time_info
>> - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
>> -static struct pvclock_wall_clock wall_clock;
>> + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
>> +static struct pvclock_wall_clock wall_clock __decrypted;
>> static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>>
>> static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
>> @@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>> return 0;
>>
>> /* Use the static page for the first CPUs, allocate otherwise */
>> - if (cpu < HVC_BOOT_ARRAY_SIZE)
>> + if (cpu < HVC_BOOT_ARRAY_SIZE) {
>> p = &hv_clock_boot[cpu];
>> - else
>> - p = kzalloc(sizeof(*p), GFP_KERNEL);
>> + } else {
>> + int rc;
>> + unsigned int sz = sizeof(*p);
>> +
>> + if (sev_active())
>> + sz = PAGE_ALIGN(sz);
>
> This is a definite downside to the section approach. Unless I missed
> something, the section padding goes to waste since we don't have a
> mechanism in place to allocate into that section, e.g. as is we're
> burning nearly 2mb of data since we're only using 4k of the 2mb page.
> And every decrypted allocation can potentially fracture a large page
> since the allocator is unaware of the decrypted requirement. Might
> not be an issue for kvmclock since it's a one-time allocation, but
> we could suffer death by a thousand cuts if there are scenarios where
> a decrypted allocation isn't persistent (VirtIO queues maybe?).
>
The .data..decrypted section is used for storing static variables
(which need to be accessed unencrypted before the dynamic allocators
are ready). If a caller does a dynamic allocation and wants to use the
buffer as decrypted, then the caller is responsible for setting and
clearing the C-bit using set_memory_{decrypted,encrypted}. Currently,
the decrypted section holds the wall_clock and hv_clock_boot variables,
but its users will grow when we add SEV-ES support. In the SEV-ES case,
the GHCB (guest-host communication block) needs to be accessed
unencrypted very early during boot.
SEV uses SWIOTLB for DMA buffers; the buffer pool is allocated and
mapped as decrypted during boot. For the SEV case, Virtio uses the DMA
APIs to allocate the VRING, so the caller does not need to map the
buffers as decrypted. Actually, there are very few
set_memory_{decrypted,encrypted} calls overall (during or after
kernel boot).
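For reference, the pattern is a sketch of what the kvmclock hunk in
this series does:

  struct pvclock_vsyscall_time_info *p;
  unsigned int sz = PAGE_ALIGN(sizeof(*p));
  int rc;

  /* Share a dynamically allocated buffer with the hypervisor (SEV). */
  p = kzalloc(sz, GFP_KERNEL);
  if (p && sev_active()) {
          /* Clear the C-bit on the whole allocation... */
          rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
          if (rc)
                  return rc;
          /* ...and re-zero it: the plaintext view is garbage after the
           * C-bit flip, and the hypervisor must not see stale data. */
          memset(p, 0, sz);
  }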
> Duplicating the full kernel tables for C=0 accesses doesn't suffer
> from these issues. And I think potential corruption issues due to
> mis-{aligned,sized} objects can be detected through static analysis,
> build assertions and/or runtime checks.
>
>> + p = kzalloc(sz, GFP_KERNEL);
>
> For the SEV case, can't we do a straight kmalloc() since we zero
> out the page after decrypting it?
>
Sure, we can do kmalloc(); but IMO, since this is not a hot code path,
doing kmalloc() for SEV and kzalloc() for non-SEV does not buy us much.
>> +
>> + /*
>> + * The physical address of per-cpu variable will be shared with
>> + * the hypervisor. Let's clear the C-bit before we assign the
>> + * memory to per_cpu variable.
>> + */
>> + if (p && sev_active()) {
>> + rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
>> + if (rc)
>> + return rc;
>> + memset(p, 0, sz);
>> + }
>> + }
>>
>> per_cpu(hv_clock_per_cpu, cpu) = p;
>> return p ? 0 : -ENOMEM;
>> --
>> 2.7.4
>>