From: Tianyu Lan <[email protected]>
This patchset adds AMD SEV-SNP enlightened guest support
on Hyper-V. Hyper-V uses the Linux direct boot mode to boot
the Linux kernel, so the kernel needs to pvalidate system
memory by itself.
In the Hyper-V case there is no boot loader, so the CC blob
is prepared by the hypervisor. In this series, the hypervisor
sets the CC blob address directly in the Linux kernel's boot
parameters. If the magic number at the CC blob address is
valid, the kernel reads the CC blob.
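For reference, a sketch of the check as patch 1 implements it
in sev_enable() (the NULL test guards the case where no blob
was passed):

	struct cc_blob_sev_info *cc_info;

	if (bp) {
		cc_info = (struct cc_blob_sev_info *)(unsigned long)
				bp->cc_blob_address;

		/* Keep the address only if the header magic matches. */
		if (!cc_info || cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
			bp->cc_blob_address = 0;
	}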
Memory shared between the guest and the hypervisor must be
decrypted, and it must be zeroed after decryption: changing
the encryption attribute smears (garbles) the data at the
target address, so zeroing avoids consuming that smeared data.
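As a reference, a minimal sketch of the decrypt-then-zero
pattern this series applies to each shared page (the helper
name hv_share_page_with_host() is hypothetical; the
set_memory_*() calls are the existing x86 helpers):

	#include <linux/set_memory.h>
	#include <linux/string.h>

	static int hv_share_page_with_host(void *page)
	{
		int ret;

		/*
		 * Flip the page to shared/decrypted. Its previous
		 * contents become garbled (smeared) ciphertext.
		 */
		ret = set_memory_decrypted((unsigned long)page, 1);
		if (ret)
			return ret;

		/* Zero the page so smeared data is never consumed. */
		memset(page, 0, PAGE_SIZE);
		return 0;
	}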
Introduce #HV exception support in the AMD SEV-SNP code and
add a #HV handler.
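For orientation, the #HV doorbell page layout and pending-event
check, as introduced by the last two patches in this series
(the hypervisor posts the pending vector here instead of
injecting the interrupt directly):

	struct sev_hv_doorbell_page {
		union {
			u16 pending_events;
			struct {
				u8 vector;
				u8 nmi : 1;
				u8 mc : 1;
				u8 reserved1 : 5;
				u8 no_further_signal : 1;
			};
		};
		u8 no_eoi_required;
		u8 reserved2[61];
		u8 padding[4032];
	};

	/* The #HV handler drains events while a vector is pending. */
	static u8 sev_hv_pending(void)
	{
		return sev_snp_current_doorbell_page()->vector;
	}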
Tianyu Lan (17):
x86/boot: Check boot param's cc_blob_address for direct boot mode
x86/sev: Pvalidate memory gap for decompressing kernel
x86/hyperv: Add sev-snp enlightened guest specific config
x86/hyperv: apic change for sev-snp enlightened guest
x86/hyperv: Decrypt hv vp assist page in sev-snp enlightened guest
x86/hyperv: Get Virtual Trust Level via hvcall
x86/hyperv: Use vmmcall to implement hvcall in sev-snp enlightened
guest
clocksource: hyper-v: decrypt hyperv tsc page in sev-snp enlightened
guest
x86/hyperv: decrypt vmbus pages for sev-snp enlightened guest
x86/hyperv: set target vtl in the vmbus init message
drivers: hv: Decrypt percpu hvcall input arg page in sev-snp
enlightened guest
Drivers: hv: vmbus: Decrypt vmbus ring buffer
x86/hyperv: Initialize cpu and memory for sev-snp enlightened guest
x86/hyperv: Add smp support for sev-snp guest
x86/hyperv: Add hyperv-specific handling for VMMCALL under SEV-ES
x86/sev: Add a #HV exception handler
x86/sev: Initialize #HV doorbell and handle interrupt requests
arch/x86/boot/compressed/head_64.S | 8 +
arch/x86/boot/compressed/sev.c | 111 +++++++-
arch/x86/entry/entry_64.S | 76 +++++
arch/x86/hyperv/hv_apic.c | 79 ++++--
arch/x86/hyperv/hv_init.c | 47 ++++
arch/x86/hyperv/ivm.c | 12 +-
arch/x86/include/asm/cpu_entry_area.h | 6 +
arch/x86/include/asm/idtentry.h | 39 ++-
arch/x86/include/asm/irqflags.h | 19 ++
arch/x86/include/asm/mem_encrypt.h | 2 +
arch/x86/include/asm/mshyperv.h | 68 +++--
arch/x86/include/asm/msr-index.h | 6 +
arch/x86/include/asm/page_64_types.h | 1 +
arch/x86/include/asm/sev.h | 13 +
arch/x86/include/asm/svm.h | 55 +++-
arch/x86/include/asm/trapnr.h | 1 +
arch/x86/include/asm/traps.h | 1 +
arch/x86/include/uapi/asm/svm.h | 4 +
arch/x86/kernel/cpu/common.c | 1 +
arch/x86/kernel/cpu/mshyperv.c | 267 +++++++++++++++++-
arch/x86/kernel/dumpstack_64.c | 9 +-
arch/x86/kernel/idt.c | 1 +
arch/x86/kernel/sev.c | 384 ++++++++++++++++++++++----
arch/x86/kernel/traps.c | 50 ++++
arch/x86/mm/cpu_entry_area.c | 2 +
drivers/clocksource/hyperv_timer.c | 2 +-
drivers/hv/connection.c | 14 +
drivers/hv/hv.c | 32 ++-
drivers/hv/hv_common.c | 22 ++
drivers/hv/ring_buffer.c | 7 +-
include/asm-generic/hyperv-tlfs.h | 19 ++
include/asm-generic/mshyperv.h | 2 +
include/linux/hyperv.h | 4 +-
33 files changed, 1250 insertions(+), 114 deletions(-)
--
2.25.1
From: Tianyu Lan <[email protected]>
In an SEV-SNP enlightened guest, hypercalls must use VMMCALL to
trigger a VM exit and notify the hypervisor to handle the
hypercall request.
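For context, a hypothetical caller is unchanged by this patch;
only the exit mechanism differs (hv_do_hypercall() and
hv_result_success() are the existing helpers, and interrupts
are assumed disabled around the per-cpu access):

	u64 status;
	void *input = *(void **)this_cpu_ptr(hyperv_pcpu_input_arg);

	/* ... fill *input with the hypercall's input block ... */

	/*
	 * On an enlightened SEV-SNP guest this now exits via
	 * VMMCALL instead of calling through the hypercall page.
	 */
	status = hv_do_hypercall(HVCALL_POST_MESSAGE, input, NULL);
	if (!hv_result_success(status))
		pr_err("hypercall failed: 0x%llx\n", status);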
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/include/asm/mshyperv.h | 66 ++++++++++++++++++++++-----------
1 file changed, 45 insertions(+), 21 deletions(-)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 9b8c3f638845..28d5429e33c9 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -45,16 +45,25 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 hv_status;
#ifdef CONFIG_X86_64
- if (!hv_hypercall_pg)
- return U64_MAX;
+ if (hv_isolation_type_en_snp()) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+ } else {
+ if (!hv_hypercall_pg)
+ return U64_MAX;
- __asm__ __volatile__("mov %4, %%r8\n"
- CALL_NOSPEC
- : "=a" (hv_status), ASM_CALL_CONSTRAINT,
- "+c" (control), "+d" (input_address)
- : "r" (output_address),
- THUNK_TARGET(hv_hypercall_pg)
- : "cc", "memory", "r8", "r9", "r10", "r11");
+ __asm__ __volatile__("mov %4, %%r8\n"
+ CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+ }
#else
u32 input_address_hi = upper_32_bits(input_address);
u32 input_address_lo = lower_32_bits(input_address);
@@ -82,12 +91,18 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_en_snp()) {
+ __asm__ __volatile__(
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ :: "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__(CALL_NOSPEC
- : "=a" (hv_status), ASM_CALL_CONSTRAINT,
- "+c" (control), "+d" (input1)
- : THUNK_TARGET(hv_hypercall_pg)
- : "cc", "r8", "r9", "r10", "r11");
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
}
#else
{
@@ -113,14 +128,21 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_en_snp()) {
__asm__ __volatile__("mov %4, %%r8\n"
- CALL_NOSPEC
- : "=a" (hv_status), ASM_CALL_CONSTRAINT,
- "+c" (control), "+d" (input1)
- : "r" (input2),
- THUNK_TARGET(hv_hypercall_pg)
- : "cc", "r8", "r9", "r10", "r11");
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2)
+ : "cc", "r8", "r9", "r10", "r11");
+ } else {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
}
#else
{
@@ -177,6 +199,7 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
+int hv_snp_boot_ap(int cpu, unsigned long start_ip);
#ifdef CONFIG_AMD_MEM_ENCRYPT
void hv_ghcb_msr_write(u64 msr, u64 value);
@@ -191,6 +214,7 @@ static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
#endif
extern bool hv_isolation_type_snp(void);
+extern bool hv_isolation_type_en_snp(void);
static inline bool hv_is_synic_reg(unsigned int reg)
{
--
2.25.1
From: Tianyu Lan <[email protected]>
An SEV-SNP enlightened guest runs at a VTL (Virtual Trust
Level). Query the current VTL from Hyper-V via the
HVCALL_GET_VP_REGISTERS hypercall (register 0x000D0003, which
is HvRegisterVsmVpStatus in the TLFS; its low four bits report
the active VTL).
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/hyperv/hv_init.c | 35 ++++++++++++++++++++++++++++++++++
include/asm-generic/mshyperv.h | 2 ++
2 files changed, 37 insertions(+)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 4600c5941957..5b919d4d24c0 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -390,6 +390,39 @@ static void __init hv_get_partition_id(void)
local_irq_restore(flags);
}
+static u8 __init get_current_vtl(void)
+{
+ u64 control = ((u64)1 << HV_HYPERCALL_REP_COMP_OFFSET) | HVCALL_GET_VP_REGISTERS;
+ struct hv_get_vp_registers_input *input = NULL;
+ struct hv_get_vp_registers_output *output = NULL;
+ u8 vtl = 0;
+ int ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input = *(struct hv_get_vp_registers_input **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = (struct hv_get_vp_registers_output *)input;
+ if (!input || !output) {
+ pr_err("Hyper-V: cannot allocate a shared page!");
+ goto done;
+ }
+
+ memset(input, 0, sizeof(*input) + sizeof(input->element[0]));
+ input->header.partitionid = HV_PARTITION_ID_SELF;
+ input->header.inputvtl = 0;
+ input->element[0].name0 = 0x000D0003;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (ret == 0)
+ vtl = output->as64.low & 0xf;
+ else
+ pr_err("Hyper-V: failed to get the current VTL!");
+
+done:
+	local_irq_restore(flags);
+ return vtl;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -527,6 +560,8 @@ void __init hyperv_init(void)
if (hv_is_isolation_supported())
swiotlb_update_mem_attributes();
#endif
+ /* Find the current VTL */
+ ms_hyperv.vtl = get_current_vtl();
return;
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index bfb9eb9d7215..68133de044ec 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -46,6 +46,7 @@ struct ms_hyperv_info {
};
};
u64 shared_gpa_boundary;
+ u8 vtl;
};
extern struct ms_hyperv_info ms_hyperv;
@@ -55,6 +56,7 @@ extern void * __percpu *hyperv_pcpu_output_arg;
extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
extern bool hv_isolation_type_snp(void);
+extern bool hv_isolation_type_en_snp(void);
/* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */
static inline int hv_result(u64 status)
--
2.25.1
From: Tianyu Lan <[email protected]>
The Hyper-V VP assist page is shared between the SEV-SNP guest
and Hyper-V. Decrypt the page before using it.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/hyperv/hv_init.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 29774126e931..4600c5941957 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -30,6 +30,7 @@
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
#include <linux/swiotlb.h>
+#include <linux/set_memory.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -112,6 +113,11 @@ static int hv_cpu_init(unsigned int cpu)
}
WARN_ON(!(*hvp));
if (*hvp) {
+ if (hv_isolation_type_en_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1) != 0);
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+
msr.enable = 1;
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
@@ -228,6 +234,12 @@ static int hv_cpu_die(unsigned int cpu)
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
union hv_vp_assist_msr_contents msr = { 0 };
+
+ if (hv_isolation_type_en_snp())
+ WARN_ON_ONCE(set_memory_encrypted(
+ (unsigned long)hv_vp_assist_page[cpu],
+ 1) != 0);
+
if (hv_root_partition) {
/*
* For root partition the VP assist page is mapped to
--
2.25.1
From: Tianyu Lan <[email protected]>
The VMBus interrupt, SynIC, and post-message pages are shared
with the hypervisor, so decrypt these pages in the SEV-SNP
guest.
Signed-off-by: Tianyu Lan <[email protected]>
---
drivers/hv/connection.c | 13 +++++++++++++
drivers/hv/hv.c | 32 +++++++++++++++++++++++++++++++-
2 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 9dc27e5d367a..43141225ea15 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -215,6 +215,15 @@ int vmbus_connect(void)
(void *)((unsigned long)vmbus_connection.int_page +
(HV_HYP_PAGE_SIZE >> 1));
+ if (hv_isolation_type_snp() || hv_isolation_type_en_snp()) {
+ ret = set_memory_decrypted((unsigned long)
+ vmbus_connection.int_page, 1);
+ if (ret)
+ goto cleanup;
+
+ memset(vmbus_connection.int_page, 0, PAGE_SIZE);
+ }
+
/*
* Setup the monitor notification facility. The 1st page for
* parent->child and the 2nd page for child->parent
@@ -372,6 +381,10 @@ void vmbus_disconnect(void)
destroy_workqueue(vmbus_connection.work_queue);
if (vmbus_connection.int_page) {
+ if (hv_isolation_type_en_snp())
+ set_memory_encrypted((unsigned long)
+ vmbus_connection.int_page, 1);
+
hv_free_hyperv_page((unsigned long)vmbus_connection.int_page);
vmbus_connection.int_page = NULL;
}
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 4d6480d57546..f9111eb32739 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
+#include <linux/set_memory.h>
#include "hyperv_vmbus.h"
/* The one and only */
@@ -117,7 +118,7 @@ int hv_post_message(union hv_connection_id connection_id,
int hv_synic_alloc(void)
{
- int cpu;
+ int cpu, ret;
struct hv_per_cpu_context *hv_cpu;
/*
@@ -168,6 +169,29 @@ int hv_synic_alloc(void)
pr_err("Unable to allocate post msg page\n");
goto err;
}
+
+ if (hv_isolation_type_en_snp()) {
+ ret = set_memory_decrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ ret |= set_memory_decrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ ret |= set_memory_decrypted((unsigned long)
+ hv_cpu->post_msg_page, 1);
+
+ if (ret) {
+ set_memory_encrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ set_memory_encrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ set_memory_encrypted((unsigned long)
+ hv_cpu->post_msg_page, 1);
+ goto err;
+ }
+
+ memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+ memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+ memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
+ }
}
return 0;
@@ -188,6 +212,12 @@ void hv_synic_free(void)
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
+ if (hv_isolation_type_en_snp()) {
+ set_memory_encrypted((unsigned long)hv_cpu->synic_message_page, 1);
+ set_memory_encrypted((unsigned long)hv_cpu->synic_event_page, 1);
+ set_memory_encrypted((unsigned long)hv_cpu->post_msg_page, 1);
+ }
+
free_page((unsigned long)hv_cpu->synic_event_page);
free_page((unsigned long)hv_cpu->synic_message_page);
free_page((unsigned long)hv_cpu->post_msg_page);
--
2.25.1
From: Tianyu Lan <[email protected]>
Set the target VTL (Virtual Trust Level) in the VMBus init message.
Signed-off-by: Tianyu Lan <[email protected]>
---
drivers/hv/connection.c | 1 +
include/linux/hyperv.h | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 43141225ea15..09a1253b539a 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -98,6 +98,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
*/
if (version >= VERSION_WIN10_V5) {
msg->msg_sint = VMBUS_MESSAGE_SINT;
+ msg->msg_vtl = ms_hyperv.vtl;
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4;
} else {
msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3b42264333ef..2be0b5efd1ea 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -665,8 +665,8 @@ struct vmbus_channel_initiate_contact {
u64 interrupt_page;
struct {
u8 msg_sint;
- u8 padding1[3];
- u32 padding2;
+ u8 msg_vtl;
+ u8 reserved[6];
};
};
u64 monitor_page1;
--
2.25.1
From: Tianyu Lan <[email protected]>
Read the processor and memory info from the specific address
populated by Hyper-V. Initialize the SMP CPU-related ops,
pvalidate system memory, and add it to the e820 table.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/kernel/cpu/mshyperv.c | 75 ++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 2ea4f21c6172..f0c97210c64a 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -34,6 +34,12 @@
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
#include <asm/coco.h>
+#include <asm/io_apic.h>
+#include <asm/svm.h>
+#include <asm/sev.h>
+#include <asm/sev-snp.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -253,6 +259,33 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
}
#endif
+static __init int hv_snp_set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+static __init void hv_snp_get_rtc_noop(struct timespec64 *now) { }
+
+static u32 processor_count;
+
+static __init void hv_snp_get_smp_config(unsigned int early)
+{
+ if (!early) {
+ while (num_processors < processor_count) {
+ early_per_cpu(x86_cpu_to_apicid, num_processors) = num_processors;
+ early_per_cpu(x86_bios_cpu_apicid, num_processors) = num_processors;
+ physid_set(num_processors, phys_cpu_present_map);
+ set_cpu_possible(num_processors, true);
+ set_cpu_present(num_processors, true);
+ num_processors++;
+ }
+ }
+}
+
+struct memory_map_entry {
+ u64 starting_gpn;
+ u64 numpages;
+ u16 type;
+ u16 flags;
+ u32 reserved;
+};
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
@@ -260,6 +293,11 @@ static void __init ms_hyperv_init_platform(void)
int hv_host_info_ebx;
int hv_host_info_ecx;
int hv_host_info_edx;
+ struct memory_map_entry *entry;
+ struct e820_entry *e820_entry;
+ u64 e820_end;
+ u64 ram_end;
+ u64 page;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -477,6 +515,43 @@ static void __init ms_hyperv_init_platform(void)
if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
mark_tsc_unstable("running on Hyper-V");
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.set_wallclock = hv_snp_set_rtc_noop;
+ x86_platform.get_wallclock = hv_snp_get_rtc_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.resources.reserve_resources = x86_init_noop;
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = hv_snp_get_smp_config;
+
+ /*
+ * Hyper-V SEV-SNP enlightened guest doesn't support ioapic
+ * and legacy APIC page read/write. Switch to hv apic here.
+ */
+ disable_ioapic_support();
+ hv_apic_init();
+
+ processor_count = *(u32 *)__va(EN_SEV_SNP_PROCESSOR_INFO_ADDR);
+
+ entry = (struct memory_map_entry *)(__va(EN_SEV_SNP_PROCESSOR_INFO_ADDR)
+ + sizeof(struct memory_map_entry));
+
+ for (; entry->numpages != 0; entry++) {
+ e820_entry = &e820_table->entries[e820_table->nr_entries - 1];
+ e820_end = e820_entry->addr + e820_entry->size;
+ ram_end = (entry->starting_gpn + entry->numpages) * PAGE_SIZE;
+
+ if (e820_end < entry->starting_gpn * PAGE_SIZE)
+ e820_end = entry->starting_gpn * PAGE_SIZE;
+ if (e820_end < ram_end) {
+ pr_info("Hyper-V: add [mem %#018Lx-%#018Lx]\n", e820_end, ram_end - 1);
+ e820__range_add(e820_end, ram_end - e820_end, E820_TYPE_RAM);
+ for (page = e820_end; page < ram_end; page += PAGE_SIZE)
+ pvalidate((unsigned long)__va(page), RMP_PG_SIZE_4K, true);
+ }
+ }
+ }
+
hardlockup_detector_disable();
}
--
2.25.1
From: Tianyu Lan <[email protected]>
Add Hyper-V-specific handling for faults caused by VMMCALL
instructions under SEV-ES.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/kernel/cpu/mshyperv.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index b266f648e5cd..a4e526378603 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -725,6 +725,20 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -732,4 +746,6 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
};
--
2.25.1
From: Tianyu Lan <[email protected]>
Add a #HV exception handler that uses an IST stack.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/entry/entry_64.S | 58 ++++++++++++++++++++++++++
arch/x86/include/asm/cpu_entry_area.h | 6 +++
arch/x86/include/asm/idtentry.h | 39 +++++++++++++++++-
arch/x86/include/asm/page_64_types.h | 1 +
arch/x86/include/asm/trapnr.h | 1 +
arch/x86/include/asm/traps.h | 1 +
arch/x86/kernel/cpu/common.c | 1 +
arch/x86/kernel/dumpstack_64.c | 9 +++-
arch/x86/kernel/idt.c | 1 +
arch/x86/kernel/sev.c | 59 +++++++++++++++++++++++++++
arch/x86/mm/cpu_entry_area.c | 2 +
11 files changed, 175 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9953d966d124..b2059df43c57 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -560,6 +560,64 @@ SYM_CODE_START(\asmsym)
.Lfrom_usermode_switch_stack_\@:
idtentry_body user_\cfunc, has_error_code=1
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+/*
+ * idtentry_hv - Macro to generate entry stub for #HV
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #HV. The #HV handler
+ * runs on an IST stack and needs to be able to support nested #HV exceptions.
+ *
+ * To make this work the #HV entry code tries its best to pretend it doesn't use
+ * an IST stack by switching to the task stack if coming from user-space (which
+ * includes early SYSCALL entry path) or back to the stack in the IRET frame if
+ * entered from kernel-mode.
+ *
+ * If entered from kernel-mode the return stack is validated first, and if it is
+ * not safe to use (e.g. because it points to the entry stack) the #HV handler
+ * will switch to a fall-back stack (HV2) and call a special handler function.
+ *
+ * The macro is currently only used for the #HV vector, but it is written
+ * so that it can be extended to related exceptions in the future.
+ */
+.macro idtentry_hv vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
+ ASM_CLAC
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ /*
+ * Switch off the IST stack to make it free for nested exceptions.
+ */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call hv_switch_off_ist
+ movq %rax, %rsp /* Switch to new stack */
+
+ UNWIND_HINT_REGS
+
+ /* Update pt_regs */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+
+ movq %rsp, %rdi /* pt_regs pointer */
+ call kernel_\cfunc
+
+ jmp paranoid_exit
+
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body user_\cfunc, has_error_code=1
+
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 75efc4c6f076..f173a16cfc59 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -30,6 +30,10 @@
char VC_stack[optional_stack_size]; \
char VC2_stack_guard[guardsize]; \
char VC2_stack[optional_stack_size]; \
+ char HV_stack_guard[guardsize]; \
+ char HV_stack[optional_stack_size]; \
+ char HV2_stack_guard[guardsize]; \
+ char HV2_stack[optional_stack_size]; \
char IST_top_guard[guardsize]; \
/* The exception stacks' physical storage. No guard pages required */
@@ -52,6 +56,8 @@ enum exception_stack_ordering {
ESTACK_MCE,
ESTACK_VC,
ESTACK_VC2,
+ ESTACK_HV,
+ ESTACK_HV2,
N_EXCEPTION_STACKS
};
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..ed68acd6f723 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -317,6 +317,19 @@ static __always_inline void __##func(struct pt_regs *regs)
__visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
__visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
+
+/**
+ * DECLARE_IDTENTRY_HV - Declare functions for the HV entry point
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW, but declares also the user C handler.
+ */
+#define DECLARE_IDTENTRY_HV(vector, func) \
+ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
+ __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
+ __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
+
/**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points
* @func: Function name of the entry point
@@ -376,6 +389,26 @@ static __always_inline void __##func(struct pt_regs *regs)
#define DEFINE_IDTENTRY_VC_USER(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
+/**
+ * DEFINE_IDTENTRY_HV_KERNEL - Emit code for HV injection handler
+ * when raised from kernel mode
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW
+ */
+#define DEFINE_IDTENTRY_HV_KERNEL(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
+
+/**
+ * DEFINE_IDTENTRY_HV_USER - Emit code for HV injection handler
+ * when raised from user mode
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW
+ */
+#define DEFINE_IDTENTRY_HV_USER(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
+
#else /* CONFIG_X86_64 */
/**
@@ -465,6 +498,9 @@ __visible noinstr void func(struct pt_regs *regs, \
# define DECLARE_IDTENTRY_VC(vector, func) \
idtentry_vc vector asm_##func func
+# define DECLARE_IDTENTRY_HV(vector, func) \
+ idtentry_hv vector asm_##func func
+
#else
# define DECLARE_IDTENTRY_MCE(vector, func) \
DECLARE_IDTENTRY(vector, func)
@@ -622,9 +658,10 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
#endif
-/* #VC */
+/* #VC & #HV */
#ifdef CONFIG_AMD_MEM_ENCRYPT
DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
+DECLARE_IDTENTRY_HV(X86_TRAP_HV, exc_hv_injection);
#endif
#ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index e9e2c3ba5923..0bd7dab676c5 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -29,6 +29,7 @@
#define IST_INDEX_DB 2
#define IST_INDEX_MCE 3
#define IST_INDEX_VC 4
+#define IST_INDEX_HV 5
/*
* Set __PAGE_OFFSET to the most negative possible address +
diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h
index f5d2325aa0b7..c6583631cecb 100644
--- a/arch/x86/include/asm/trapnr.h
+++ b/arch/x86/include/asm/trapnr.h
@@ -26,6 +26,7 @@
#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
#define X86_TRAP_VE 20 /* Virtualization Exception */
#define X86_TRAP_CP 21 /* Control Protection Exception */
+#define X86_TRAP_HV 28 /* HV injected exception in SNP restricted mode */
#define X86_TRAP_VC 29 /* VMM Communication Exception */
#define X86_TRAP_IRET 32 /* IRET Exception */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..6795d3e517d6 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -16,6 +16,7 @@ asmlinkage __visible notrace
struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
+asmlinkage __visible noinstr struct pt_regs *hv_switch_off_ist(struct pt_regs *eregs);
#endif
extern bool ibt_selftest(void);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3e508f239098..87afa3a4c8b1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2165,6 +2165,7 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
/* Only mapped when SEV-ES is active */
tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
+ tss->x86_tss.ist[IST_INDEX_HV] = __this_cpu_ist_top_va(HV);
}
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6c5defd6569a..23aa5912c87a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -26,11 +26,14 @@ static const char * const exception_stack_names[] = {
[ ESTACK_MCE ] = "#MC",
[ ESTACK_VC ] = "#VC",
[ ESTACK_VC2 ] = "#VC2",
+ [ ESTACK_HV ] = "#HV",
+ [ ESTACK_HV2 ] = "#HV2",
+
};
const char *stack_type_name(enum stack_type type)
{
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 8);
if (type == STACK_TYPE_TASK)
return "TASK";
@@ -89,6 +92,8 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(MCE),
EPAGERANGE(VC),
EPAGERANGE(VC2),
+ EPAGERANGE(HV),
+ EPAGERANGE(HV2),
};
static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
@@ -98,7 +103,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
struct pt_regs *regs;
unsigned int k;
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 8);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/*
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index a58c6bc1cd68..48c0a7e1dbcb 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -113,6 +113,7 @@ static const __initconst struct idt_data def_idts[] = {
#ifdef CONFIG_AMD_MEM_ENCRYPT
ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
+ ISTG(X86_TRAP_HV, asm_exc_hv_injection, IST_INDEX_HV),
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a428c62330d3..63ddb043d16d 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -2004,6 +2004,65 @@ DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
irqentry_exit_to_user_mode(regs);
}
+static bool hv_raw_handle_exception(struct pt_regs *regs)
+{
+ return false;
+}
+
+static __always_inline bool on_hv_fallback_stack(struct pt_regs *regs)
+{
+ unsigned long sp = (unsigned long)regs;
+
+ return (sp >= __this_cpu_ist_bottom_va(HV2) && sp < __this_cpu_ist_top_va(HV2));
+}
+
+DEFINE_IDTENTRY_HV_USER(exc_hv_injection)
+{
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ if (!hv_raw_handle_exception(regs)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal
+ * with it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+}
+
+DEFINE_IDTENTRY_HV_KERNEL(exc_hv_injection)
+{
+ irqentry_state_t irq_state;
+
+ if (unlikely(on_hv_fallback_stack(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #HV exception from unsupported context\n");
+ instrumentation_end();
+ }
+
+ irq_state = irqentry_nmi_enter(regs);
+ instrumentation_begin();
+
+ if (!hv_raw_handle_exception(regs)) {
+ pr_emerg("PANIC: Unhandled #HV exception in kernel space\n");
+
+ /* Show some debug info */
+ show_regs(regs);
+
+ /* Ask hypervisor to sev_es_terminate */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ panic("Returned from Terminate-Request to Hypervisor\n");
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
{
unsigned long exit_code = regs->orig_ax;
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 6c2f1b76a0b6..608905dc6704 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -115,6 +115,8 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
cea_map_stack(VC);
cea_map_stack(VC2);
+ cea_map_stack(HV);
+ cea_map_stack(HV2);
}
}
}
--
2.25.1
From: Tianyu Lan <[email protected]>
The hypervisor needs to access the per-cpu input arg page, so
the guest should decrypt the page.
Signed-off-by: Tianyu Lan <[email protected]>
---
drivers/hv/hv_common.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 2c6602571c47..c16961e686a0 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -21,6 +21,7 @@
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/dma-map-ops.h>
+#include <linux/set_memory.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -125,6 +126,7 @@ int hv_common_cpu_init(unsigned int cpu)
u64 msr_vp_index;
gfp_t flags;
int pgcount = hv_root_partition ? 2 : 1;
+ int ret;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -134,6 +136,16 @@ int hv_common_cpu_init(unsigned int cpu)
if (!(*inputarg))
return -ENOMEM;
+ if (hv_isolation_type_en_snp()) {
+ ret = set_memory_decrypted((unsigned long)*inputarg, 1);
+ if (ret) {
+ kfree(*inputarg);
+ return ret;
+ }
+
+ memset(*inputarg, 0x00, PAGE_SIZE);
+ }
+
if (hv_root_partition) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
@@ -168,6 +180,9 @@ int hv_common_cpu_die(unsigned int cpu)
local_irq_restore(flags);
+ if (hv_isolation_type_en_snp())
+ set_memory_encrypted((unsigned long)mem, 1);
+
kfree(mem);
return 0;
--
2.25.1
From: Tianyu Lan <[email protected]>
Introduce the static key isolation_type_en_snp for the
enlightened guest check and add some guest-specific options in
ms_hyperv_init_platform().
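For illustration, a hedged sketch of how the two checks are
meant to be distinguished (hv_isolation_type_snp() covers the
existing vTOM-based isolation VMs, while the new key covers
fully enlightened SEV-SNP guests):

	if (hv_isolation_type_en_snp()) {
		/*
		 * Fully enlightened SEV-SNP guest: the guest manages
		 * memory encryption itself (decrypts shared pages,
		 * uses VMMCALL-based hypercalls, handles #HV).
		 */
	} else if (hv_isolation_type_snp()) {
		/* SEV-SNP isolation VM with vTOM support. */
	}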
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/hyperv/ivm.c | 12 +++++++++++-
arch/x86/include/asm/mshyperv.h | 2 ++
arch/x86/kernel/cpu/mshyperv.c | 29 ++++++++++++++++++++++++-----
drivers/hv/hv_common.c | 7 +++++++
4 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 1dbcbd9da74d..e9c30dad3419 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -259,10 +259,20 @@ bool hv_is_isolation_supported(void)
}
DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+DEFINE_STATIC_KEY_FALSE(isolation_type_en_snp);
+
+/*
+ * hv_isolation_type_en_snp - Check system runs in the AMD SEV-SNP based
+ * isolation enlightened VM.
+ */
+bool hv_isolation_type_en_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_en_snp);
+}
/*
* hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
- * isolation VM.
+ * isolation VM with vTOM support.
*/
bool hv_isolation_type_snp(void)
{
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 61f0c206bff0..9b8c3f638845 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -14,6 +14,7 @@
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_en_snp);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -32,6 +33,7 @@ extern u64 hv_current_partition_id;
extern union hv_ghcb * __percpu *hv_ghcb_pg;
+extern bool hv_isolation_type_en_snp(void);
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 831613959a92..2ea4f21c6172 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -273,6 +273,21 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+ /*
+ * Add custom configuration for SEV-SNP Enlightened guest
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ ms_hyperv.features |= HV_ACCESS_FREQUENCY_MSRS;
+ ms_hyperv.misc_features |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+ ms_hyperv.hints |= HV_X64_APIC_ACCESS_RECOMMENDED;
+ ms_hyperv.hints |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+ }
+
+ pr_info("Hyper-V: enlightment features 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
@@ -328,18 +343,22 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.shared_gpa_boundary =
BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
- pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
- ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
-
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ static_branch_enable(&isolation_type_en_snp);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
#ifdef CONFIG_SWIOTLB
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
#endif
}
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
/* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
+ if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE
+ && !cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
cc_set_vendor(CC_VENDOR_HYPERV);
}
}
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index ae68298c0dca..2c6602571c47 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -268,6 +268,13 @@ bool __weak hv_isolation_type_snp(void)
}
EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
+bool __weak hv_isolation_type_en_snp(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_isolation_type_en_snp);
+
+
void __weak hv_setup_vmbus_handler(void (*handler)(void))
{
}
--
2.25.1
From: Tianyu Lan <[email protected]>
A Hyper-V SEV-SNP enlightened guest doesn't support x2APIC or
APIC page read/write operations, so bypass those requests. An
IPI request may return a timeout error code, so add a retry
mechanism.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/hyperv/hv_apic.c | 79 ++++++++++++++++++++++++-------
include/asm-generic/hyperv-tlfs.h | 1 +
2 files changed, 63 insertions(+), 17 deletions(-)
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..214354d20833 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -66,9 +66,15 @@ static u32 hv_apic_read(u32 reg)
rdmsr(HV_X64_MSR_TPR, reg_val, hi);
(void)hi;
return reg_val;
-
+ case APIC_ID:
+ if (hv_isolation_type_en_snp())
+ return smp_processor_id();
+ fallthrough;
default:
- return native_apic_mem_read(reg);
+ if (!hv_isolation_type_en_snp())
+ return native_apic_mem_read(reg);
+ else
+ return 0;
}
}
@@ -82,7 +88,8 @@ static void hv_apic_write(u32 reg, u32 val)
wrmsr(HV_X64_MSR_TPR, val, 0);
break;
default:
- native_apic_mem_write(reg, val);
+ if (!hv_isolation_type_en_snp())
+ native_apic_mem_write(reg, val);
}
}
@@ -106,6 +113,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
+ int retry = 5;
u64 status = HV_STATUS_INVALID_PARAMETER;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
@@ -144,8 +152,10 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
}
- status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ do {
+ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);
+ } while (status == HV_STATUS_TIME_OUT && retry--);
ipi_mask_ex_done:
local_irq_restore(flags);
@@ -159,6 +169,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
struct hv_send_ipi ipi_arg;
u64 status;
unsigned int weight;
+ int retry = 5;
trace_hyperv_send_ipi_mask(mask, vector);
@@ -212,8 +223,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
__set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
}
- status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
- ipi_arg.cpu_mask);
+ do {
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ ipi_arg.cpu_mask);
+ } while (status == HV_STATUS_TIME_OUT && retry--);
+
return hv_result_success(status);
do_ex_hypercall:
@@ -224,6 +238,7 @@ static bool __send_ipi_one(int cpu, int vector)
{
int vp = hv_cpu_number_to_vp_number(cpu);
u64 status;
+ int retry = 5;
trace_hyperv_send_ipi_one(cpu, vector);
@@ -236,26 +251,48 @@ static bool __send_ipi_one(int cpu, int vector)
if (vp >= 64)
return __send_ipi_mask_ex(cpumask_of(cpu), vector, false);
- status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ do {
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ } while (status == HV_STATUS_TIME_OUT && retry--);
+
return hv_result_success(status);
}
static void hv_send_ipi(int cpu, int vector)
{
- if (!__send_ipi_one(cpu, vector))
- orig_apic.send_IPI(cpu, vector);
+ if (!__send_ipi_one(cpu, vector)) {
+ if (!hv_isolation_type_en_snp())
+ orig_apic.send_IPI(cpu, vector);
+ else
+ WARN_ON_ONCE(1);
+ }
}
static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
{
- if (!__send_ipi_mask(mask, vector, false))
- orig_apic.send_IPI_mask(mask, vector);
+ if (!__send_ipi_mask(mask, vector, false)) {
+ if (!hv_isolation_type_en_snp())
+ orig_apic.send_IPI_mask(mask, vector);
+ else
+ WARN_ON_ONCE(1);
+ }
}
static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
- if (!__send_ipi_mask(mask, vector, true))
- orig_apic.send_IPI_mask_allbutself(mask, vector);
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask new_mask;
+ const struct cpumask *local_mask;
+
+ cpumask_copy(&new_mask, mask);
+ cpumask_clear_cpu(this_cpu, &new_mask);
+ local_mask = &new_mask;
+ if (!__send_ipi_mask(local_mask, vector, true)) {
+ if (!hv_isolation_type_en_snp())
+ orig_apic.send_IPI_mask_allbutself(mask, vector);
+ else
+ WARN_ON_ONCE(1);
+ }
}
static void hv_send_ipi_allbutself(int vector)
@@ -265,14 +302,22 @@ static void hv_send_ipi_allbutself(int vector)
static void hv_send_ipi_all(int vector)
{
- if (!__send_ipi_mask(cpu_online_mask, vector, false))
- orig_apic.send_IPI_all(vector);
+ if (!__send_ipi_mask(cpu_online_mask, vector, false)) {
+ if (!hv_isolation_type_en_snp())
+ orig_apic.send_IPI_all(vector);
+ else
+ WARN_ON_ONCE(1);
+ }
}
static void hv_send_ipi_self(int vector)
{
- if (!__send_ipi_one(smp_processor_id(), vector))
- orig_apic.send_IPI_self(vector);
+ if (!__send_ipi_one(smp_processor_id(), vector)) {
+ if (!hv_isolation_type_en_snp())
+ orig_apic.send_IPI_self(vector);
+ else
+ WARN_ON_ONCE(1);
+ }
}
void __init hv_apic_init(void)
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index fdce7a4cfc6f..6e2a090e2649 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -208,6 +208,7 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+#define HV_STATUS_TIME_OUT 0x78
/*
* The Hyper-V TimeRefCount register and the TSC
--
2.25.1
From: Tianyu Lan <[email protected]>
The hypervisor may pass the CC blob address directly in the
boot params' cc_blob_address in direct boot mode. Check the CC
blob header magic first in sev_enable() and use it as the CC
blob address if the check succeeds.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/boot/compressed/sev.c | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index c93930d5ccbd..960968f8bf75 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -272,17 +272,24 @@ static void enforce_vmpl0(void)
void sev_enable(struct boot_params *bp)
{
+ struct cc_blob_sev_info *cc_info;
unsigned int eax, ebx, ecx, edx;
struct msr m;
bool snp;
/*
- * bp->cc_blob_address should only be set by boot/compressed kernel.
- * Initialize it to 0 to ensure that uninitialized values from
- * buggy bootloaders aren't propagated.
+ * bp->cc_blob_address should only be set by boot/compressed
+ * kernel and hypervisor with direct boot mode. Initialize it
+ * to 0 after checking in order to ensure that uninitialized
+ * values from buggy bootloaders aren't propagated.
*/
- if (bp)
- bp->cc_blob_address = 0;
+ if (bp) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)
+ bp->cc_blob_address;
+
+ if (!cc_info || cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ bp->cc_blob_address = 0;
+ }
/*
* Setup/preliminary detection of SNP. This will be sanity-checked
@@ -374,6 +381,10 @@ static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address)
+ return (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+
cc_info = find_cc_blob_efi(bp);
if (cc_info)
goto found_cc_info;
@@ -416,9 +427,11 @@ bool snp_init(struct boot_params *bp)
/*
* Pass run-time kernel a pointer to CC info via boot_params so EFI
* config table doesn't need to be searched again during early startup
- * phase.
+ * phase. The hypervisor may also populate cc_blob_address directly
+ * in direct boot mode.
*/
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
+ if (!bp->cc_blob_address)
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
return true;
}
--
2.25.1
From: Tianyu Lan <[email protected]>
The Hyper-V TSC page is shared with the hypervisor, so it must
be decrypted in an SEV-SNP enlightened guest when it is used;
place it in the __bss_decrypted section.
Signed-off-by: Tianyu Lan <[email protected]>
---
drivers/clocksource/hyperv_timer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index bb47610bbd1c..aa68eebed5ee 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
static union {
struct ms_hyperv_tsc_page page;
u8 reserved[PAGE_SIZE];
-} tsc_pg __aligned(PAGE_SIZE);
+} tsc_pg __bss_decrypted __aligned(PAGE_SIZE);
struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
--
2.25.1
From: Tianyu Lan <[email protected]>
Pvalidate the pages needed for decompressing the kernel. The
E820_TYPE_RAM entries include only validated memory. The kernel
expects each RAM entry's addr to stay fixed while the entry's
size is extended to cover addresses up to the start of the next
entry. This patch increases the RAM entry size to cover all
possible memory addresses up to init_size.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/boot/compressed/head_64.S | 8 +++
arch/x86/boot/compressed/sev.c | 84 ++++++++++++++++++++++++++++++
2 files changed, 92 insertions(+)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d33f060900d2..818edaf5d0cf 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -348,6 +348,14 @@ SYM_CODE_START(startup_64)
cld
cli
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* pvalidate memory on demand if SNP is enabled. */
+ pushq %rsi
+ movq %rsi, %rdi
+ call pvalidate_for_startup_64
+ popq %rsi
+#endif
+
/* Setup data segments. */
xorl %eax, %eax
movl %eax, %ds
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 960968f8bf75..3a5a1ab16095 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -12,8 +12,10 @@
*/
#include "misc.h"
+#include <asm/msr-index.h>
#include <asm/pgtable_types.h>
#include <asm/sev.h>
+#include <asm/svm.h>
#include <asm/trapnr.h>
#include <asm/trap_pf.h>
#include <asm/msr-index.h>
@@ -21,6 +23,7 @@
#include <asm/ptrace.h>
#include <asm/svm.h>
#include <asm/cpuid.h>
+#include <asm/e820/types.h>
#include "error.h"
#include "../msr.h"
@@ -117,6 +120,22 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
/* Include code for early handlers */
#include "../../kernel/sev-shared.c"
+/* Check SEV-SNP via MSR */
+static bool sev_snp_runtime_check(void)
+{
+ unsigned long low, high;
+ u64 val;
+
+ asm volatile("rdmsr\n" : "=a" (low), "=d" (high) :
+ "c" (MSR_AMD64_SEV));
+
+ val = (high << 32) | low;
+ if (val & MSR_AMD64_SEV_SNP_ENABLED)
+ return true;
+
+ return false;
+}
+
static inline bool sev_snp_enabled(void)
{
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
@@ -456,3 +475,68 @@ void sev_prep_identity_maps(unsigned long top_level_pgt)
sev_verify_cbit(top_level_pgt);
}
+
+static void extend_e820_on_demand(struct boot_e820_entry *e820_entry,
+ u64 needed_ram_end)
+{
+ u64 end, paddr;
+ unsigned long eflags;
+ int rc;
+
+ if (!e820_entry)
+ return;
+
+ /* Validated memory must be aligned by PAGE_SIZE. */
+ end = ALIGN(e820_entry->addr + e820_entry->size, PAGE_SIZE);
+ if (needed_ram_end > end && e820_entry->type == E820_TYPE_RAM) {
+ for (paddr = end; paddr < needed_ram_end; paddr += PAGE_SIZE) {
+ rc = pvalidate(paddr, RMP_PG_SIZE_4K, true);
+ if (rc) {
+ error("Failed to validate address.n");
+ return;
+ }
+ }
+ e820_entry->size = needed_ram_end - e820_entry->addr;
+ }
+}
+
+/*
+ * Explicitly pvalidate needed pages for decompressing the kernel.
+ * The E820_TYPE_RAM entry includes only validated memory. The kernel
+ * expects that the RAM entry's addr is fixed while the entry size is to be
+ * extended to cover addresses to the start of next entry.
+ * The function increases the RAM entry size to cover all possible memory
+ * addresses until init_size.
+ * For example, init_end = 0x4000000,
+ * [RAM: 0x0 - 0x0], M[RAM: 0x0 - 0xa0000]
+ * [RSVD: 0xa0000 - 0x10000] [RSVD: 0xa0000 - 0x10000]
+ * [ACPI: 0x10000 - 0x20000] ==> [ACPI: 0x10000 - 0x20000]
+ * [RSVD: 0x800000 - 0x900000] [RSVD: 0x800000 - 0x900000]
+ * [RAM: 0x1000000 - 0x2000000] M[RAM: 0x1000000 - 0x2001000]
+ * [RAM: 0x2001000 - 0x2007000] M[RAM: 0x2001000 - 0x4000000]
+ * Other RAM memory after init_end is pvalidated by ms_hyperv_init_platform
+ */
+__visible void pvalidate_for_startup_64(struct boot_params *boot_params)
+{
+ struct boot_e820_entry *e820_entry;
+ u64 init_end =
+ boot_params->hdr.pref_address + boot_params->hdr.init_size;
+ u8 i, nr_entries = boot_params->e820_entries;
+ u64 needed_end;
+
+ if (!sev_snp_runtime_check())
+ return;
+
+ for (i = 0; i < nr_entries; ++i) {
+ /* Pvalidate memory holes in e820 RAM entries. */
+ e820_entry = &boot_params->e820_table[i];
+ if (i < nr_entries - 1) {
+ needed_end = boot_params->e820_table[i + 1].addr;
+ if (needed_end < e820_entry->addr)
+ error("e820 table is not sorted.\n");
+ } else {
+ needed_end = init_end;
+ }
+ extend_e820_on_demand(e820_entry, needed_end);
+ }
+}
--
2.25.1
From: Tianyu Lan <[email protected]>
Enable the #HV exception to handle interrupt requests from the hypervisor.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/entry/entry_64.S | 18 ++
arch/x86/include/asm/irqflags.h | 19 ++
arch/x86/include/asm/mem_encrypt.h | 2 +
arch/x86/include/asm/msr-index.h | 6 +
arch/x86/include/uapi/asm/svm.h | 4 +
arch/x86/kernel/sev.c | 327 ++++++++++++++++++++++++-----
arch/x86/kernel/traps.c | 50 +++++
7 files changed, 373 insertions(+), 53 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b2059df43c57..fe460cf44ab5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1058,6 +1058,15 @@ SYM_CODE_END(paranoid_entry)
* R15 - old SPEC_CTRL
*/
SYM_CODE_START_LOCAL(paranoid_exit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * If a #HV was delivered during execution and interrupts were
+ * disabled, then check if it can be handled before the iret
+ * (which may re-enable interrupts).
+ */
+ mov %rsp, %rdi
+ call check_hv_pending
+#endif
UNWIND_HINT_REGS
/*
@@ -1183,6 +1192,15 @@ SYM_CODE_START_LOCAL(error_entry)
SYM_CODE_END(error_entry)
SYM_CODE_START_LOCAL(error_return)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * If a #HV was delivered during execution and interrupts were
+ * disabled, then check if it can be handled before the iret
+ * (which may re-enable interrupts).
+ */
+ mov %rsp, %rdi
+ call check_hv_pending
+#endif
UNWIND_HINT_REGS
DEBUG_ENTRY_ASSERT_IRQS_OFF
testb $3, CS(%rsp)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 7793e52d6237..e0730d8bc0ac 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -14,6 +14,9 @@
/*
* Interrupt control:
*/
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+void check_hv_pending(struct pt_regs *regs);
+#endif
/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
@@ -35,6 +38,19 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
+extern inline void native_restore_fl(unsigned long flags)
+{
+ asm volatile("push %0 ; popf"
+ : /* no output */
+ : "g" (flags)
+ : "memory", "cc");
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ if ((flags & X86_EFLAGS_IF)) {
+ check_hv_pending(NULL);
+ }
+#endif
+}
+
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
@@ -43,6 +59,9 @@ static __always_inline void native_irq_disable(void)
static __always_inline void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ check_hv_pending(NULL);
+#endif
}
static inline __cpuidle void native_safe_halt(void)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 72ca90552b6a..7264ca5f5b2d 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -50,6 +50,7 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
void __init mem_encrypt_free_decrypted_mem(void);
void __init sev_es_init_vc_handling(void);
+void __init sev_snp_init_hv_handling(void);
#define __bss_decrypted __section(".bss..decrypted")
@@ -72,6 +73,7 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
+static inline void sev_snp_init_hv_handling(void) { }
static inline int __init
early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 10ac52705892..6fe25a6e325f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -562,10 +562,16 @@
#define MSR_AMD64_SEV_ENABLED_BIT 0
#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
+#define MSR_AMD64_SEV_REFLECTVC_ENABLED_BIT 4
+#define MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED_BIT 5
+#define MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED_BIT 6
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
+#define MSR_AMD64_SEV_REFLECTVC_ENABLED BIT_ULL(MSR_AMD64_SEV_REFLECTVC_ENABLED_BIT)
+#define MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED BIT_ULL(MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED_BIT)
+#define MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED BIT_ULL(MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
/* AMD Collaborative Processor Performance Control MSRs */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index f69c168391aa..85d6882262e7 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -115,6 +115,10 @@
#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_HV_DOORBELL_PAGE 0x80000014
+#define SVM_VMGEXIT_GET_PREFERRED_HV_DOORBELL_PAGE 0
+#define SVM_VMGEXIT_SET_HV_DOORBELL_PAGE 1
+#define SVM_VMGEXIT_QUERY_HV_DOORBELL_PAGE 2
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 63ddb043d16d..65eb9f96d0c4 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -104,6 +104,12 @@ struct sev_es_runtime_data {
* is currently unsupported in SEV-ES guests.
*/
unsigned long dr7;
+ /*
+ * SEV-SNP requires that the GHCB must be registered before using it.
+ * The flag below will indicate whether the GHCB is registered, if its
+ * not registered then sev_es_get_ghcb() will perform the registration.
+ */
+ bool ghcb_registered;
};
struct ghcb_state {
@@ -122,6 +128,156 @@ struct sev_config {
static struct sev_config sev_cfg __read_mostly;
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state);
+static noinstr void __sev_put_ghcb(struct ghcb_state *state);
+static int vmgexit_hv_doorbell_page(struct ghcb *ghcb, u64 op, u64 pa);
+static void sev_snp_setup_hv_doorbell_page(struct ghcb *ghcb);
+
+struct sev_hv_doorbell_page {
+ union {
+ u16 pending_events;
+ struct {
+ u8 vector;
+ u8 nmi : 1;
+ u8 mc : 1;
+ u8 reserved1 : 5;
+ u8 no_further_signal : 1;
+ };
+ };
+ u8 no_eoi_required;
+ u8 reserved2[61];
+ u8 padding[4032];
+};
+
+struct sev_snp_runtime_data {
+ struct sev_hv_doorbell_page hv_doorbell_page;
+};
+
+static DEFINE_PER_CPU(struct sev_snp_runtime_data*, snp_runtime_data);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = (u32)(val);
+ high = (u32)(val >> 32);
+
+ native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+}
+
+struct sev_hv_doorbell_page *sev_snp_current_doorbell_page(void)
+{
+ return &this_cpu_read(snp_runtime_data)->hv_doorbell_page;
+}
+
+static u8 sev_hv_pending(void)
+{
+ return sev_snp_current_doorbell_page()->vector;
+}
+
+static void hv_doorbell_apic_eoi_write(u32 reg, u32 val)
+{
+ if (xchg(&sev_snp_current_doorbell_page()->no_eoi_required, 0) & 0x1)
+ return;
+
+ BUG_ON(reg != APIC_EOI);
+ apic->write(reg, val);
+}
+
+static void do_exc_hv(struct pt_regs *regs)
+{
+ u8 vector;
+
+ while (sev_hv_pending()) {
+ asm volatile("cli" : : : "memory");
+
+ vector = xchg(&sev_snp_current_doorbell_page()->vector, 0);
+
+ switch (vector) {
+#if IS_ENABLED(CONFIG_HYPERV)
+ case HYPERV_STIMER0_VECTOR:
+ sysvec_hyperv_stimer0(regs);
+ break;
+ case HYPERVISOR_CALLBACK_VECTOR:
+ sysvec_hyperv_callback(regs);
+ break;
+#endif
+#ifdef CONFIG_SMP
+ case RESCHEDULE_VECTOR:
+ sysvec_reschedule_ipi(regs);
+ break;
+ case IRQ_MOVE_CLEANUP_VECTOR:
+ sysvec_irq_move_cleanup(regs);
+ break;
+ case REBOOT_VECTOR:
+ sysvec_reboot(regs);
+ break;
+ case CALL_FUNCTION_SINGLE_VECTOR:
+ sysvec_call_function_single(regs);
+ break;
+ case CALL_FUNCTION_VECTOR:
+ sysvec_call_function(regs);
+ break;
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ case ERROR_APIC_VECTOR:
+ sysvec_error_interrupt(regs);
+ break;
+ case SPURIOUS_APIC_VECTOR:
+ sysvec_spurious_apic_interrupt(regs);
+ break;
+ case LOCAL_TIMER_VECTOR:
+ sysvec_apic_timer_interrupt(regs);
+ break;
+ case X86_PLATFORM_IPI_VECTOR:
+ sysvec_x86_platform_ipi(regs);
+ break;
+#endif
+ case 0x0:
+ break;
+ default:
+ panic("Unexpected vector %d\n", vector);
+ unreachable();
+ }
+
+ asm volatile("sti" : : : "memory");
+ }
+}
+
+void check_hv_pending(struct pt_regs *regs)
+{
+ struct pt_regs local_regs;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ if (regs) {
+ if ((regs->flags & X86_EFLAGS_IF) == 0)
+ return;
+
+ if (!sev_hv_pending())
+ return;
+
+ do_exc_hv(regs);
+ } else {
+ if (sev_hv_pending()) {
+ memset(&local_regs, 0, sizeof(struct pt_regs));
+ regs = &local_regs;
+ asm volatile("movl %%cs, %%eax;" : "=a" (regs->cs));
+ asm volatile("movl %%ss, %%eax;" : "=a" (regs->ss));
+ regs->orig_ax = 0xffffffff;
+ regs->flags = native_save_fl();
+ do_exc_hv(regs);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(check_hv_pending);
+
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -193,68 +349,35 @@ void noinstr __sev_es_ist_exit(void)
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}
-/*
- * Nothing shall interrupt this code path while holding the per-CPU
- * GHCB. The backup GHCB is only for NMIs interrupting this path.
- *
- * Callers must disable local interrupts around it.
- */
-static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+static bool sev_restricted_injection_enabled(void)
{
- struct sev_es_runtime_data *data;
+ return sev_status & MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED;
+}
+
+void __init sev_snp_init_hv_handling(void)
+{
+ struct sev_snp_runtime_data *snp_data;
+ struct ghcb_state state;
struct ghcb *ghcb;
+ unsigned long flags;
+ int cpu;
+ int err;
WARN_ON(!irqs_disabled());
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) || !sev_restricted_injection_enabled())
+ return;
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (unlikely(data->ghcb_active)) {
- /* GHCB is already in use - save its contents */
-
- if (unlikely(data->backup_ghcb_active)) {
- /*
- * Backup-GHCB is also already in use. There is no way
- * to continue here so just kill the machine. To make
- * panic() work, mark GHCBs inactive so that messages
- * can be printed out.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- instrumentation_begin();
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- instrumentation_end();
- }
-
- /* Mark backup_ghcb active before writing to it */
- data->backup_ghcb_active = true;
-
- state->ghcb = &data->backup_ghcb;
-
- /* Backup GHCB content */
- *state->ghcb = *ghcb;
- } else {
- state->ghcb = NULL;
- data->ghcb_active = true;
- }
+ local_irq_save(flags);
- return ghcb;
-}
+ ghcb = __sev_get_ghcb(&state);
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
-}
+ sev_snp_setup_hv_doorbell_page(ghcb);
-static __always_inline void sev_es_wr_ghcb_msr(u64 val)
-{
- u32 low, high;
+ __sev_put_ghcb(&state);
- low = (u32)(val);
- high = (u32)(val >> 32);
+ apic_set_eoi_write(hv_doorbell_apic_eoi_write);
- native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+ local_irq_restore(flags);
}
static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
@@ -515,6 +638,79 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ /* SEV-SNP guest requires that GHCB must be registered before using it. */
+ if (!data->ghcb_registered) {
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ snp_register_ghcb_early(__pa(ghcb));
+ sev_snp_setup_hv_doorbell_page(ghcb);
+ } else {
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ }
+ data->ghcb_registered = true;
+ }
+
+ return ghcb;
+}
+
+static void sev_snp_setup_hv_doorbell_page(struct ghcb *ghcb)
+{
+ u64 pa;
+ enum es_result ret;
+
+ pa = __pa(sev_snp_current_doorbell_page());
+ vc_ghcb_invalidate(ghcb);
+ ret = vmgexit_hv_doorbell_page(ghcb,
+ SVM_VMGEXIT_SET_HV_DOORBELL_PAGE, pa);
+ if (ret != ES_OK)
+ panic("SEV-SNP: failed to set up #HV doorbell page");
+}
+
static noinstr void __sev_put_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
@@ -1282,6 +1478,11 @@ void setup_ghcb(void)
snp_register_ghcb_early(__pa(&boot_ghcb_page));
}
+int vmgexit_hv_doorbell_page(struct ghcb *ghcb, u64 op, u64 pa)
+{
+ return sev_es_ghcb_hv_call(ghcb, NULL, SVM_VMGEXIT_HV_DOORBELL_PAGE, op, pa);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
static void sev_es_ap_hlt_loop(void)
{
@@ -1355,6 +1556,7 @@ static void __init alloc_runtime_data(int cpu)
static void __init init_ghcb(int cpu)
{
struct sev_es_runtime_data *data;
+ struct sev_snp_runtime_data *snp_data;
int err;
data = per_cpu(runtime_data, cpu);
@@ -1366,8 +1568,22 @@ static void __init init_ghcb(int cpu)
memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
+ snp_data = memblock_alloc(sizeof(*snp_data), PAGE_SIZE);
+ if (!snp_data)
+ panic("Can't allocate SEV-SNP runtime data");
+
+ err = early_set_memory_decrypted((unsigned long)&snp_data->hv_doorbell_page,
+ sizeof(snp_data->hv_doorbell_page));
+ if (err)
+ panic("Can't map #HV doorbell pages unencrypted");
+
+ memset(&snp_data->hv_doorbell_page, 0, sizeof(snp_data->hv_doorbell_page));
+
+ per_cpu(snp_runtime_data, cpu) = snp_data;
+
data->ghcb_active = false;
data->backup_ghcb_active = false;
+ data->ghcb_registered = false;
}
void __init sev_es_init_vc_handling(void)
@@ -2006,7 +2222,12 @@ DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
static bool hv_raw_handle_exception(struct pt_regs *regs)
{
- return false;
+ /* Clear the no_further_signal bit */
+ sev_snp_current_doorbell_page()->pending_events &= 0x7fff;
+
+ check_hv_pending(regs);
+
+ return true;
}
static __always_inline bool on_hv_fallback_stack(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 178015a820f0..af97e6610fbb 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -898,6 +898,53 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
return regs_ret;
}
+
+asmlinkage __visible noinstr struct pt_regs *hv_switch_off_ist(struct pt_regs *regs)
+{
+ unsigned long sp, *stack;
+ struct stack_info info;
+ struct pt_regs *regs_ret;
+
+ /*
+ * A malicious hypervisor can inject 2 HVs in a row, which will corrupt
+ * the trap frame on our IST stack. We add a defensive check here to
+ * catch such behavior.
+ */
+ BUG_ON(regs->sp >= __this_cpu_ist_bottom_va(HV) && regs->sp < __this_cpu_ist_top_va(HV));
+
+ /*
+ * In the SYSCALL entry path the RSP value comes from user-space - don't
+ * trust it and switch to the current kernel stack
+ */
+ if (ip_within_syscall_gap(regs)) {
+ sp = this_cpu_read(cpu_current_top_of_stack);
+ goto sync;
+ }
+
+ /*
+ * From here on the RSP value is trusted. Now check whether entry
+ * happened from a safe stack. Not safe are the entry or unknown stacks,
+ * use the fall-back stack instead in this case.
+ */
+ sp = regs->sp;
+ stack = (unsigned long *)sp;
+
+ if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+ info.type > STACK_TYPE_EXCEPTION_LAST)
+ sp = __this_cpu_ist_top_va(HV2);
+sync:
+ /*
+ * Found a safe stack - switch to it as if the entry didn't happen via
+ * IST stack. The code below only copies pt_regs, the real switch happens
+ * in assembly code.
+ */
+ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
+
+ regs_ret = (struct pt_regs *)sp;
+ *regs_ret = *regs;
+
+ return regs_ret;
+}
#endif
asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
@@ -1457,4 +1504,7 @@ void __init trap_init(void)
/* Setup traps as cpu_init() might #GP */
idt_setup_traps();
cpu_init();
+
+ /* Init #HV doorbell pages when running as an SEV-SNP guest */
+ sev_snp_init_hv_handling();
}
--
2.25.1
From: Tianyu Lan <[email protected]>
The ring buffer is remapped in hv_ringbuffer_init() and it should
be mapped with the decrypted flag in order to share it with the
hypervisor in an sev-snp enlightened guest.
Signed-off-by: Tianyu Lan <[email protected]>
---
drivers/hv/ring_buffer.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 59a4aa86d1f3..391995c76be7 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/io.h>
+#include <linux/set_memory.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -233,14 +234,18 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
+ hv_isolation_type_en_snp() ?
+ pgprot_decrypted(PAGE_KERNEL_NOENC) :
PAGE_KERNEL);
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
return -ENOMEM;
+
+ if (hv_isolation_type_en_snp())
+ memset(ring_info->ring_buffer, 0x00, page_cnt * PAGE_SIZE);
}
-
ring_info->ring_buffer->read_index =
ring_info->ring_buffer->write_index = 0;
--
2.25.1
From: Tianyu Lan <[email protected]>
The wakeup_secondary_cpu callback was populated with
wakeup_cpu_via_vmgexit(), which doesn't work for Hyper-V. Override
it with a Hyper-V specific hook which uses the
HVCALL_START_VIRTUAL_PROCESSOR hvcall to start the AP with the vmsa
data structure.
Signed-off-by: Tianyu Lan <[email protected]>
---
arch/x86/include/asm/sev.h | 13 +++
arch/x86/include/asm/svm.h | 55 ++++++++++-
arch/x86/kernel/cpu/mshyperv.c | 147 +++++++++++++++++++++++++++++-
include/asm-generic/hyperv-tlfs.h | 18 ++++
4 files changed, 230 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ebc271bb6d8e..e34aaf730220 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -86,6 +86,19 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
+union sev_rmp_adjust {
+ u64 as_uint64;
+ struct {
+ unsigned long target_vmpl : 8;
+ unsigned long enable_read : 1;
+ unsigned long enable_write : 1;
+ unsigned long enable_user_execute : 1;
+ unsigned long enable_kernel_execute : 1;
+ unsigned long reserved1 : 4;
+ unsigned long vmsa : 1;
+ };
+};
+
/* SNP Guest message request */
struct snp_req_data {
unsigned long req_gpa;
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0361626841bc..fc54d3e7f817 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -328,8 +328,61 @@ struct vmcb_save_area {
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
- u8 reserved_6[72];
+
+ /*
+ * The following part of the save area is valid only for
+ * SEV-ES guests when referenced through the GHCB or for
+ * saving to the host save area.
+ */
+ u8 reserved_7[72];
u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+ u8 reserved_7b[4];
+ u32 pkru;
+ u8 reserved_7a[20];
+ u64 reserved_8; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_9; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_10[16];
+ u64 sw_exit_code;
+ u64 sw_exit_info_1;
+ u64 sw_exit_info_2;
+ u64 sw_scratch;
+ union {
+ u64 sev_features;
+ struct {
+ u64 sev_feature_snp : 1;
+ u64 sev_feature_vtom : 1;
+ u64 sev_feature_reflectvc : 1;
+ u64 sev_feature_restrict_injection : 1;
+ u64 sev_feature_alternate_injection : 1;
+ u64 sev_feature_full_debug : 1;
+ u64 sev_feature_reserved1 : 1;
+ u64 sev_feature_snpbtb_isolation : 1;
+ u64 sev_feature_reserved2 : 56;
+ };
+ };
+ u64 vintr_ctrl;
+ u64 guest_error_code;
+ u64 virtual_tom;
+ u64 tlb_id;
+ u64 pcpu_id;
+ u64 event_inject;
+ u64 xcr0;
+ u8 valid_bitmap[16];
+ u64 x87_state_gpa;
} __packed;
/* Save area definition for SEV-ES and SEV-SNP guests */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f0c97210c64a..b266f648e5cd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -41,6 +41,10 @@
#include <asm/realmode.h>
#include <asm/e820/api.h>
+#define EN_SEV_SNP_PROCESSOR_INFO_ADDR 0x802000
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
/* Is Linux running as the root partition? */
bool hv_root_partition;
struct ms_hyperv_info ms_hyperv;
@@ -232,6 +236,136 @@ static void __init hv_smp_prepare_boot_cpu(void)
#endif
}
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+{
+ struct vmcb_save_area *vmsa = (struct vmcb_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct desc_ptr gdtr;
+ u64 ret, retry = 5;
+ struct hv_enable_vp_vtl_input *enable_vtl_input;
+ struct hv_start_virtual_processor_input *start_vp_input;
+ union sev_rmp_adjust rmp_adjust;
+ void **arg;
+ unsigned long flags;
+
+ *(void **)per_cpu_ptr(hyperv_pcpu_input_arg, cpu) = ap_start_input_arg;
+
+ hv_vp_index[cpu] = cpu;
+
+ /* Prevent APs from entering busy calibration loop */
+ preset_lpj = lpj_fine;
+
+ /* Replace the provided real-mode start_ip */
+ start_ip = (unsigned long)secondary_startup_64_no_verify;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ if (vmsa->es.selector) {
+ vmsa->es.base = 0;
+ vmsa->es.limit = HV_AP_SEGMENT_LIMIT;
+ vmsa->es.attrib = *(u16 *)(vmsa->gdtr.base + vmsa->es.selector + 5);
+ vmsa->es.attrib = (vmsa->es.attrib & 0xFF) | ((vmsa->es.attrib >> 4) & 0xF00);
+ }
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ if (vmsa->cs.selector) {
+ vmsa->cs.base = 0;
+ vmsa->cs.limit = HV_AP_SEGMENT_LIMIT;
+ vmsa->cs.attrib = *(u16 *)(vmsa->gdtr.base + vmsa->cs.selector + 5);
+ vmsa->cs.attrib = (vmsa->cs.attrib & 0xFF) | ((vmsa->cs.attrib >> 4) & 0xF00);
+ }
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ if (vmsa->ss.selector) {
+ vmsa->ss.base = 0;
+ vmsa->ss.limit = HV_AP_SEGMENT_LIMIT;
+ vmsa->ss.attrib = *(u16 *)(vmsa->gdtr.base + vmsa->ss.selector + 5);
+ vmsa->ss.attrib = (vmsa->ss.attrib & 0xFF) | ((vmsa->ss.attrib >> 4) & 0xF00);
+ }
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ if (vmsa->ds.selector) {
+ vmsa->ds.base = 0;
+ vmsa->ds.limit = HV_AP_SEGMENT_LIMIT;
+ vmsa->ds.attrib = *(u16 *)(vmsa->gdtr.base + vmsa->ds.selector + 5);
+ vmsa->ds.attrib = (vmsa->ds.attrib & 0xFF) | ((vmsa->ds.attrib >> 4) & 0xF00);
+ }
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
+ asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
+ asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)start_ip;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ vmsa->sev_feature_snp = 1;
+ vmsa->sev_feature_restrict_injection = 1;
+
+ rmp_adjust.as_uint64 = 0;
+ rmp_adjust.target_vmpl = 1;
+ rmp_adjust.vmsa = 1;
+ ret = rmpadjust((unsigned long)vmsa, RMP_PG_SIZE_4K,
+ rmp_adjust.as_uint64);
+ if (ret != 0) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (unlikely(!*arg)) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (ms_hyperv.vtl != 0) {
+ enable_vtl_input = (struct hv_enable_vp_vtl_input *)*arg;
+ memset(enable_vtl_input, 0, sizeof(*enable_vtl_input));
+ enable_vtl_input->partitionid = -1;
+ enable_vtl_input->vpindex = cpu;
+ enable_vtl_input->targetvtl = ms_hyperv.vtl;
+ *(u64 *)&enable_vtl_input->context[0] = __pa(vmsa) | 1;
+
+ ret = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, enable_vtl_input, NULL);
+ if (ret != 0) {
+ pr_err("HvCallEnableVpVtl failed: %llx\n", ret);
+ goto done;
+ }
+ }
+
+ start_vp_input = (struct hv_start_virtual_processor_input *)*arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partitionid = -1;
+ start_vp_input->vpindex = cpu;
+ start_vp_input->targetvtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->context[0] = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VIRTUAL_PROCESSOR,
+ start_vp_input, NULL);
+ } while (ret == HV_STATUS_TIME_OUT && retry--);
+
+ if (ret != 0) {
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ goto done;
+ }
+
+done:
+ local_irq_restore(flags);
+ return ret;
+}
+
static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
{
#ifdef CONFIG_X86_64
@@ -241,6 +375,16 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (hv_isolation_type_en_snp())
+ apic->wakeup_secondary_cpu = hv_snp_boot_ap;
+
+ if (!hv_root_partition)
+ return;
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -489,8 +633,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
- smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 6e2a090e2649..7072adbf5540 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -139,6 +139,7 @@ struct ms_hyperv_tsc_page {
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
#define HVCALL_SEND_IPI 0x000b
+#define HVCALL_ENABLE_VP_VTL 0x000f
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
#define HVCALL_SEND_IPI_EX 0x0015
@@ -156,6 +157,7 @@ struct ms_hyperv_tsc_page {
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
+#define HVCALL_START_VIRTUAL_PROCESSOR 0x0099
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
@@ -763,6 +765,22 @@ struct hv_input_unmap_device_interrupt {
struct hv_interrupt_entry interrupt_entry;
} __packed;
+struct hv_enable_vp_vtl_input {
+ u64 partitionid;
+ u32 vpindex;
+ u8 targetvtl;
+ u8 padding[3];
+ u8 context[0xe0];
+} __packed;
+
+struct hv_start_virtual_processor_input {
+ u64 partitionid;
+ u32 vpindex;
+ u8 targetvtl;
+ u8 padding[3];
+ u8 context[0xe0];
+} __packed;
+
#define HV_SOURCE_SHADOW_NONE 0x0
#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1
--
2.25.1
On Wed, Nov 09, 2022 at 03:53:36PM -0500, Tianyu Lan wrote:
> From: Tianyu Lan <[email protected]>
>
> Hypervisor may pass the cc blob address directly into the boot param's cc
> blob address in direct boot mode. Check the cc blob hdr magic first
> in sev_enable() and use it as the cc blob address if the check succeeds.
>
> Signed-off-by: Tianyu Lan <[email protected]>
> ---
> arch/x86/boot/compressed/sev.c | 27 ++++++++++++++++++++-------
> 1 file changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
> index c93930d5ccbd..960968f8bf75 100644
> --- a/arch/x86/boot/compressed/sev.c
> +++ b/arch/x86/boot/compressed/sev.c
> @@ -272,17 +272,24 @@ static void enforce_vmpl0(void)
>
> void sev_enable(struct boot_params *bp)
> {
> + struct cc_blob_sev_info *cc_info;
> unsigned int eax, ebx, ecx, edx;
> struct msr m;
> bool snp;
>
> /*
> - * bp->cc_blob_address should only be set by boot/compressed kernel.
> - * Initialize it to 0 to ensure that uninitialized values from
> - * buggy bootloaders aren't propagated.
> + * bp->cc_blob_address should only be set by boot/compressed
> + * kernel and hypervisor with direct boot mode. Initialize it
> + * to 0 after checking in order to ensure that uninitialized
> + * values from buggy bootloaders aren't propagated.
> */
> - if (bp)
> - bp->cc_blob_address = 0;
> + if (bp) {
> + cc_info = (struct cc_blob_sev_info *)(unsigned long)
> + bp->cc_blob_address;
> +
> + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
> + bp->cc_blob_address = 0;
It doesn't seem great to rely on SEV_HDR_MAGIC to determine whether
bp->cc_blob_address is valid or not since it is only a 32-bit value.
Would it be possible to use a setup_data entry of type SETUP_CC_BLOB
in bp->hdr.setup_data instead? There's already handling for that in
find_cc_blob_setup_data() so it should "just work".
-Mike
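For reference, the existing scan looks roughly like this (a sketch of
find_cc_blob_setup_data(); struct cc_setup_data and SETUP_CC_BLOB as
defined for the boot protocol in arch/x86/kernel/sev.c):

static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
{
	struct setup_data *hdr = (struct setup_data *)bp->hdr.setup_data;

	while (hdr) {
		if (hdr->type == SETUP_CC_BLOB) {
			struct cc_setup_data *sd = (struct cc_setup_data *)hdr;

			/* The blob address is carried in the setup_data payload */
			return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
		}
		hdr = (struct setup_data *)hdr->next;
	}

	return NULL;
}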
On 11/10/2022 7:39 AM, Michael Roth wrote:
>> - * bp->cc_blob_address should only be set by boot/compressed kernel.
>> - * Initialize it to 0 to ensure that uninitialized values from
>> - * buggy bootloaders aren't propagated.
>> + * bp->cc_blob_address should only be set by boot/compressed
>> + * kernel and hypervisor with direct boot mode. Initialize it
>> + * to 0 after checking in order to ensure that uninitialized
>> + * values from buggy bootloaders aren't propagated.
>> */
>> - if (bp)
>> - bp->cc_blob_address = 0;
>> + if (bp) {
>> + cc_info = (struct cc_blob_sev_info *)(unsigned long)
>> + bp->cc_blob_address;
>> +
>> + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
>> + bp->cc_blob_address = 0;
> It doesn't seem great to rely on SEV_HDR_MAGIC to determine whether
> bp->cc_blob_address is valid or not since it is only a 32-bit value.
>
> Would it be possible to use a setup_data entry of type SETUP_CC_BLOB
> in bp->hdr.setup_data instead? There's already handling for that in
> find_cc_blob_setup_data() so it should "just work".
Hi Michael:
Thanks for your review. I will give it a try. The hypervisor may set
cc_blob_address directly, which is why I proposed this approach.
Hello Tianyu,
On 11/9/2022 2:53 PM, Tianyu Lan wrote:
> From: Tianyu Lan <[email protected]>
>
> Enable #HV exception to handle interrupt requests from hypervisor.
>
> Signed-off-by: Tianyu Lan <[email protected]>
> ---
> arch/x86/entry/entry_64.S | 18 ++
> arch/x86/include/asm/irqflags.h | 19 ++
> arch/x86/include/asm/mem_encrypt.h | 2 +
> arch/x86/include/asm/msr-index.h | 6 +
> arch/x86/include/uapi/asm/svm.h | 4 +
> arch/x86/kernel/sev.c | 327 ++++++++++++++++++++++++-----
> arch/x86/kernel/traps.c | 50 +++++
> 7 files changed, 373 insertions(+), 53 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index b2059df43c57..fe460cf44ab5 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -1058,6 +1058,15 @@ SYM_CODE_END(paranoid_entry)
> * R15 - old SPEC_CTRL
> */
> SYM_CODE_START_LOCAL(paranoid_exit)
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + /*
> + * If a #HV was delivered during execution and interrupts were
> + * disabled, then check if it can be handled before the iret
> + * (which may re-enable interrupts).
> + */
> + mov %rsp, %rdi
> + call check_hv_pending
> +#endif
> UNWIND_HINT_REGS
>
> /*
> @@ -1183,6 +1192,15 @@ SYM_CODE_START_LOCAL(error_entry)
> SYM_CODE_END(error_entry)
>
> SYM_CODE_START_LOCAL(error_return)
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + /*
> + * If a #HV was delivered during execution and interrupts were
> + * disabled, then check if it can be handled before the iret
> + * (which may re-enable interrupts).
> + */
> + mov %rsp, %rdi
> + call check_hv_pending
> +#endif
> UNWIND_HINT_REGS
> DEBUG_ENTRY_ASSERT_IRQS_OFF
> testb $3, CS(%rsp)
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index 7793e52d6237..e0730d8bc0ac 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -14,6 +14,9 @@
> /*
> * Interrupt control:
> */
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +void check_hv_pending(struct pt_regs *regs);
> +#endif
>
> /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
> extern inline unsigned long native_save_fl(void);
> @@ -35,6 +38,19 @@ extern __always_inline unsigned long native_save_fl(void)
> return flags;
> }
>
> +extern inline void native_restore_fl(unsigned long flags)
> +{
> + asm volatile("push %0 ; popf"
> + : /* no output */
> + : "g" (flags)
> + : "memory", "cc");
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + if ((flags & X86_EFLAGS_IF)) {
> + check_hv_pending(NULL);
> + }
> +#endif
> +}
> +
> static __always_inline void native_irq_disable(void)
> {
> asm volatile("cli": : :"memory");
> @@ -43,6 +59,9 @@ static __always_inline void native_irq_disable(void)
> static __always_inline void native_irq_enable(void)
> {
> asm volatile("sti": : :"memory");
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> + check_hv_pending(NULL);
> +#endif
> }
>
> static inline __cpuidle void native_safe_halt(void)
Are these checks required for native_safe_halt() too?
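If so, a minimal sketch of the same pattern applied there (assuming
check_hv_pending(NULL) is safe to call right after hlt returns) would be:

static inline __cpuidle void native_safe_halt(void)
{
	asm volatile("sti; hlt": : :"memory");
#ifdef CONFIG_AMD_MEM_ENCRYPT
	/* Drain #HV events that may have arrived while halted */
	check_hv_pending(NULL);
#endif
}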
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index 72ca90552b6a..7264ca5f5b2d 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -50,6 +50,7 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
> void __init mem_encrypt_free_decrypted_mem(void);
>
> void __init sev_es_init_vc_handling(void);
> +void __init sev_snp_init_hv_handling(void);
>
> #define __bss_decrypted __section(".bss..decrypted")
>
> @@ -72,6 +73,7 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
> static inline void __init sme_enable(struct boot_params *bp) { }
>
> static inline void sev_es_init_vc_handling(void) { }
> +static inline void sev_snp_init_hv_handling(void) { }
>
> static inline int __init
> early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 10ac52705892..6fe25a6e325f 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -562,10 +562,16 @@
> #define MSR_AMD64_SEV_ENABLED_BIT 0
> #define MSR_AMD64_SEV_ES_ENABLED_BIT 1
> #define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
> +#define MSR_AMD64_SEV_REFLECTVC_ENABLED_BIT 4
> +#define MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED_BIT 5
> +#define MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED_BIT 6
> #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
> #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
> #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
>
> +#define MSR_AMD64_SEV_REFLECTVC_ENABLED BIT_ULL(MSR_AMD64_SEV_REFLECTVC_ENABLED_BIT)
> +#define MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED BIT_ULL(MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED_BIT)
> +#define MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED BIT_ULL(MSR_AMD64_SEV_ALTERNATE_INJECTION_ENABLED_BIT)
> #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
>
> /* AMD Collaborative Processor Performance Control MSRs */
> diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
> index f69c168391aa..85d6882262e7 100644
> --- a/arch/x86/include/uapi/asm/svm.h
> +++ b/arch/x86/include/uapi/asm/svm.h
> @@ -115,6 +115,10 @@
> #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
> #define SVM_VMGEXIT_AP_CREATE 1
> #define SVM_VMGEXIT_AP_DESTROY 2
> +#define SVM_VMGEXIT_HV_DOORBELL_PAGE 0x80000014
> +#define SVM_VMGEXIT_GET_PREFERRED_HV_DOORBELL_PAGE 0
> +#define SVM_VMGEXIT_SET_HV_DOORBELL_PAGE 1
> +#define SVM_VMGEXIT_QUERY_HV_DOORBELL_PAGE 2
> #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
> #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
>
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index 63ddb043d16d..65eb9f96d0c4 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -104,6 +104,12 @@ struct sev_es_runtime_data {
> * is currently unsupported in SEV-ES guests.
> */
> unsigned long dr7;
> + /*
> + * SEV-SNP requires that the GHCB must be registered before using it.
> + * The flag below indicates whether the GHCB is registered; if it's
> + * not registered, then sev_es_get_ghcb() will perform the registration.
> + */
> + bool ghcb_registered;
> };
>
> struct ghcb_state {
> @@ -122,6 +128,156 @@ struct sev_config {
>
> static struct sev_config sev_cfg __read_mostly;
>
> +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state);
> +static noinstr void __sev_put_ghcb(struct ghcb_state *state);
> +static int vmgexit_hv_doorbell_page(struct ghcb *ghcb, u64 op, u64 pa);
> +static void sev_snp_setup_hv_doorbell_page(struct ghcb *ghcb);
> +
> +struct sev_hv_doorbell_page {
> + union {
> + u16 pending_events;
> + struct {
> + u8 vector;
> + u8 nmi : 1;
> + u8 mc : 1;
> + u8 reserved1 : 5;
> + u8 no_further_signal : 1;
> + };
> + };
> + u8 no_eoi_required;
> + u8 reserved2[61];
> + u8 padding[4032];
> +};
> +
> +struct sev_snp_runtime_data {
> + struct sev_hv_doorbell_page hv_doorbell_page;
> +};
> +
> +static DEFINE_PER_CPU(struct sev_snp_runtime_data*, snp_runtime_data);
> +
> +static inline u64 sev_es_rd_ghcb_msr(void)
> +{
> + return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
> +}
> +
> +static __always_inline void sev_es_wr_ghcb_msr(u64 val)
> +{
> + u32 low, high;
> +
> + low = (u32)(val);
> + high = (u32)(val >> 32);
> +
> + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
> +}
> +
> +struct sev_hv_doorbell_page *sev_snp_current_doorbell_page(void)
> +{
> + return &this_cpu_read(snp_runtime_data)->hv_doorbell_page;
> +}
> +
> +static u8 sev_hv_pending(void)
> +{
> + return sev_snp_current_doorbell_page()->vector;
> +}
> +
> +static void hv_doorbell_apic_eoi_write(u32 reg, u32 val)
> +{
> + if (xchg(&sev_snp_current_doorbell_page()->no_eoi_required, 0) & 0x1)
> + return;
> +
> + BUG_ON(reg != APIC_EOI);
> + apic->write(reg, val);
> +}
> +
> +static void do_exc_hv(struct pt_regs *regs)
> +{
> + u8 vector;
> +
> + while (sev_hv_pending()) {
> + asm volatile("cli" : : : "memory");
> +
> + vector = xchg(&sev_snp_current_doorbell_page()->vector, 0);
> +
> + switch (vector) {
As a general comment, all these system vectors are now going to be
dispatched through this #HV exception handler once Restricted interrupt
injection support is enabled on the guest.
If new system vectors are added, the #HV exception handler will need to
be updated to dispatch them too (a code maintenance headache?)
It is probably more efficient to construct some kind of S/W dispatch
table dynamically, like a system vector table, and dispatch system vector
exceptions through that sysvec_table from the #HV exception handler
instead of explicitly calling each system vector handler. The table
entries are collected into a new named ELF section and the table itself
is built dynamically at boot.
Something like this, by overloading the idtentry macro:
.macro idtentry vector asmsym cfunc has_error_code:req
..
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
+ .if \vector >= FIRST_SYSTEM_VECTOR && \vector < NR_VECTORS
+ .section .system_vectors, "aw"
+ .byte \vector
+ .quad \cfunc
+ .previous
+ .endif
+static void (*sysvec_table[NR_VECTORS - FIRST_SYSTEM_VECTOR])(struct pt_regs *regs) __ro_after_init;
+
+struct __attribute__ ((__packed__)) sysvec_entry {
+ unsigned char vector;
+ void (*sysvec_func)(struct pt_regs *regs);
+};
+
and then dispatching the system vectors here:
if ((sysvec_table[events.vector - FIRST_SYSTEM_VECTOR]))
(*sysvec_table[events.vector - FIRST_SYSTEM_VECTOR])(regs);
...
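To complete the idea, a sketch of populating sysvec_table at boot from
that section (the __start/__stop section bounds are hypothetical and
would need to be provided by the linker script):

extern const struct sysvec_entry __start_system_vectors[];
extern const struct sysvec_entry __stop_system_vectors[];

static void __init build_sysvec_table(void)
{
	const struct sysvec_entry *e;

	/* Collect every (vector, handler) pair emitted into .system_vectors */
	for (e = __start_system_vectors; e < __stop_system_vectors; e++)
		sysvec_table[e->vector - FIRST_SYSTEM_VECTOR] = e->sysvec_func;
}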
> +#if IS_ENABLED(CONFIG_HYPERV)
> + case HYPERV_STIMER0_VECTOR:
> + sysvec_hyperv_stimer0(regs);
> + break;
> + case HYPERVISOR_CALLBACK_VECTOR:
> + sysvec_hyperv_callback(regs);
> + break;
> +#endif
> +#ifdef CONFIG_SMP
> + case RESCHEDULE_VECTOR:
> + sysvec_reschedule_ipi(regs);
> + break;
Additionally, during our prototyping and testing of SNP Restricted
interrupt injection support, we observed that the
irqentry_exit_to_user_mode() code path (entered at the end of the sysvec
handler above) can potentially cause the #HV handler to be preempted and
rescheduled on another CPU. A #HV handler rescheduled on another CPU will
cause interrupts to be handled on a different CPU than the one they were
injected on, causing invalid EOIs, missed/lost guest interrupts with the
corresponding hangs, and/or per-CPU IRQs being handled on a non-intended
CPU.
Therefore, we had to add checks in the interrupt exit code paths for
returns to user mode: if we are currently executing the #HV handler, we
do not follow the irqentry_exit_to_user_mode() path, as that can
potentially cause the #HV handler to be preempted and rescheduled on
another CPU.
Something like this:
+#ifndef CONFIG_AMD_MEM_ENCRYPT
/**
* DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points
* @func: Function name of the entry point
@@ -204,6 +209,27 @@ __visible noinstr void func(struct pt_regs *regs,
static noinline void __##func(struct pt_regs *regs, u32 vector)
+#else
+
+#define DEFINE_IDTENTRY_IRQ(func) \
+static void __##func(struct pt_regs *regs, u32 vector); \
+ \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ u32 vector = (u32)(u8)error_code; \
+ \
+ instrumentation_begin(); \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ run_irq_on_irqstack_cond(__##func, regs, vector); \
+ instrumentation_end(); \
+ irqentry_exit_hv_cond(regs, state); \
+} \
+ \
+static noinline void __##func(struct pt_regs *regs, u32 vector)
+#endif
...
+noinstr void irqentry_exit_hv_cond(struct pt_regs *regs,
irqentry_state_t state)
+{
+ struct sev_hvdb_runtime_data *hvdb_data;
+ struct sev_es_runtime_data *data;
+
+ data = this_cpu_read(runtime_data);
+ if (WARN_ON(!data)) {
+ irqentry_exit(regs, state);
+ return;
+ }
+
+ hvdb_data = data->hvdb_data;
+ if (WARN_ON(!hvdb_data)) {
+ irqentry_exit(regs, state);
+ return;
+ }
+
+ /*
+ * Check whether this returns to user mode, if so and if
+ * we are currently executing the #HV handler then we don't
+ * want to follow the irqentry_exit_to_user_mode path as
+ * that can potentially cause the #HV handler to be
+ * preempted and rescheduled on another CPU. Rescheduled #HV
+ * handler on another cpu will cause interrupts to be handled
+ * on a different cpu than the injected one, causing
+ * invalid EOIs and missed/lost guest interrupts and
+ * corresponding hangs and/or per-cpu IRQs handled on
+ * non-intended cpu.
+ */
+
+ if (user_mode(regs) && hvdb_data->hv_handling_events)
+ return;
+
+ /* follow normal interrupt return/exit path */
+ irqentry_exit(regs, state);
+}
+
> + case IRQ_MOVE_CLEANUP_VECTOR:
> + sysvec_irq_move_cleanup(regs);
> + break;
> + case REBOOT_VECTOR:
> + sysvec_reboot(regs);
> + break;
> + case CALL_FUNCTION_SINGLE_VECTOR:
> + sysvec_call_function_single(regs);
> + break;
> + case CALL_FUNCTION_VECTOR:
> + sysvec_call_function(regs);
> + break;
> +#endif
> +#ifdef CONFIG_X86_LOCAL_APIC
> + case ERROR_APIC_VECTOR:
> + sysvec_error_interrupt(regs);
> + break;
> + case SPURIOUS_APIC_VECTOR:
> + sysvec_spurious_apic_interrupt(regs);
> + break;
> + case LOCAL_TIMER_VECTOR:
> + sysvec_apic_timer_interrupt(regs);
> + break;
> + case X86_PLATFORM_IPI_VECTOR:
> + sysvec_x86_platform_ipi(regs);
> + break;
> +#endif
What about device interrupts?
> + case 0x0:
> + break;
> + default:
> + panic("Unexpected vector %d\n", vector);
> + unreachable();
> + }
> +
> + asm volatile("sti" : : : "memory");
> + }
> +}
> +
> +void check_hv_pending(struct pt_regs *regs)
> +{
> + struct pt_regs local_regs;
> +
> + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
> + return;
> +
> + if (regs) {
> + if ((regs->flags & X86_EFLAGS_IF) == 0)
> + return;
> +
> + if (!sev_hv_pending())
> + return;
> +
> + do_exc_hv(regs);
> + } else {
> + if (sev_hv_pending()) {
> + memset(&local_regs, 0, sizeof(struct pt_regs));
> + regs = &local_regs;
> + asm volatile("movl %%cs, %%eax;" : "=a" (regs->cs));
> + asm volatile("movl %%ss, %%eax;" : "=a" (regs->ss));
> + regs->orig_ax = 0xffffffff;
> + regs->flags = native_save_fl();
> + do_exc_hv(regs);
> + }
> + }
> +}
> +EXPORT_SYMBOL_GPL(check_hv_pending);
> +
> static __always_inline bool on_vc_stack(struct pt_regs *regs)
> {
> unsigned long sp = regs->sp;
> @@ -193,68 +349,35 @@ void noinstr __sev_es_ist_exit(void)
> this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
> }
>
> -/*
> - * Nothing shall interrupt this code path while holding the per-CPU
> - * GHCB. The backup GHCB is only for NMIs interrupting this path.
> - *
> - * Callers must disable local interrupts around it.
> - */
> -static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
> +static bool sev_restricted_injection_enabled(void)
> {
> - struct sev_es_runtime_data *data;
> + return sev_status & MSR_AMD64_SEV_RESTRICTED_INJECTION_ENABLED;
> +}
> +
> +void __init sev_snp_init_hv_handling(void)
> +{
> + struct sev_snp_runtime_data *snp_data;
> + struct ghcb_state state;
> struct ghcb *ghcb;
> + unsigned long flags;
> + int cpu;
> + int err;
>
> WARN_ON(!irqs_disabled());
> + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) || !sev_restricted_injection_enabled())
> + return;
>
> - data = this_cpu_read(runtime_data);
> - ghcb = &data->ghcb_page;
> -
> - if (unlikely(data->ghcb_active)) {
> - /* GHCB is already in use - save its contents */
> -
> - if (unlikely(data->backup_ghcb_active)) {
> - /*
> - * Backup-GHCB is also already in use. There is no way
> - * to continue here so just kill the machine. To make
> - * panic() work, mark GHCBs inactive so that messages
> - * can be printed out.
> - */
> - data->ghcb_active = false;
> - data->backup_ghcb_active = false;
> -
> - instrumentation_begin();
> - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
> - instrumentation_end();
> - }
> -
> - /* Mark backup_ghcb active before writing to it */
> - data->backup_ghcb_active = true;
> -
> - state->ghcb = &data->backup_ghcb;
> -
> - /* Backup GHCB content */
> - *state->ghcb = *ghcb;
> - } else {
> - state->ghcb = NULL;
> - data->ghcb_active = true;
> - }
> + local_irq_save(flags);
>
> - return ghcb;
> -}
> + ghcb = __sev_get_ghcb(&state);
>
> -static inline u64 sev_es_rd_ghcb_msr(void)
> -{
> - return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
> -}
> + sev_snp_setup_hv_doorbell_page(ghcb);
>
> -static __always_inline void sev_es_wr_ghcb_msr(u64 val)
> -{
> - u32 low, high;
> + __sev_put_ghcb(&state);
>
> - low = (u32)(val);
> - high = (u32)(val >> 32);
> + apic_set_eoi_write(hv_doorbell_apic_eoi_write);
>
> - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
> + local_irq_restore(flags);
> }
>
> static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
> @@ -515,6 +638,79 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
> /* Include code shared with pre-decompression boot stage */
> #include "sev-shared.c"
>
> +/*
> + * Nothing shall interrupt this code path while holding the per-CPU
> + * GHCB. The backup GHCB is only for NMIs interrupting this path.
> + *
> + * Callers must disable local interrupts around it.
> + */
> +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
> +{
> + struct sev_es_runtime_data *data;
> + struct ghcb *ghcb;
> +
> + WARN_ON(!irqs_disabled());
> +
> + data = this_cpu_read(runtime_data);
> + ghcb = &data->ghcb_page;
> +
> + if (unlikely(data->ghcb_active)) {
> + /* GHCB is already in use - save its contents */
> +
> + if (unlikely(data->backup_ghcb_active)) {
> + /*
> + * Backup-GHCB is also already in use. There is no way
> + * to continue here so just kill the machine. To make
> + * panic() work, mark GHCBs inactive so that messages
> + * can be printed out.
> + */
> + data->ghcb_active = false;
> + data->backup_ghcb_active = false;
> +
> + instrumentation_begin();
> + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
> + instrumentation_end();
> + }
> +
> + /* Mark backup_ghcb active before writing to it */
> + data->backup_ghcb_active = true;
> +
> + state->ghcb = &data->backup_ghcb;
> +
> + /* Backup GHCB content */
> + *state->ghcb = *ghcb;
> + } else {
> + state->ghcb = NULL;
> + data->ghcb_active = true;
> + }
> +
> + /* SEV-SNP guest requires that GHCB must be registered before using it. */
> + if (!data->ghcb_registered) {
> + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
> + snp_register_ghcb_early(__pa(ghcb));
> + sev_snp_setup_hv_doorbell_page(ghcb);
> + } else {
> + sev_es_wr_ghcb_msr(__pa(ghcb));
> + }
> + data->ghcb_registered = true;
> + }
> +
> + return ghcb;
> +}
> +
> +static void sev_snp_setup_hv_doorbell_page(struct ghcb *ghcb)
> +{
> + u64 pa;
> + enum es_result ret;
> +
> + pa = __pa(sev_snp_current_doorbell_page());
> + vc_ghcb_invalidate(ghcb);
> + ret = vmgexit_hv_doorbell_page(ghcb,
> + SVM_VMGEXIT_SET_HV_DOORBELL_PAGE, pa);
> + if (ret != ES_OK)
> + panic("SEV-SNP: failed to set up #HV doorbell page");
> +}
> +
> static noinstr void __sev_put_ghcb(struct ghcb_state *state)
> {
> struct sev_es_runtime_data *data;
> @@ -1282,6 +1478,11 @@ void setup_ghcb(void)
> snp_register_ghcb_early(__pa(&boot_ghcb_page));
> }
>
> +int vmgexit_hv_doorbell_page(struct ghcb *ghcb, u64 op, u64 pa)
> +{
> + return sev_es_ghcb_hv_call(ghcb, NULL, SVM_VMGEXIT_HV_DOORBELL_PAGE, op, pa);
> +}
> +
> #ifdef CONFIG_HOTPLUG_CPU
> static void sev_es_ap_hlt_loop(void)
> {
> @@ -1355,6 +1556,7 @@ static void __init alloc_runtime_data(int cpu)
> static void __init init_ghcb(int cpu)
> {
> struct sev_es_runtime_data *data;
> + struct sev_snp_runtime_data *snp_data;
> int err;
>
> data = per_cpu(runtime_data, cpu);
> @@ -1366,8 +1568,22 @@ static void __init init_ghcb(int cpu)
>
> memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
>
> + snp_data = memblock_alloc(sizeof(*snp_data), PAGE_SIZE);
> + if (!snp_data)
> + panic("Can't allocate SEV-SNP runtime data");
> +
> + err = early_set_memory_decrypted((unsigned long)&snp_data->hv_doorbell_page,
> + sizeof(snp_data->hv_doorbell_page));
> + if (err)
> + panic("Can't map #HV doorbell pages unencrypted");
> +
> + memset(&snp_data->hv_doorbell_page, 0, sizeof(snp_data->hv_doorbell_page));
> +
> + per_cpu(snp_runtime_data, cpu) = snp_data;
> +
> data->ghcb_active = false;
> data->backup_ghcb_active = false;
> + data->ghcb_registered = false;
> }
>
> void __init sev_es_init_vc_handling(void)
> @@ -2006,7 +2222,12 @@ DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
>
> static bool hv_raw_handle_exception(struct pt_regs *regs)
> {
> - return false;
> + /* Clear the no_further_signal bit */
> + sev_snp_current_doorbell_page()->pending_events &= 0x7fff;
> +
> + check_hv_pending(regs);
> +
> + return true;
> }
>
> static __always_inline bool on_hv_fallback_stack(struct pt_regs *regs)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 178015a820f0..af97e6610fbb 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -898,6 +898,53 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
>
> return regs_ret;
> }
> +
> +asmlinkage __visible noinstr struct pt_regs *hv_switch_off_ist(struct pt_regs *regs)
> +{
> + unsigned long sp, *stack;
> + struct stack_info info;
> + struct pt_regs *regs_ret;
> +
> + /*
> + * A malicious hypervisor can inject 2 HVs in a row, which will corrupt
> + * the trap frame on our IST stack. We add a defensive check here to
> + * catch such behavior.
> + */
> + BUG_ON(regs->sp >= __this_cpu_ist_bottom_va(HV) && regs->sp < __this_cpu_ist_top_va(HV));
Does this nested #HV exception check also need to consider the HV2 (HV
fallback) stack, as we may have switched to it?
Thanks,
Ashish
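For illustration, a sketch of the check extended to also cover the HV2
fallback stack (assuming nested entry there should be treated as fatal
too):

	BUG_ON((regs->sp >= __this_cpu_ist_bottom_va(HV) &&
		regs->sp < __this_cpu_ist_top_va(HV)) ||
	       (regs->sp >= __this_cpu_ist_bottom_va(HV2) &&
		regs->sp < __this_cpu_ist_top_va(HV2)));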
> +
> + /*
> + * In the SYSCALL entry path the RSP value comes from user-space - don't
> + * trust it and switch to the current kernel stack
> + */
> + if (ip_within_syscall_gap(regs)) {
> + sp = this_cpu_read(cpu_current_top_of_stack);
> + goto sync;
> + }
> +
> + /*
> + * From here on the RSP value is trusted. Now check whether entry
> + * happened from a safe stack. Not safe are the entry or unknown stacks,
> + * use the fall-back stack instead in this case.
> + */
> + sp = regs->sp;
> + stack = (unsigned long *)sp;
> +
> + if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
> + info.type > STACK_TYPE_EXCEPTION_LAST)
> + sp = __this_cpu_ist_top_va(HV2);
> +sync:
> + /*
> + * Found a safe stack - switch to it as if the entry didn't happen via
> + * IST stack. The code below only copies pt_regs, the real switch happens
> + * in assembly code.
> + */
> + sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
> +
> + regs_ret = (struct pt_regs *)sp;
> + *regs_ret = *regs;
> +
> + return regs_ret;
> +}
> #endif
>
> asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
> @@ -1457,4 +1504,7 @@ void __init trap_init(void)
> /* Setup traps as cpu_init() might #GP */
> idt_setup_traps();
> cpu_init();
> +
> + /* Init #HV doorbell pages when running as an SEV-SNP guest */
> + sev_snp_init_hv_handling();
> }
>
Hello Tianyu,
On 11/9/2022 2:53 PM, Tianyu Lan wrote:
> From: Tianyu Lan <[email protected]>
>
> Add a #HV exception handler that uses IST stack.
>
> Signed-off-by: Tianyu Lan <[email protected]>
> ---
> arch/x86/entry/entry_64.S | 58 ++++++++++++++++++++++++++
> arch/x86/include/asm/cpu_entry_area.h | 6 +++
> arch/x86/include/asm/idtentry.h | 39 +++++++++++++++++-
> arch/x86/include/asm/page_64_types.h | 1 +
> arch/x86/include/asm/trapnr.h | 1 +
> arch/x86/include/asm/traps.h | 1 +
> arch/x86/kernel/cpu/common.c | 1 +
> arch/x86/kernel/dumpstack_64.c | 9 +++-
> arch/x86/kernel/idt.c | 1 +
> arch/x86/kernel/sev.c | 59 +++++++++++++++++++++++++++
> arch/x86/mm/cpu_entry_area.c | 2 +
> 11 files changed, 175 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 9953d966d124..b2059df43c57 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -560,6 +560,64 @@ SYM_CODE_START(\asmsym)
> .Lfrom_usermode_switch_stack_\@:
> idtentry_body user_\cfunc, has_error_code=1
>
> +_ASM_NOKPROBE(\asmsym)
> +SYM_CODE_END(\asmsym)
> +.endm
> +/*
> + * idtentry_hv - Macro to generate entry stub for #HV
> + * @vector: Vector number
> + * @asmsym: ASM symbol for the entry point
> + * @cfunc: C function to be called
> + *
> + * The macro emits code to set up the kernel context for #HV. The #HV handler
> + * runs on an IST stack and needs to be able to support nested #HV exceptions.
> + *
> + * To make this work the #HV entry code tries its best to pretend it doesn't use
> + * an IST stack by switching to the task stack if coming from user-space (which
> + * includes early SYSCALL entry path) or back to the stack in the IRET frame if
> + * entered from kernel-mode.
> + *
> + * If entered from kernel-mode the return stack is validated first, and if it is
> + * not safe to use (e.g. because it points to the entry stack) the #HV handler
> + * will switch to a fall-back stack (HV2) and call a special handler function.
> + *
> + * The macro is only used for one vector, but it is planned to be extended in
> + * the future for the #HV exception.
> + */
> +.macro idtentry_hv vector asmsym cfunc
> +SYM_CODE_START(\asmsym)
> + UNWIND_HINT_IRET_REGS
> + ASM_CLAC
> + pushq $-1 /* ORIG_RAX: no syscall to restart */
> +
> + testb $3, CS-ORIG_RAX(%rsp)
> + jnz .Lfrom_usermode_switch_stack_\@
> +
> + call paranoid_entry
> +
> + UNWIND_HINT_REGS
> +
> + /*
> + * Switch off the IST stack to make it free for nested exceptions.
> + */
> + movq %rsp, %rdi /* pt_regs pointer */
> + call hv_switch_off_ist
> + movq %rax, %rsp /* Switch to new stack */
> +
> + UNWIND_HINT_REGS
> +
> + /* Update pt_regs */
> + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
> + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
> +
> + movq %rsp, %rdi /* pt_regs pointer */
> + call kernel_\cfunc
> +
> + jmp paranoid_exit
> +
> +.Lfrom_usermode_switch_stack_\@:
> + idtentry_body user_\cfunc, has_error_code=1
The #HV exception does not come with an error code, so shouldn't this be
has_error_code=0?
> +
> _ASM_NOKPROBE(\asmsym)
> SYM_CODE_END(\asmsym)
> .endm
> diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
> index 75efc4c6f076..f173a16cfc59 100644
> --- a/arch/x86/include/asm/cpu_entry_area.h
> +++ b/arch/x86/include/asm/cpu_entry_area.h
> @@ -30,6 +30,10 @@
> char VC_stack[optional_stack_size]; \
> char VC2_stack_guard[guardsize]; \
> char VC2_stack[optional_stack_size]; \
> + char HV_stack_guard[guardsize]; \
> + char HV_stack[optional_stack_size]; \
> + char HV2_stack_guard[guardsize]; \
> + char HV2_stack[optional_stack_size]; \
> char IST_top_guard[guardsize]; \
>
> /* The exception stacks' physical storage. No guard pages required */
> @@ -52,6 +56,8 @@ enum exception_stack_ordering {
> ESTACK_MCE,
> ESTACK_VC,
> ESTACK_VC2,
> + ESTACK_HV,
> + ESTACK_HV2,
> N_EXCEPTION_STACKS
> };
>
> diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
> index 72184b0b2219..ed68acd6f723 100644
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -317,6 +317,19 @@ static __always_inline void __##func(struct pt_regs *regs)
> __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
> __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
>
> +
> +/**
> + * DECLARE_IDTENTRY_HV - Declare functions for the HV entry point
> + * @vector: Vector number (ignored for C)
> + * @func: Function name of the entry point
> + *
> + * Maps to DECLARE_IDTENTRY_RAW, but declares also the user C handler.
> + */
> +#define DECLARE_IDTENTRY_HV(vector, func) \
> + DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
> + __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
> + __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
> +
> /**
> * DEFINE_IDTENTRY_IST - Emit code for IST entry points
> * @func: Function name of the entry point
> @@ -376,6 +389,26 @@ static __always_inline void __##func(struct pt_regs *regs)
> #define DEFINE_IDTENTRY_VC_USER(func) \
> DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
>
> +/**
> + * DEFINE_IDTENTRY_HV_KERNEL - Emit code for HV injection handler
> + * when raised from kernel mode
> + * @func: Function name of the entry point
> + *
> + * Maps to DEFINE_IDTENTRY_RAW
> + */
> +#define DEFINE_IDTENTRY_HV_KERNEL(func) \
> + DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
> +
> +/**
> + * DEFINE_IDTENTRY_HV_USER - Emit code for HV injection handler
> + * when raised from user mode
> + * @func: Function name of the entry point
> + *
> + * Maps to DEFINE_IDTENTRY_RAW
> + */
> +#define DEFINE_IDTENTRY_HV_USER(func) \
> + DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
> +
> #else /* CONFIG_X86_64 */
>
> /**
> @@ -465,6 +498,9 @@ __visible noinstr void func(struct pt_regs *regs, \
> # define DECLARE_IDTENTRY_VC(vector, func) \
> idtentry_vc vector asm_##func func
>
> +# define DECLARE_IDTENTRY_HV(vector, func) \
> + idtentry_hv vector asm_##func func
> +
> #else
> # define DECLARE_IDTENTRY_MCE(vector, func) \
> DECLARE_IDTENTRY(vector, func)
> @@ -622,9 +658,10 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
> DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
> #endif
>
> -/* #VC */
> +/* #VC & #HV */
> #ifdef CONFIG_AMD_MEM_ENCRYPT
> DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
> +DECLARE_IDTENTRY_HV(X86_TRAP_HV, exc_hv_injection);
> #endif
>
> #ifdef CONFIG_XEN_PV
> diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
> index e9e2c3ba5923..0bd7dab676c5 100644
> --- a/arch/x86/include/asm/page_64_types.h
> +++ b/arch/x86/include/asm/page_64_types.h
> @@ -29,6 +29,7 @@
> #define IST_INDEX_DB 2
> #define IST_INDEX_MCE 3
> #define IST_INDEX_VC 4
> +#define IST_INDEX_HV 5
>
> /*
> * Set __PAGE_OFFSET to the most negative possible address +
> diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h
> index f5d2325aa0b7..c6583631cecb 100644
> --- a/arch/x86/include/asm/trapnr.h
> +++ b/arch/x86/include/asm/trapnr.h
> @@ -26,6 +26,7 @@
> #define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
> #define X86_TRAP_VE 20 /* Virtualization Exception */
> #define X86_TRAP_CP 21 /* Control Protection Exception */
> +#define X86_TRAP_HV 28 /* HV injected exception in SNP restricted mode */
> #define X86_TRAP_VC 29 /* VMM Communication Exception */
> #define X86_TRAP_IRET 32 /* IRET Exception */
>
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 47ecfff2c83d..6795d3e517d6 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -16,6 +16,7 @@ asmlinkage __visible notrace
> struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
> void __init trap_init(void);
> asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
> +asmlinkage __visible noinstr struct pt_regs *hv_switch_off_ist(struct pt_regs *eregs);
> #endif
>
> extern bool ibt_selftest(void);
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 3e508f239098..87afa3a4c8b1 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2165,6 +2165,7 @@ static inline void tss_setup_ist(struct tss_struct *tss)
> tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
> /* Only mapped when SEV-ES is active */
> tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
> + tss->x86_tss.ist[IST_INDEX_HV] = __this_cpu_ist_top_va(HV);
> }
>
> #else /* CONFIG_X86_64 */
> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
> index 6c5defd6569a..23aa5912c87a 100644
> --- a/arch/x86/kernel/dumpstack_64.c
> +++ b/arch/x86/kernel/dumpstack_64.c
> @@ -26,11 +26,14 @@ static const char * const exception_stack_names[] = {
> [ ESTACK_MCE ] = "#MC",
> [ ESTACK_VC ] = "#VC",
> [ ESTACK_VC2 ] = "#VC2",
> + [ ESTACK_HV ] = "#HV",
> + [ ESTACK_HV2 ] = "#HV2",
> +
> };
>
> const char *stack_type_name(enum stack_type type)
> {
> - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
> + BUILD_BUG_ON(N_EXCEPTION_STACKS != 8);
>
> if (type == STACK_TYPE_TASK)
> return "TASK";
> @@ -89,6 +92,8 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
> EPAGERANGE(MCE),
> EPAGERANGE(VC),
> EPAGERANGE(VC2),
> + EPAGERANGE(HV),
> + EPAGERANGE(HV2),
> };
>
> static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
> @@ -98,7 +103,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
> struct pt_regs *regs;
> unsigned int k;
>
> - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
> + BUILD_BUG_ON(N_EXCEPTION_STACKS != 8);
>
> begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
> /*
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index a58c6bc1cd68..48c0a7e1dbcb 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -113,6 +113,7 @@ static const __initconst struct idt_data def_idts[] = {
>
> #ifdef CONFIG_AMD_MEM_ENCRYPT
> ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
> + ISTG(X86_TRAP_HV, asm_exc_hv_injection, IST_INDEX_HV),
> #endif
>
> SYSG(X86_TRAP_OF, asm_exc_overflow),
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index a428c62330d3..63ddb043d16d 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -2004,6 +2004,65 @@ DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
> irqentry_exit_to_user_mode(regs);
> }
>
> +static bool hv_raw_handle_exception(struct pt_regs *regs)
> +{
> + return false;
> +}
> +
> +static __always_inline bool on_hv_fallback_stack(struct pt_regs *regs)
> +{
> + unsigned long sp = (unsigned long)regs;
> +
> + return (sp >= __this_cpu_ist_bottom_va(HV2) && sp < __this_cpu_ist_top_va(HV2));
> +}
> +
> +DEFINE_IDTENTRY_HV_USER(exc_hv_injection)
> +{
> + irqentry_enter_from_user_mode(regs);
> + instrumentation_begin();
> +
> + if (!hv_raw_handle_exception(regs)) {
> + /*
> + * Do not kill the machine if user-space triggered the
> + * exception. Send SIGBUS instead and let user-space deal
> + * with it.
> + */
> + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
> + }
> +
> + instrumentation_end();
> + irqentry_exit_to_user_mode(regs);
> +}
> +
> +DEFINE_IDTENTRY_HV_KERNEL(exc_hv_injection)
> +{
> + irqentry_state_t irq_state;
> +
> + if (unlikely(on_hv_fallback_stack(regs))) {
> + instrumentation_begin();
> + panic("Can't handle #HV exception from unsupported context\n");
> + instrumentation_end();
> + }
The HV fallback stack exists and is used when we can't switch to the HV stack.
If we have to issue a panic() here, why don't we simply issue the
panic() in hv_switch_off_ist() when we can't switch to the HV stack?
Thanks,
Ashish
> +
> + irq_state = irqentry_nmi_enter(regs);
> + instrumentation_begin();
> +
> + if (!hv_raw_handle_exception(regs)) {
> + pr_emerg("PANIC: Unhandled #HV exception in kernel space\n");
> +
> + /* Show some debug info */
> + show_regs(regs);
> +
> + /* Ask hypervisor to sev_es_terminate */
> + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
> +
> + panic("Returned from Terminate-Request to Hypervisor\n");
> + }
> +
> + instrumentation_end();
> + irqentry_nmi_exit(regs, irq_state);
> +}
> +
> bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
> {
> unsigned long exit_code = regs->orig_ax;
> diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
> index 6c2f1b76a0b6..608905dc6704 100644
> --- a/arch/x86/mm/cpu_entry_area.c
> +++ b/arch/x86/mm/cpu_entry_area.c
> @@ -115,6 +115,8 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
> if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
> cea_map_stack(VC);
> cea_map_stack(VC2);
> + cea_map_stack(HV);
> + cea_map_stack(HV2);
> }
> }
> }
>
Thanks for your review.
On 11/11/2022 4:38 AM, Kalra, Ashish wrote:
>> + UNWIND_HINT_REGS
>> +
>> + /* Update pt_regs */
>> + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd
>> argument*/
>> + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
>> +
>> + movq %rsp, %rdi /* pt_regs pointer */
>> + call kernel_\cfunc
>> +
>> + jmp paranoid_exit
>> +
>> +.Lfrom_usermode_switch_stack_\@:
>> + idtentry_body user_\cfunc, has_error_code=1
>
> The #HV exception is injected without an error code, so shouldn't
> has_error_code be 0?
Nice catch. Will update in the next version.
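For reference, the minimal change would presumably be just the flag on
the user-mode path of the idtentry_hv macro - an untested sketch, not
the final v2 patch (the kernel-mode path's ORIG_RAX error-code read,
and pushing a dummy -1 at entry, would need matching adjustments):

.Lfrom_usermode_switch_stack_\@:
	/* #HV is injected without a hardware error code */
	idtentry_body user_\cfunc, has_error_code=0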
>> + irqentry_state_t irq_state;
>> +
>> + if (unlikely(on_hv_fallback_stack(regs))) {
>> + instrumentation_begin();
>> + panic("Can't handle #HV exception from unsupported
>> context\n");
>> + instrumentation_end();
>> + }
>
> The HV fallback stack exists and is used when we can't switch to the HV stack.
> If we have to issue a panic() here, why don't we simply issue the
> panic() in hv_switch_off_ist() when we can't switch to the HV stack?
>
Yes, this is a good idea. Will update. Thanks.
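Something along these lines, modeled on vc_switch_off_ist() in
arch/x86/kernel/traps.c, is presumably what that would look like - an
untested sketch, not the actual next version; helper usage is copied
from the existing #VC path, and the HV2 fallback stack plus the
on_hv_fallback_stack() check in the handler could then go away:

asmlinkage __visible noinstr struct pt_regs *hv_switch_off_ist(struct pt_regs *regs)
{
	unsigned long sp, *stack;
	struct stack_info info;
	struct pt_regs *regs_ret;

	/*
	 * In the SYSCALL entry path the RSP value comes from user space -
	 * don't trust it and switch to the current kernel stack.
	 */
	if (ip_within_syscall_gap(regs)) {
		sp = this_cpu_read(cpu_current_top_of_stack);
		goto sync;
	}

	/*
	 * From here on the RSP value is trusted. If entry happened from
	 * the entry stack or an unknown stack there is no safe stack to
	 * switch to, so panic right here instead of switching to the HV2
	 * fallback stack and panicking later in the #HV handler.
	 */
	sp    = regs->sp;
	stack = (unsigned long *)sp;

	if (!get_stack_info_noinstr(stack, current, &info) ||
	    info.type == STACK_TYPE_ENTRY ||
	    info.type > STACK_TYPE_EXCEPTION_LAST) {
		instrumentation_begin();
		panic("Can't handle #HV exception from unsupported context\n");
		instrumentation_end();
	}

sync:
	/*
	 * Found a safe stack - copy pt_regs off the IST stack so the
	 * #HV IST entry can be reused while the handler runs.
	 */
	sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
	regs_ret = (struct pt_regs *)sp;
	*regs_ret = *regs;

	return regs_ret;
}

The asm entry would presumably keep calling hv_switch_off_ist() before
the kernel-mode handler, the same way the #VC entry uses
vc_switch_off_ist().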