2020-03-19 09:16:33

by Joerg Roedel

Subject: [PATCH 70/70] x86/sev-es: Add NMI state tracking

From: Joerg Roedel <[email protected]>

Keep NMI state in SEV-ES code so the kernel can re-enable NMIs for the
vCPU when it reaches IRET.

Signed-off-by: Joerg Roedel <[email protected]>
---
arch/x86/entry/entry_64.S | 48 +++++++++++++++++++++++++++++++++
arch/x86/include/asm/sev-es.h | 27 +++++++++++++++++++
arch/x86/include/uapi/asm/svm.h | 1 +
arch/x86/kernel/nmi.c | 8 ++++++
arch/x86/kernel/sev-es.c | 31 ++++++++++++++++++++-
5 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 729876d368c5..355470b36896 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,6 +38,7 @@
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
+#include <asm/sev-es.h>
#include <linux/err.h>

#include "calling.h"
@@ -629,6 +630,13 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+
+ /*
+ * This code path is used by the NMI handler, so check if NMIs
+ * need to be re-enabled when running as an SEV-ES guest.
+ */
+ SEV_ES_IRET_CHECK
+
POP_REGS pop_rdi=0

/*
@@ -1474,6 +1482,8 @@ SYM_CODE_START(nmi)
movq $-1, %rsi
call do_nmi

+ SEV_ES_NMI_COMPLETE
+
/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
@@ -1599,6 +1609,7 @@ nested_nmi_out:
popq %rdx

/* We are returning to kernel mode, so this cannot result in a fault. */
+ SEV_ES_NMI_COMPLETE
iretq

first_nmi:
@@ -1687,6 +1698,12 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi

+ /*
+ * When running as an SEV-ES guest, jump to the SEV-ES NMI IRET
+ * path.
+ */
+ SEV_ES_NMI_COMPLETE
+
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

@@ -1715,6 +1732,9 @@ nmi_restore:
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */

+nmi_return:
+ UNWIND_HINT_IRET_REGS
+
/*
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
@@ -1724,6 +1744,34 @@ nmi_restore:
iretq
SYM_CODE_END(nmi)

+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_CODE_START(sev_es_iret_user)
+ UNWIND_HINT_IRET_REGS offset=8
+ /*
+ * The kernel jumps here directly from
+ * swapgs_restore_regs_and_return_to_usermode. %rsp already points to
+ * the trampoline stack, but %cr3 still holds the kernel CR3. User
+ * registers are live except %rdi. Switch to the user CR3, restore the
+ * user %rdi and user gs_base, and single-step over IRET.
+ */
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ popq %rdi
+ SWAPGS
+ /*
+ * Enable single-stepping and execute IRET. When IRET is
+ * finished the resulting #DB exception will cause a #VC
+ * exception to be raised. The #VC exception handler will send an
+ * NMI-complete message to the hypervisor to re-open the NMI
+ * window.
+ */
+sev_es_iret_kernel:
+ pushf
+ btsq $X86_EFLAGS_TF_BIT, (%rsp)
+ popf
+ iretq
+SYM_CODE_END(sev_es_iret_user)
+#endif
+
#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
index 63acf50e6280..d866adb3e6d4 100644
--- a/arch/x86/include/asm/sev-es.h
+++ b/arch/x86/include/asm/sev-es.h
@@ -8,6 +8,8 @@
#ifndef __ASM_ENCRYPTED_STATE_H
#define __ASM_ENCRYPTED_STATE_H

+#ifndef __ASSEMBLY__
+
#include <linux/types.h>
#include <asm/insn.h>

@@ -82,11 +84,36 @@ struct real_mode_header;

#ifdef CONFIG_AMD_MEM_ENCRYPT
int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
+void sev_es_nmi_enter(void);
#else /* CONFIG_AMD_MEM_ENCRYPT */
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
{
return 0;
}
+static inline void sev_es_nmi_enter(void) { }
+#endif /* CONFIG_AMD_MEM_ENCRYPT*/
+
+#else /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define SEV_ES_NMI_COMPLETE \
+ ALTERNATIVE "", "callq sev_es_nmi_complete", X86_FEATURE_SEV_ES_GUEST
+
+.macro SEV_ES_IRET_CHECK
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SEV_ES_GUEST
+ movq PER_CPU_VAR(sev_es_in_nmi), %rdi
+ testq %rdi, %rdi
+ jz .Lend_\@
+ callq sev_es_nmi_complete
+.Lend_\@:
+.endm
+
+#else /* CONFIG_AMD_MEM_ENCRYPT */
+#define SEV_ES_NMI_COMPLETE
+.macro SEV_ES_IRET_CHECK
+.endm
#endif /* CONFIG_AMD_MEM_ENCRYPT*/

+#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 20a05839dd9a..0f837339db66 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -84,6 +84,7 @@
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
+#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 54c21d6abd5a..7312a6d4d50f 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -37,6 +37,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
+#include <asm/sev-es.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -510,6 +511,13 @@ NOKPROBE_SYMBOL(is_debug_stack);
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
+ /*
+ * For SEV-ES the kernel needs to track whether NMIs are blocked until
+ * IRET is reached, even when the CPU is offline.
+ */
+ if (sev_es_active())
+ sev_es_nmi_enter();
+
if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
return;

diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 3c22f256645e..409a7a2aa630 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -37,6 +37,7 @@ struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
*/
struct ghcb __initdata *boot_ghcb;
static DEFINE_PER_CPU(unsigned long, cached_dr7) = DR7_RESET_VALUE;
+DEFINE_PER_CPU(bool, sev_es_in_nmi) = false;

struct ghcb_state {
struct ghcb *ghcb;
@@ -270,6 +271,31 @@ static phys_addr_t vc_slow_virt_to_phys(struct ghcb *ghcb, long vaddr)
/* Include code shared with pre-decompression boot stage */
#include "sev-es-shared.c"

+void sev_es_nmi_enter(void)
+{
+ this_cpu_write(sev_es_in_nmi, true);
+}
+
+void sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = sev_es_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ sev_es_put_ghcb(&state);
+
+ this_cpu_write(sev_es_in_nmi, false);
+}
+
static u64 sev_es_get_jump_table_addr(void)
{
struct ghcb_state state;
@@ -891,7 +917,10 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
static enum es_result vc_handle_db_exception(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
- do_debug(ctxt->regs, 0);
+ if (this_cpu_read(sev_es_in_nmi))
+ sev_es_nmi_complete();
+ else
+ do_debug(ctxt->regs, 0);

/* Exception event, do not advance RIP */
return ES_RETRY;
--
2.17.1


2020-03-19 15:37:35

by Andy Lutomirski

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 2:14 AM Joerg Roedel <[email protected]> wrote:
>
> From: Joerg Roedel <[email protected]>
>
> Keep NMI state in SEV-ES code so the kernel can re-enable NMIs for the
> vCPU when it reaches IRET.

IIRC I suggested just re-enabling NMI in C from do_nmi(). What was
wrong with that approach?

> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +SYM_CODE_START(sev_es_iret_user)
> + UNWIND_HINT_IRET_REGS offset=8
> + /*
> + * The kernel jumps here directly from
> + * swapgs_restore_regs_and_return_to_usermode. %rsp points already to
> + * trampoline stack, but %cr3 is still from kernel. User-regs are live
> + * except %rdi. Switch to user CR3, restore user %rdi and user gs_base
> + * and single-step over IRET
> + */
> + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
> + popq %rdi
> + SWAPGS
> + /*
> + * Enable single-stepping and execute IRET. When IRET is
> + * finished the resulting #DB exception will cause a #VC
> + * exception to be raised. The #VC exception handler will send a
> + * NMI-complete message to the hypervisor to re-open the NMI
> + * window.

This is distressing, to say the least. The sequence of events is, roughly:

1. We're here with NMI masking in an unknown state because do_nmi()
and any nested faults could have done IRET, at least architecturally.
NMI could occur or it could not. I suppose that, on SEV-ES, as least
on current CPUs, NMI is definitely masked. What about on newer CPUs?
What if we migrate?

> + */
> +sev_es_iret_kernel:
> + pushf
> + btsq $X86_EFLAGS_TF_BIT, (%rsp)
> + popf

Now we have TF on, NMIs (architecturally) in unknown state.

> + iretq

This causes us to pop the NMI frame off the stack. Assuming the NMI
restart logic is invoked (which is maybe impossible?), we get #DB,
which presumably is actually delivered. And we end up on the #DB
stack, which might already have been in use, so we have a potential
increase in nesting. Also, #DB may be called from an unexpected
context.

Now somehow #DB is supposed to invoke #VC, which is supposed to do the
magic hypercall, and all of this is supposed to be safe? Or is #DB
unconditionally redirected to #VC? What happens if we had no stack
(e.g. we interrupted SYSCALL) or we were already in #VC to begin with?

I think there are two credible ways to approach this:

1. Just put the NMI unmask in do_nmi(). The kernel *already* knows
how to handle running do_nmi() with NMIs unmasked. This is much, much
simpler than your code.

2. Have an entirely separate NMI path for the
SEV-ES-on-misdesigned-CPU case. And have very clear documentation for
what prevents this code from being executed on future CPUs (Zen3?)
that have this issue fixed for real?

This hybrid code is no good.

--Andy

2020-03-19 16:09:38

by Joerg Roedel

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

Hi Andy,

On Thu, Mar 19, 2020 at 08:35:59AM -0700, Andy Lutomirski wrote:
> On Thu, Mar 19, 2020 at 2:14 AM Joerg Roedel <[email protected]> wrote:
> >
> > From: Joerg Roedel <[email protected]>
> >
> > Keep NMI state in SEV-ES code so the kernel can re-enable NMIs for the
> > vCPU when it reaches IRET.
>
> IIRC I suggested just re-enabling NMI in C from do_nmi(). What was
> wrong with that approach?

If I understand the code correctly, a nested NMI will just reset the
interrupted NMI handler to start executing again at 'restart_nmi'.
The interrupted NMI handler could be in the #VC handler, and it is not
safe to just jump back to the start of the NMI handler from somewhere
within the #VC handler.

So I decided to not allow NMI nesting for SEV-ES and only re-enable the
NMI window when the first NMI returns. This is not implemented in this
patch, but I will do that once Thomas' entry-code rewrite is upstream.

> This causes us to pop the NMI frame off the stack. Assuming the NMI
> restart logic is invoked (which is maybe impossible?), we get #DB,
> which presumably is actually delivered. And we end up on the #DB
> stack, which might already have been in use, so we have a potential
> increase in nesting. Also, #DB may be called from an unexpected
> context.

An SEV-ES hypervisor is required to intercept #DB, which means that the
#DB exception actually ends up being a #VC exception. So it will not end
up on the #DB stack.

> Now somehow #DB is supposed to invoke #VC, which is supposed to do the
> magic hypercall, and all of this is supposed to be safe? Or is #DB
> unconditionally redirected to #VC? What happens if we had no stack
> (e.g. we interrupted SYSCALL) or we were already in #VC to begin with?

Yeah, as I said above, the #DB is redirected to #VC, as the hypervisor
has to intercept #DB.

The stack problem is the one that prevents the single-step-over-IRET
approach right now, because the NMI can hit while in kernel mode on the
entry stack, which the generic entry code (besides the NMI path) does
not handle.
Getting a #VC exception there (like after an IRET to that state) breaks
things.

Lastly, in this version of the patch-set the #VC handler became
nesting-safe: it detects whether the per-CPU GHCB is in use and
saves/restores its contents in that case.
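
In rough C, that save/restore logic looks something like this (a
minimal sketch only; the per-CPU runtime structure and its field names
here are illustrative, not lifted from the patches):

	static struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
	{
		struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
		struct ghcb *ghcb = &data->ghcb_page;

		if (unlikely(data->ghcb_active)) {
			/*
			 * GHCB already in use, e.g. by an interrupted #VC
			 * handler: save its contents to a backup page and
			 * remember it so sev_es_put_ghcb() can restore it.
			 */
			state->ghcb = &data->backup_ghcb;
			memcpy(state->ghcb, ghcb, sizeof(*ghcb));
		} else {
			state->ghcb = NULL;
			data->ghcb_active = true;
		}

		return ghcb;
	}

sev_es_put_ghcb() then either copies the backup back or clears the
active flag, depending on whether state->ghcb was set.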


> I think there are two credible ways to approach this:
>
> 1. Just put the NMI unmask in do_nmi(). The kernel *already* knows
> how to handle running do_nmi() with NMIs unmasked. This is much, much
> simpler than your code.

Right, and I thought about that, but the implication is that the
complexity is moved somewhere else, namely into the #VC handler, which
then has to be restartable.

> 2. Have an entirely separate NMI path for the
> SEV-ES-on-misdesigned-CPU case. And have very clear documentation for
> what prevents this code from being executed on future CPUs (Zen3?)
> that have this issue fixed for real?

That sounds like a good alternative, I will investigate this approach.
The NMI handler should be much simpler as it doesn't need to allow NMI
nesting. The question is: does the C code down the NMI path depend on
the NMI handler's stack-frame layout (e.g. the in-NMI flag)?

Regards,

Joerg

2020-03-19 16:54:34

by Mika Penttilä

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking


Hi!

On 19.3.2020 11.14, Joerg Roedel wrote:
> From: Joerg Roedel <[email protected]>
>
> Keep NMI state in SEV-ES code so the kernel can re-enable NMIs for the
> vCPU when it reaches IRET.
>
> Signed-off-by: Joerg Roedel <[email protected]>
> ---
> [...]
> +#ifdef CONFIG_AMD_MEM_ENCRYPT

> +SYM_CODE_START(sev_es_iret_user)


What makes the kernel jump here? I can't see it referenced from anywhere.


> + UNWIND_HINT_IRET_REGS offset=8
> + /*
> + * The kernel jumps here directly from
> + * swapgs_restore_regs_and_return_to_usermode. %rsp already points to
> + * the trampoline stack, but %cr3 still holds the kernel CR3. User
> + * registers are live except %rdi. Switch to the user CR3, restore the
> + * user %rdi and user gs_base, and single-step over IRET.
> + */
> + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
> + popq %rdi
> + SWAPGS
> + /*
> + * Enable single-stepping and execute IRET. When IRET is
> + * finished the resulting #DB exception will cause a #VC
> + * exception to be raised. The #VC exception handler will send an
> + * NMI-complete message to the hypervisor to re-open the NMI
> + * window.
> + */
> +sev_es_iret_kernel:
> + pushf
> + btsq $X86_EFLAGS_TF_BIT, (%rsp)
> + popf
> + iretq
> +SYM_CODE_END(sev_es_iret_user)
> +#endif
> [...]

2020-03-19 18:43:23

by Andy Lutomirski

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 9:07 AM Joerg Roedel <[email protected]> wrote:
>
> Hi Andy,
>
> On Thu, Mar 19, 2020 at 08:35:59AM -0700, Andy Lutomirski wrote:
> > On Thu, Mar 19, 2020 at 2:14 AM Joerg Roedel <[email protected]> wrote:
> > >
> > > From: Joerg Roedel <[email protected]>
> > >
> > > Keep NMI state in SEV-ES code so the kernel can re-enable NMIs for the
> > > vCPU when it reaches IRET.
> >
> > IIRC I suggested just re-enabling NMI in C from do_nmi(). What was
> > wrong with that approach?
>
> If I understand the code correctly a nested NMI will just reset the
> interrupted NMI handler to start executing again at 'restart_nmi'.
> The interrupted NMI handler could be in the #VC handler, and it is not
> safe to just jump back to the start of the NMI handler from somewhere
> within the #VC handler.

Nope. A nested NMI will reset the interrupted NMI's return frame to
cause it to run again when it's done. I don't think this will have
any real interaction with #VC. There's no longjmp() here.
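
In C-flavored pseudocode, the entry_64.S nesting logic amounts to
roughly the following; every helper here is a hypothetical stand-in for
an assembly sequence, and the flag lives in the "NMI executing" slot on
the NMI stack:

	/* Rough model of the x86-64 nested-NMI handling, not real code */
	extern bool nmi_executing_flag_set(void);  /* test "NMI executing" */
	extern void set_nmi_executing(void);
	extern void clear_nmi_executing(void);
	extern void rewrite_outer_iret_frame(void *target);
	extern void repeat_nmi(void);

	void nmi_vector_model(struct pt_regs *regs)
	{
		if (nmi_executing_flag_set()) {
			/*
			 * Nested NMI: point the outer NMI's saved iret
			 * frame at repeat_nmi so the outer handler runs
			 * once more when it finishes, then return.
			 */
			rewrite_outer_iret_frame(repeat_nmi);
			return;
		}

		set_nmi_executing();
		do_nmi(regs, -1);
		clear_nmi_executing();
		/* iretq: an NMI that nested meanwhile re-runs us */
	}

The interrupted context (including an interrupted #VC handler) resumes
only after the repeated NMI has run to completion, which is why no
longjmp()-like unwinding is involved.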

>
> So I decided to not allow NMI nesting for SEV-ES and only re-enable the
> NMI window when the first NMI returns. This is not implemented in this
> patch, but I will do that once Thomas' entry-code rewrite is upstream.
>

I certainly *like* preventing nesting, but I don't think we really
want a whole alternate NMI path just for a couple of messed-up AMD
generations. And the TF trick is not so pretty either.

> > This causes us to pop the NMI frame off the stack. Assuming the NMI
> > restart logic is invoked (which is maybe impossible?), we get #DB,
> > which presumably is actually delivered. And we end up on the #DB
> > stack, which might already have been in use, so we have a potential
> > increase in nesting. Also, #DB may be called from an unexpected
> > context.
>
> An SEV-ES hypervisor is required to intercept #DB, which means that the
> #DB exception actually ends up being a #VC exception. So it will not end
> up on the #DB stack.

With your patch set, #DB doesn't seem to end up on the #DB stack either.

>
> > I think there are two credible ways to approach this:
> >
> > 1. Just put the NMI unmask in do_nmi(). The kernel *already* knows
> > how to handle running do_nmi() with NMIs unmasked. This is much, much
> > simpler than your code.
>
> Right, and I thought about that, but the implication is that the
> complexity is moved somewhere else, namely into the #VC handler, which
> then has to be restartable.

As above, I don't think there's an actual problem here.

>
> > 2. Have an entirely separate NMI path for the
> > SEV-ES-on-misdesigned-CPU case. And have very clear documentation for
> > what prevents this code from being executed on future CPUs (Zen3?)
> > that have this issue fixed for real?
>
> That sounds like a good alternative, I will investigate this approach.
> The NMI handler should be much simpler as it doesn't need to allow NMI
> nesting. The question is, does the C code down the NMI path depend on
> the NMI handlers stack frame layout (e.g. the in-nmi flag)?

Nope. In particular, the 32-bit path doesn't have all this.

2020-03-19 19:34:26

by Joerg Roedel

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 11:40:39AM -0700, Andy Lutomirski wrote:

> Nope. A nested NMI will reset the interrupted NMI's return frame to
> cause it to run again when it's done. I don't think this will have
> any real interaction with #VC. There's no longjmp() here.

Ah, so I misunderstood that part; in that case your proposal of sending
the NMI-complete message right at the beginning of do_nmi() should work
just fine. I will test this and see how it works out.

> I certainly *like* preventing nesting, but I don't think we really
> want a whole alternate NMI path just for a couple of messed-up AMD
> generations. And the TF trick is not so pretty either.

Indeed, if it could be avoided, it should.

>
> > > This causes us to pop the NMI frame off the stack. Assuming the NMI
> > > restart logic is invoked (which is maybe impossible?), we get #DB,
> > > which presumably is actually delivered. And we end up on the #DB
> > > stack, which might already have been in use, so we have a potential
> > > increase in nesting. Also, #DB may be called from an unexpected
> > > context.
> >
> > An SEV-ES hypervisor is required to intercept #DB, which means that the
> > #DB exception actually ends up being a #VC exception. So it will not end
> > up on the #DB stack.
>
> With your patch set, #DB doesn't seem to end up on the #DB stack either.

Right, it does not use the #DB stack or the shift-ist stuff. Maybe it
should; is this needed for anything other than making the entry code
debuggable by kgdb?

Regards,

Joerg

2020-03-19 19:42:41

by Joerg Roedel

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 06:53:29PM +0200, Mika Penttilä wrote:
> > +SYM_CODE_START(sev_es_iret_user)
>
>
> What makes kernel jump here? Can't see this referenced from anywhere?

Sorry, it is just a left-over from a previous version of this patch
(which implemented the single-step-over-IRET approach). This label is not used
anymore. The jump to it was in
swapgs_restore_regs_and_return_to_usermode, after checking the
sev_es_in_nmi flag.

Regards,

Joerg

2020-03-19 21:29:58

by Andy Lutomirski

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 12:26 PM Joerg Roedel <[email protected]> wrote:
>
> On Thu, Mar 19, 2020 at 11:40:39AM -0700, Andy Lutomirski wrote:
>
> > Nope. A nested NMI will reset the interrupted NMI's return frame to
> > cause it to run again when it's done. I don't think this will have
> > any real interaction with #VC. There's no longjmp() here.
>
> Ahh, so I misunderstood that part, in this case your proposal of sending
> the NMI-complete message right at the beginning of do_nmi() should work
> just fine. I will test this and see how it works out.
>
> > I certainly *like* preventing nesting, but I don't think we really
> > want a whole alternate NMI path just for a couple of messed-up AMD
> > generations. And the TF trick is not so pretty either.
>
> Indeed, if it could be avoided, it should.
>
> >
> > > > This causes us to pop the NMI frame off the stack. Assuming the NMI
> > > > restart logic is invoked (which is maybe impossible?), we get #DB,
> > > > which presumably is actually delivered. And we end up on the #DB
> > > > stack, which might already have been in use, so we have a potential
> > > > increase in nesting. Also, #DB may be called from an unexpected
> > > > context.
> > >
> > > An SEV-ES hypervisor is required to intercept #DB, which means that the
> > > #DB exception actually ends up being a #VC exception. So it will not end
> > > up on the #DB stack.
> >
> > With your patch set, #DB doesn't seem to end up on the #DB stack either.
>
> Right, it does not use the #DB stack or shift-ist stuff. Maybe it
> should, is this needed for anything else than making entry code
> debugable by kgdb?

AIUI the shift-ist stuff is because we aren't very good about the way
that we handle tracing right now, and that can cause a limited degree
of recursion. #DB uses IST for historical reasons that don't
necessarily make sense. Right now, we need it for only one reason:
the MOV SS issue. IIRC this isn't actually triggerable without
debugging enabled -- MOV SS with no breakpoint but TF on doesn't seem
to malfunction quite as badly.

--Andy

>
> Regards,
>
> Joerg

2020-03-20 13:18:31

by Joerg Roedel

Subject: [RFC PATCH v2.1] x86/sev-es: Handle NMI State

On Thu, Mar 19, 2020 at 08:35:59AM -0700, Andy Lutomirski wrote:
> 1. Just put the NMI unmask in do_nmi(). The kernel *already* knows
> how to handle running do_nmi() with NMIs unmasked. This is much, much
> simpler than your code.

Okay, attached is the updated patch which implements this approach. I
tested it in an SEV-ES guest with 'perf top' running for a little more
than 30 minutes and all looked good. I also removed the dead code from
the patch.


From ec3b021c5d9130fd66e00d823c4fabc675c4b49e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <[email protected]>
Date: Tue, 28 Jan 2020 17:31:05 +0100
Subject: [PATCH] x86/sev-es: Handle NMI State

When running under SEV-ES the kernel has to tell the hypervisor when to
open the NMI window again after an NMI was injected. This is done with
an NMI-complete message to the hypervisor.

Add code to the kernel's NMI handler to send this message right at the
beginning of do_nmi(). This allows NMIs to nest, which the kernel can
handle safely.

Signed-off-by: Joerg Roedel <[email protected]>
---
arch/x86/include/asm/sev-es.h | 2 ++
arch/x86/include/uapi/asm/svm.h | 1 +
arch/x86/kernel/nmi.c | 8 ++++++++
arch/x86/kernel/sev-es.c | 18 ++++++++++++++++++
4 files changed, 29 insertions(+)

diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
index 63acf50e6280..441ec1ba2cc7 100644
--- a/arch/x86/include/asm/sev-es.h
+++ b/arch/x86/include/asm/sev-es.h
@@ -82,11 +82,13 @@ struct real_mode_header;

#ifdef CONFIG_AMD_MEM_ENCRYPT
int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
+void sev_es_nmi_complete(void);
#else /* CONFIG_AMD_MEM_ENCRYPT */
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
{
return 0;
}
+static inline void sev_es_nmi_complete(void) { }
#endif /* CONFIG_AMD_MEM_ENCRYPT*/

#endif
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 20a05839dd9a..0f837339db66 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -84,6 +84,7 @@
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
+#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 54c21d6abd5a..fc872a7e0ed1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -37,6 +37,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
+#include <asm/sev-es.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -510,6 +511,13 @@ NOKPROBE_SYMBOL(is_debug_stack);
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
+ /*
+ * Re-enable NMIs right here when running as an SEV-ES guest. This might
+ * cause nested NMIs, but those can be handled safely.
+ */
+ if (sev_es_active())
+ sev_es_nmi_complete();
+
if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
return;

diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 3c22f256645e..a7e2739771e7 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -270,6 +270,24 @@ static phys_addr_t vc_slow_virt_to_phys(struct ghcb *ghcb, long vaddr)
/* Include code shared with pre-decompression boot stage */
#include "sev-es-shared.c"

+void sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = sev_es_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ sev_es_put_ghcb(&state);
+}
+
static u64 sev_es_get_jump_table_addr(void)
{
struct ghcb_state state;
--
2.16.4

2020-03-20 14:43:34

by Dave Hansen

Subject: Re: [RFC PATCH v2.1] x86/sev-es: Handle NMI State

On 3/20/20 6:17 AM, Joerg Roedel wrote:
> On Thu, Mar 19, 2020 at 08:35:59AM -0700, Andy Lutomirski wrote:
>> 1. Just put the NMI unmask in do_nmi(). The kernel *already* knows
>> how to handle running do_nmi() with NMIs unmasked. This is much, much
>> simpler than your code.
> Okay, attached is the updated patch which implements this approach. I
> tested it in an SEV-ES guest with 'perf top' running for a little more
> than 30 minutes and all looked good. I also removed the dead code from
> the patch.

FWIW, perf plus the x86 selftests run in a big loop was my best way of
stressing the NMI path when we mucked with it for PTI. The selftests
make sure to hit some of the rarer entry/exit paths.

2020-03-20 19:43:57

by Joerg Roedel

Subject: Re: [RFC PATCH v2.1] x86/sev-es: Handle NMI State

On Fri, Mar 20, 2020 at 07:42:09AM -0700, Dave Hansen wrote:
> FWIW, perf plus the x86 selftests run in a big loop was my best way of
> stressing the NMI path when we mucked with it for PTI. The selftests
> make sure to hit some of the more rare entry/exit paths.

Yeah, I ran the x86 selftests in an SEV-ES guest on top of these
patches; that works. But doing this together with 'perf top' is also on
the list of tests to do.

Regards,

Joerg

2020-03-20 19:49:31

by Joerg Roedel

Subject: Re: [PATCH 70/70] x86/sev-es: Add NMI state tracking

On Thu, Mar 19, 2020 at 02:27:49PM -0700, Andy Lutomirski wrote:
> AIUI the shift-ist stuff is because we aren't very good about the way
> that we handle tracing right now, and that can cause a limited degree
> of recursion. #DB uses IST for historical reasons that don't
> necessarily make sense. Right now, we need it for only one reason:
> the MOV SS issue. IIRC this isn't actually triggerable without
> debugging enabled -- MOV SS with no breakpoint but TF on doesn't seem
> to malfunction quite as badly.

I had a look at the shift_ist stuff today and it looks like a good
solution to the #VC nesting problem when it is applied to the #VC
handler. The devil is in the details, of course, as the 3 or 4 per-CPU
stacks for the #VC handler should only be allocated when actually
running in an SEV-ES guest. Let's see how this works out in practice.
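
A minimal sketch of the conditional allocation, with hypothetical names
(the hook and the stack accounting would have to fit whatever the
entry-code rewrite ends up looking like):

	/* Illustrative only: allocate #VC IST stacks just for SEV-ES guests */
	#define VC_STACK_LEVELS	4

	static DEFINE_PER_CPU(unsigned long, vc_stack_base);

	static int __init sev_es_alloc_vc_stacks(void)
	{
		int cpu;

		if (!sev_es_active())
			return 0;	/* bare metal or plain SEV: nothing to do */

		for_each_possible_cpu(cpu) {
			void *stack = kmalloc_node(VC_STACK_LEVELS * EXCEPTION_STKSZ,
						   GFP_KERNEL, cpu_to_node(cpu));
			if (!stack)
				return -ENOMEM;

			per_cpu(vc_stack_base, cpu) = (unsigned long)stack;
		}

		return 0;
	}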

Regards,

Joerg