2022-01-24 19:24:10

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv2 05/29] x86/tdx: Add HLT support for TDX guests

The HLT instruction is a privileged instruction, executing it stops
instruction execution and places the processor in a HALT state. It
is used in kernel for cases like reboot, idle loop and exception fixup
handlers. For the idle case, interrupts will be enabled (using STI)
before the HLT instruction (this is also called safe_halt()).

To support the HLT instruction in TDX guests, it needs to be emulated
using TDVMCALL (hypercall to VMM). More details about it can be found
in Intel Trust Domain Extensions (Intel TDX) Guest-Host-Communication
Interface (GHCI) specification, section TDVMCALL[Instruction.HLT].

In TDX guests, executing HLT instruction will generate a #VE, which is
used to emulate the HLT instruction. But #VE based emulation will not
work for the safe_halt() flavor, because it requires STI instruction to
be executed just before the TDCALL. Since idle loop is the only user of
safe_halt() variant, handle it as a special case.

To avoid *safe_halt() call in the idle function, define the
tdx_guest_idle() and use it to override the "x86_idle" function pointer
for a valid TDX guest.

Alternative choices like PV ops have been considered for adding
safe_halt() support. But it was rejected because HLT paravirt calls
only exist under PARAVIRT_XXL, and enabling it in TDX guest just for
safe_halt() use case is not worth the cost.

Co-developed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/tdx.h | 3 ++
arch/x86/kernel/process.c | 5 +++
arch/x86/kernel/tdcall.S | 31 +++++++++++++++++
arch/x86/kernel/tdx.c | 70 ++++++++++++++++++++++++++++++++++++--
4 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index d17143290f0a..9b4714a45bb9 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -74,10 +74,13 @@ bool tdx_get_ve_info(struct ve_info *ve);

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);

+void tdx_safe_halt(void);
+
#else

static inline void tdx_early_init(void) { };
static inline bool is_tdx_guest(void) { return false; }
+static inline void tdx_safe_halt(void) { };

#endif /* CONFIG_INTEL_TDX_GUEST */

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 81d8ef036637..d48afc69ebfa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,6 +46,7 @@
#include <asm/proto.h>
#include <asm/frame.h>
#include <asm/unwind.h>
+#include <asm/tdx.h>

#include "process.h"

@@ -870,6 +871,10 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
x86_idle = mwait_idle;
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ x86_idle = tdx_safe_halt;
+ return;
} else
x86_idle = default_idle;
}
diff --git a/arch/x86/kernel/tdcall.S b/arch/x86/kernel/tdcall.S
index 46a49a96cf6c..ae74da33ccc6 100644
--- a/arch/x86/kernel/tdcall.S
+++ b/arch/x86/kernel/tdcall.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/frame.h>
#include <asm/unwind_hints.h>
+#include <uapi/asm/vmx.h>

#include <linux/linkage.h>
#include <linux/bits.h>
@@ -39,6 +40,12 @@
*/
#define tdcall .byte 0x66,0x0f,0x01,0xcc

+/*
+ * Used in __tdx_hypercall() to determine whether to enable interrupts
+ * before issuing TDCALL for the EXIT_REASON_HLT case.
+ */
+#define ENABLE_IRQS_BEFORE_HLT 0x01
+
/*
* __tdx_module_call() - Used by TDX guests to request services from
* the TDX module (does not include VMM services).
@@ -230,6 +237,30 @@ SYM_FUNC_START(__tdx_hypercall)

movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx

+ /*
+ * For the idle loop STI needs to be called directly before
+ * the TDCALL that enters idle (EXIT_REASON_HLT case). STI
+ * instruction enables interrupts only one instruction later.
+ * If there is a window between STI and the instruction that
+ * emulates the HALT state, there is a chance for interrupts to
+ * happen in this window, which can delay the HLT operation
+ * indefinitely. Since this is the not the desired result,
+ * conditionally call STI before TDCALL.
+ *
+ * Since STI instruction is only required for the idle case
+ * (a special case of EXIT_REASON_HLT), use the r15 register
+ * value to identify it. Since the R15 register is not used
+ * by the VMM as per EXIT_REASON_HLT ABI, re-use it in
+ * software to identify the STI case.
+ */
+ cmpl $EXIT_REASON_HLT, %r11d
+ jne .Lskip_sti
+ cmpl $ENABLE_IRQS_BEFORE_HLT, %r15d
+ jne .Lskip_sti
+ /* Set R15 register to 0, it is unused in EXIT_REASON_HLT case */
+ xor %r15, %r15
+ sti
+.Lskip_sti:
tdcall

/* Restore output pointer to R9 */
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 5a5b25f9c4d3..eeb456631a65 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -6,6 +6,7 @@

#include <linux/cpufeature.h>
#include <asm/tdx.h>
+#include <asm/vmx.h>

/* TDX module Call Leaf IDs */
#define TDX_GET_VEINFO 3
@@ -35,6 +36,61 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14,
return out->r10;
}

+static u64 __cpuidle _tdx_halt(const bool irq_disabled, const bool do_sti)
+{
+ /*
+ * Emulate HLT operation via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), sec 3.8 TDG.VP.VMCALL<Instruction.HLT>.
+ *
+ * The VMM uses the "IRQ disabled" param to understand IRQ
+ * enabled status (RFLAGS.IF) of the TD guest and to determine
+ * whether or not it should schedule the halted vCPU if an
+ * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
+ * can keep the vCPU in virtual HLT, even if an IRQ is
+ * pending, without hanging/breaking the guest.
+ *
+ * do_sti parameter is used by the __tdx_hypercall() to decide
+ * whether to call the STI instruction before executing the
+ * TDCALL instruction.
+ */
+ return _tdx_hypercall(EXIT_REASON_HLT, irq_disabled, 0, 0,
+ do_sti, NULL);
+}
+
+static bool tdx_halt(void)
+{
+ /*
+ * Since non safe halt is mainly used in CPU offlining
+ * and the guest will always stay in the halt state, don't
+ * call the STI instruction (set do_sti as false).
+ */
+ const bool irq_disabled = irqs_disabled();
+ const bool do_sti = false;
+
+ if (_tdx_halt(irq_disabled, do_sti))
+ return false;
+
+ return true;
+}
+
+void __cpuidle tdx_safe_halt(void)
+{
+ /*
+ * For do_sti=true case, __tdx_hypercall() function enables
+ * interrupts using the STI instruction before the TDCALL. So
+ * set irq_disabled as false.
+ */
+ const bool irq_disabled = false;
+ const bool do_sti = true;
+
+ /*
+ * Use WARN_ONCE() to report the failure.
+ */
+ if (_tdx_halt(irq_disabled, do_sti))
+ WARN_ONCE(1, "HLT instruction emulation failed\n");
+}
+
bool tdx_get_ve_info(struct ve_info *ve)
{
struct tdx_module_output out;
@@ -75,8 +131,18 @@ static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
/* Handle the kernel #VE */
static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
- pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ bool ret = false;
+
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ ret = tdx_halt();
+ break;
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ break;
+ }
+
+ return ret;
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
--
2.34.1


2022-02-01 02:14:44

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCHv2 05/29] x86/tdx: Add HLT support for TDX guests

On Mon, Jan 24, 2022 at 06:01:51PM +0300, Kirill A. Shutemov wrote:
> @@ -870,6 +871,10 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
> } else if (prefer_mwait_c1_over_halt(c)) {
> pr_info("using mwait in idle threads\n");
> x86_idle = mwait_idle;
> + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
> + pr_info("using TDX aware idle routine\n");
> + x86_idle = tdx_safe_halt;
> + return;

Forgot to remove that "return".

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-01 08:54:15

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

The HLT instruction is a privileged instruction, executing it stops
instruction execution and places the processor in a HALT state. It
is used in kernel for cases like reboot, idle loop and exception fixup
handlers. For the idle case, interrupts will be enabled (using STI)
before the HLT instruction (this is also called safe_halt()).

To support the HLT instruction in TDX guests, it needs to be emulated
using TDVMCALL (hypercall to VMM). More details about it can be found
in Intel Trust Domain Extensions (Intel TDX) Guest-Host-Communication
Interface (GHCI) specification, section TDVMCALL[Instruction.HLT].

In TDX guests, executing HLT instruction will generate a #VE, which is
used to emulate the HLT instruction. But #VE based emulation will not
work for the safe_halt() flavor, because it requires STI instruction to
be executed just before the TDCALL. Since idle loop is the only user of
safe_halt() variant, handle it as a special case.

To avoid *safe_halt() call in the idle function, define the
tdx_guest_idle() and use it to override the "x86_idle" function pointer
for a valid TDX guest.

Alternative choices like PV ops have been considered for adding
safe_halt() support. But it was rejected because HLT paravirt calls
only exist under PARAVIRT_XXL, and enabling it in TDX guest just for
safe_halt() use case is not worth the cost.

Co-developed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/tdx.h | 3 ++
arch/x86/kernel/process.c | 4 +++
arch/x86/kernel/tdcall.S | 31 +++++++++++++++++
arch/x86/kernel/tdx.c | 70 ++++++++++++++++++++++++++++++++++++--
4 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index d17143290f0a..9b4714a45bb9 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -74,10 +74,13 @@ bool tdx_get_ve_info(struct ve_info *ve);

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);

+void tdx_safe_halt(void);
+
#else

static inline void tdx_early_init(void) { };
static inline bool is_tdx_guest(void) { return false; }
+static inline void tdx_safe_halt(void) { };

#endif /* CONFIG_INTEL_TDX_GUEST */

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 81d8ef036637..71aa12082370 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,6 +46,7 @@
#include <asm/proto.h>
#include <asm/frame.h>
#include <asm/unwind.h>
+#include <asm/tdx.h>

#include "process.h"

@@ -870,6 +871,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
x86_idle = mwait_idle;
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ x86_idle = tdx_safe_halt;
} else
x86_idle = default_idle;
}
diff --git a/arch/x86/kernel/tdcall.S b/arch/x86/kernel/tdcall.S
index 46a49a96cf6c..ae74da33ccc6 100644
--- a/arch/x86/kernel/tdcall.S
+++ b/arch/x86/kernel/tdcall.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/frame.h>
#include <asm/unwind_hints.h>
+#include <uapi/asm/vmx.h>

#include <linux/linkage.h>
#include <linux/bits.h>
@@ -39,6 +40,12 @@
*/
#define tdcall .byte 0x66,0x0f,0x01,0xcc

+/*
+ * Used in __tdx_hypercall() to determine whether to enable interrupts
+ * before issuing TDCALL for the EXIT_REASON_HLT case.
+ */
+#define ENABLE_IRQS_BEFORE_HLT 0x01
+
/*
* __tdx_module_call() - Used by TDX guests to request services from
* the TDX module (does not include VMM services).
@@ -230,6 +237,30 @@ SYM_FUNC_START(__tdx_hypercall)

movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx

+ /*
+ * For the idle loop STI needs to be called directly before
+ * the TDCALL that enters idle (EXIT_REASON_HLT case). STI
+ * instruction enables interrupts only one instruction later.
+ * If there is a window between STI and the instruction that
+ * emulates the HALT state, there is a chance for interrupts to
+ * happen in this window, which can delay the HLT operation
+ * indefinitely. Since this is the not the desired result,
+ * conditionally call STI before TDCALL.
+ *
+ * Since STI instruction is only required for the idle case
+ * (a special case of EXIT_REASON_HLT), use the r15 register
+ * value to identify it. Since the R15 register is not used
+ * by the VMM as per EXIT_REASON_HLT ABI, re-use it in
+ * software to identify the STI case.
+ */
+ cmpl $EXIT_REASON_HLT, %r11d
+ jne .Lskip_sti
+ cmpl $ENABLE_IRQS_BEFORE_HLT, %r15d
+ jne .Lskip_sti
+ /* Set R15 register to 0, it is unused in EXIT_REASON_HLT case */
+ xor %r15, %r15
+ sti
+.Lskip_sti:
tdcall

/* Restore output pointer to R9 */
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 5a5b25f9c4d3..eeb456631a65 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -6,6 +6,7 @@

#include <linux/cpufeature.h>
#include <asm/tdx.h>
+#include <asm/vmx.h>

/* TDX module Call Leaf IDs */
#define TDX_GET_VEINFO 3
@@ -35,6 +36,61 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14,
return out->r10;
}

+static u64 __cpuidle _tdx_halt(const bool irq_disabled, const bool do_sti)
+{
+ /*
+ * Emulate HLT operation via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), sec 3.8 TDG.VP.VMCALL<Instruction.HLT>.
+ *
+ * The VMM uses the "IRQ disabled" param to understand IRQ
+ * enabled status (RFLAGS.IF) of the TD guest and to determine
+ * whether or not it should schedule the halted vCPU if an
+ * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
+ * can keep the vCPU in virtual HLT, even if an IRQ is
+ * pending, without hanging/breaking the guest.
+ *
+ * do_sti parameter is used by the __tdx_hypercall() to decide
+ * whether to call the STI instruction before executing the
+ * TDCALL instruction.
+ */
+ return _tdx_hypercall(EXIT_REASON_HLT, irq_disabled, 0, 0,
+ do_sti, NULL);
+}
+
+static bool tdx_halt(void)
+{
+ /*
+ * Since non safe halt is mainly used in CPU offlining
+ * and the guest will always stay in the halt state, don't
+ * call the STI instruction (set do_sti as false).
+ */
+ const bool irq_disabled = irqs_disabled();
+ const bool do_sti = false;
+
+ if (_tdx_halt(irq_disabled, do_sti))
+ return false;
+
+ return true;
+}
+
+void __cpuidle tdx_safe_halt(void)
+{
+ /*
+ * For do_sti=true case, __tdx_hypercall() function enables
+ * interrupts using the STI instruction before the TDCALL. So
+ * set irq_disabled as false.
+ */
+ const bool irq_disabled = false;
+ const bool do_sti = true;
+
+ /*
+ * Use WARN_ONCE() to report the failure.
+ */
+ if (_tdx_halt(irq_disabled, do_sti))
+ WARN_ONCE(1, "HLT instruction emulation failed\n");
+}
+
bool tdx_get_ve_info(struct ve_info *ve)
{
struct tdx_module_output out;
@@ -75,8 +131,18 @@ static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
/* Handle the kernel #VE */
static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
- pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ bool ret = false;
+
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ ret = tdx_halt();
+ break;
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ break;
+ }
+
+ return ret;
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
--
2.34.1

2022-02-02 11:51:47

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Sun, Jan 30 2022 at 01:30, Kirill A. Shutemov wrote:
> +/*
> + * Used in __tdx_hypercall() to determine whether to enable interrupts
> + * before issuing TDCALL for the EXIT_REASON_HLT case.
> + */
> +#define ENABLE_IRQS_BEFORE_HLT 0x01
> +
> /*
> * __tdx_module_call() - Used by TDX guests to request services from
> * the TDX module (does not include VMM services).
> @@ -230,6 +237,30 @@ SYM_FUNC_START(__tdx_hypercall)
>
> movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
>
> + /*
> + * For the idle loop STI needs to be called directly before
> + * the TDCALL that enters idle (EXIT_REASON_HLT case). STI
> + * instruction enables interrupts only one instruction later.
> + * If there is a window between STI and the instruction that
> + * emulates the HALT state, there is a chance for interrupts to
> + * happen in this window, which can delay the HLT operation
> + * indefinitely. Since this is the not the desired result,
> + * conditionally call STI before TDCALL.
> + *
> + * Since STI instruction is only required for the idle case
> + * (a special case of EXIT_REASON_HLT), use the r15 register
> + * value to identify it. Since the R15 register is not used
> + * by the VMM as per EXIT_REASON_HLT ABI, re-use it in
> + * software to identify the STI case.
> + */
> + cmpl $EXIT_REASON_HLT, %r11d
> + jne .Lskip_sti
> + cmpl $ENABLE_IRQS_BEFORE_HLT, %r15d
> + jne .Lskip_sti
> + /* Set R15 register to 0, it is unused in EXIT_REASON_HLT case */
> + xor %r15, %r15
> + sti
> +.Lskip_sti:
> tdcall

This really can be simplified:

cmpl $EXIT_REASON_SAFE_HLT, %r11d
jne .Lnohalt
movl $EXIT_REASON_HLT, %r11d
sti
.Lnohalt:
tdcall

and the below becomes:

static bool tdx_halt(void)
{
return !!__tdx_hypercall(EXIT_REASON_HLT, !!irqs_disabled(), 0, 0, 0, NULL);
}

void __cpuidle tdx_safe_halt(void)
{
if (__tdx_hypercall(EXIT_REASON_SAFE_HLT, 0, 0, 0, 0, NULL)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}

Hmm?

Thanks,

tglx

2022-02-03 06:10:31

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Tue, Feb 01, 2022 at 10:21:58PM +0100, Thomas Gleixner wrote:
> On Sun, Jan 30 2022 at 01:30, Kirill A. Shutemov wrote:
> > +/*
> > + * Used in __tdx_hypercall() to determine whether to enable interrupts
> > + * before issuing TDCALL for the EXIT_REASON_HLT case.
> > + */
> > +#define ENABLE_IRQS_BEFORE_HLT 0x01
> > +
> > /*
> > * __tdx_module_call() - Used by TDX guests to request services from
> > * the TDX module (does not include VMM services).
> > @@ -230,6 +237,30 @@ SYM_FUNC_START(__tdx_hypercall)
> >
> > movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
> >
> > + /*
> > + * For the idle loop STI needs to be called directly before
> > + * the TDCALL that enters idle (EXIT_REASON_HLT case). STI
> > + * instruction enables interrupts only one instruction later.
> > + * If there is a window between STI and the instruction that
> > + * emulates the HALT state, there is a chance for interrupts to
> > + * happen in this window, which can delay the HLT operation
> > + * indefinitely. Since this is the not the desired result,
> > + * conditionally call STI before TDCALL.
> > + *
> > + * Since STI instruction is only required for the idle case
> > + * (a special case of EXIT_REASON_HLT), use the r15 register
> > + * value to identify it. Since the R15 register is not used
> > + * by the VMM as per EXIT_REASON_HLT ABI, re-use it in
> > + * software to identify the STI case.
> > + */
> > + cmpl $EXIT_REASON_HLT, %r11d
> > + jne .Lskip_sti
> > + cmpl $ENABLE_IRQS_BEFORE_HLT, %r15d
> > + jne .Lskip_sti
> > + /* Set R15 register to 0, it is unused in EXIT_REASON_HLT case */
> > + xor %r15, %r15
> > + sti
> > +.Lskip_sti:
> > tdcall
>
> This really can be simplified:
>
> cmpl $EXIT_REASON_SAFE_HLT, %r11d
> jne .Lnohalt
> movl $EXIT_REASON_HLT, %r11d
> sti
> .Lnohalt:
> tdcall
>
> and the below becomes:
>
> static bool tdx_halt(void)
> {
> return !!__tdx_hypercall(EXIT_REASON_HLT, !!irqs_disabled(), 0, 0, 0, NULL);
> }
>
> void __cpuidle tdx_safe_halt(void)
> {
> if (__tdx_hypercall(EXIT_REASON_SAFE_HLT, 0, 0, 0, 0, NULL)
> WARN_ONCE(1, "HLT instruction emulation failed\n");
> }
>
> Hmm?

EXIT_REASON_* are architectural, see SDM vol 3D, appendix C. There's no
EXIT_REASON_SAFE_HLT.

Do you want to define a synthetic one? Like

#define EXIT_REASON_SAFE_HLT 0x10000

?

Looks dubious to me, I donno. I worry about possible conflicts with the
spec in the future.

--
Kirill A. Shutemov

2022-02-03 13:52:04

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Wed, Feb 02 2022 at 15:48, Kirill A. Shutemov wrote:

> On Tue, Feb 01, 2022 at 10:21:58PM +0100, Thomas Gleixner wrote:
>> On Sun, Jan 30 2022 at 01:30, Kirill A. Shutemov wrote:
>> This really can be simplified:
>>
>> cmpl $EXIT_REASON_SAFE_HLT, %r11d
>> jne .Lnohalt
>> movl $EXIT_REASON_HLT, %r11d
>> sti
>> .Lnohalt:
>> tdcall
>>
>> and the below becomes:
>>
>> static bool tdx_halt(void)
>> {
>> return !!__tdx_hypercall(EXIT_REASON_HLT, !!irqs_disabled(), 0, 0, 0, NULL);
>> }
>>
>> void __cpuidle tdx_safe_halt(void)
>> {
>> if (__tdx_hypercall(EXIT_REASON_SAFE_HLT, 0, 0, 0, 0, NULL)
>> WARN_ONCE(1, "HLT instruction emulation failed\n");
>> }
>>
>> Hmm?
>
> EXIT_REASON_* are architectural, see SDM vol 3D, appendix C. There's no
> EXIT_REASON_SAFE_HLT.
>
> Do you want to define a synthetic one? Like
>
> #define EXIT_REASON_SAFE_HLT 0x10000
> ?

That was my idea, yes.

> Looks dubious to me, I donno. I worry about possible conflicts with the
> spec in the future.

The spec should have a reserved space for such things :)

But you might think about having a in/out struct similar to the module
call or just an array of u64.

and the signature would become:

__tdx_hypercall(u64 op, u64 flags, struct inout *args)
__tdx_hypercall(u64 op, u64 flags, u64 *args)

and have flag bits:

HCALL_ISSUE_STI
HCALL_HAS_OUTPUT

Hmm?

Thanks,

tglx

2022-02-06 08:21:23

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Wed, Feb 02, 2022 at 06:17:08PM +0100, Thomas Gleixner wrote:
> On Wed, Feb 02 2022 at 15:48, Kirill A. Shutemov wrote:
>
> > On Tue, Feb 01, 2022 at 10:21:58PM +0100, Thomas Gleixner wrote:
> >> On Sun, Jan 30 2022 at 01:30, Kirill A. Shutemov wrote:
> >> This really can be simplified:
> >>
> >> cmpl $EXIT_REASON_SAFE_HLT, %r11d
> >> jne .Lnohalt
> >> movl $EXIT_REASON_HLT, %r11d
> >> sti
> >> .Lnohalt:
> >> tdcall
> >>
> >> and the below becomes:
> >>
> >> static bool tdx_halt(void)
> >> {
> >> return !!__tdx_hypercall(EXIT_REASON_HLT, !!irqs_disabled(), 0, 0, 0, NULL);
> >> }
> >>
> >> void __cpuidle tdx_safe_halt(void)
> >> {
> >> if (__tdx_hypercall(EXIT_REASON_SAFE_HLT, 0, 0, 0, 0, NULL)
> >> WARN_ONCE(1, "HLT instruction emulation failed\n");
> >> }
> >>
> >> Hmm?
> >
> > EXIT_REASON_* are architectural, see SDM vol 3D, appendix C. There's no
> > EXIT_REASON_SAFE_HLT.
> >
> > Do you want to define a synthetic one? Like
> >
> > #define EXIT_REASON_SAFE_HLT 0x10000
> > ?
>
> That was my idea, yes.
>
> > Looks dubious to me, I donno. I worry about possible conflicts with the
> > spec in the future.
>
> The spec should have a reserved space for such things :)
>
> But you might think about having a in/out struct similar to the module
> call or just an array of u64.
>
> and the signature would become:
>
> __tdx_hypercall(u64 op, u64 flags, struct inout *args)
> __tdx_hypercall(u64 op, u64 flags, u64 *args)
>
> and have flag bits:
>
> HCALL_ISSUE_STI
> HCALL_HAS_OUTPUT
>
> Hmm?

We have two distinct cases: standard hypercalls (defined in GHCI) and KVM
hypercalls. In the first case R10 is 0 (indicating standard TDVMCALL) and
R11 defines the operation. For KVM hypercalls R10 encodes the operation
(KVM hypercalls indexed from 1) and R11 is the first argument. So we
cannot get away with simple "RDI is op" interface.

And we need to return two values: RAX indicates if TDCALL itself was
successful and R10 is result of the hypercall. So we cannot easily get
away without output struct. HCALL_HAS_OUTPUT is not needed.

I would rather keep assembly side simple: shuffle values from the struct
to registers and back. C side is resposible for making sense of the
registers.

With all this in mind the __tdx_hypercall() will boil down to

u64 __tdx_hypercall(struct tdx_hypercall_args *args, u64 flags);

with the only flag HCALL_ISSUE_STI. Is it what you want to see?

I personally don't see why flag is better than synthetic argument as we
have now.

--
Kirill A. Shutemov

2022-02-08 13:17:14

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Fri, Feb 04, 2022, Kirill A. Shutemov wrote:
> > > Looks dubious to me, I donno. I worry about possible conflicts with the
> > > spec in the future.
> >
> > The spec should have a reserved space for such things :)

Heh, the problem is someone has to deal with munging the two things together.
E.g. if there's a EXIT_REASON_SAFE_HLT then the hypervisor would need a handler
that's identical to EXIT_REASON_HLT, except with guest.EFLAGS.IF forced to '1'.
The guest gets the short end of the stick because EXIT_REASON_HLT is already an
established VM-Exit reason.

> > But you might think about having a in/out struct similar to the module
> > call or just an array of u64.
> >
> > and the signature would become:
> >
> > __tdx_hypercall(u64 op, u64 flags, struct inout *args)
> > __tdx_hypercall(u64 op, u64 flags, u64 *args)
> >
> > and have flag bits:
> >
> > HCALL_ISSUE_STI
> > HCALL_HAS_OUTPUT
> >
> > Hmm?
>
> We have two distinct cases: standard hypercalls (defined in GHCI) and KVM
> hypercalls. In the first case R10 is 0 (indicating standard TDVMCALL) and
> R11 defines the operation. For KVM hypercalls R10 encodes the operation
> (KVM hypercalls indexed from 1) and R11 is the first argument. So we
> cannot get away with simple "RDI is op" interface.
>
> And we need to return two values: RAX indicates if TDCALL itself was
> successful and R10 is result of the hypercall. So we cannot easily get
> away without output struct. HCALL_HAS_OUTPUT is not needed.

But __tdx_hypercall() should never fail TDCALL. The TDX spec even says:

RAX TDCALL instruction return code. Always returns Intel TDX_SUCCESS (0).

IIRC, the original PoC went straight to a ud2 if tdcall failed. Why not do
something similar? That would get rid of the bajillion instances of:

if (__tdx_hypercall(...))
panic("Hypercall fn %llu failed (Buggy TDX module!)\n", fn);

E.g.

diff --git a/arch/x86/kernel/tdcall.S b/arch/x86/kernel/tdcall.S
index fde628791100..04284f0c198e 100644
--- a/arch/x86/kernel/tdcall.S
+++ b/arch/x86/kernel/tdcall.S
@@ -170,8 +170,10 @@ SYM_FUNC_START(__tdx_hypercall)
.Lskip_sti:
tdcall

+ test %rax, %rax,
+ jnz .Lerror
+
/* Copy hypercall result registers to arg struct: */
- movq %r10, TDX_HYPERCALL_r10(%rdi)
movq %r11, TDX_HYPERCALL_r11(%rdi)
movq %r12, TDX_HYPERCALL_r12(%rdi)
movq %r13, TDX_HYPERCALL_r13(%rdi)
@@ -194,7 +196,13 @@ SYM_FUNC_START(__tdx_hypercall)
pop %r14
pop %r15

- FRAME_END
+ FRAME_END
+
+ movq %r10, %rax
+ retq
+
+.Lerror:
+ <move stuff into correct registers if necessary>
+ call tdx_hypercall_error

- retq
SYM_FUNC_END(__tdx_hypercall)

2022-02-09 16:45:37

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Mon, Feb 07, 2022 at 10:52:19PM +0000, Sean Christopherson wrote:
> On Fri, Feb 04, 2022, Kirill A. Shutemov wrote:
> > > > Looks dubious to me, I donno. I worry about possible conflicts with the
> > > > spec in the future.
> > >
> > > The spec should have a reserved space for such things :)
>
> Heh, the problem is someone has to deal with munging the two things together.
> E.g. if there's a EXIT_REASON_SAFE_HLT then the hypervisor would need a handler
> that's identical to EXIT_REASON_HLT, except with guest.EFLAGS.IF forced to '1'.
> The guest gets the short end of the stick because EXIT_REASON_HLT is already an
> established VM-Exit reason.
>
> > > But you might think about having a in/out struct similar to the module
> > > call or just an array of u64.
> > >
> > > and the signature would become:
> > >
> > > __tdx_hypercall(u64 op, u64 flags, struct inout *args)
> > > __tdx_hypercall(u64 op, u64 flags, u64 *args)
> > >
> > > and have flag bits:
> > >
> > > HCALL_ISSUE_STI
> > > HCALL_HAS_OUTPUT
> > >
> > > Hmm?
> >
> > We have two distinct cases: standard hypercalls (defined in GHCI) and KVM
> > hypercalls. In the first case R10 is 0 (indicating standard TDVMCALL) and
> > R11 defines the operation. For KVM hypercalls R10 encodes the operation
> > (KVM hypercalls indexed from 1) and R11 is the first argument. So we
> > cannot get away with simple "RDI is op" interface.
> >
> > And we need to return two values: RAX indicates if TDCALL itself was
> > successful and R10 is result of the hypercall. So we cannot easily get
> > away without output struct. HCALL_HAS_OUTPUT is not needed.
>
> But __tdx_hypercall() should never fail TDCALL. The TDX spec even says:
>
> RAX TDCALL instruction return code. Always returns Intel TDX_SUCCESS (0).
>
> IIRC, the original PoC went straight to a ud2 if tdcall failed. Why not do
> something similar? That would get rid of the bajillion instances of:
>
> if (__tdx_hypercall(...))
> panic("Hypercall fn %llu failed (Buggy TDX module!)\n", fn);

Okay, below is my take on it. Boot tested.

I used UD2 as a way to deal with it-never-heppens condition. Not sure if
it the right way, but stack trace looks useful and I see it being used for
CONFIG_DEBUG_ENTRY.

Any comments?

/*
* __tdx_hypercall() - Make hypercalls to a TDX VMM.
*
* Transforms values in function call argument struct tdx_hypercall_args @args
* into the TDCALL register ABI. After TDCALL operation, VMM output is saved
* back in @args.
*
*-------------------------------------------------------------------------
* TD VMCALL ABI:
*-------------------------------------------------------------------------
*
* Input Registers:
*
* RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL)
* RCX - BITMAP which controls which part of TD Guest GPR
* is passed as-is to the VMM and back.
* R10 - Set 0 to indicate TDCALL follows standard TDX ABI
* specification. Non zero value indicates vendor
* specific ABI.
* R11 - VMCALL sub function number
* RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments.
* R8-R9, R12-R15 - Same as above.
*
* Output Registers:
*
* RAX - TDCALL instruction status (Not related to hypercall
* output).
* R10 - Hypercall output error code.
* R11-R15 - Hypercall sub function specific output values.
*
*-------------------------------------------------------------------------
*
* __tdx_hypercall() function ABI:
*
* @args (RDI) - struct tdx_hypercall_args for input and output
* @flags (RSI) - TDX_HCALL_* flags
*
* On successful completion, return the hypercall error code.
*/
SYM_FUNC_START(__tdx_hypercall)
FRAME_BEGIN

/* Save callee-saved GPRs as mandated by the x86_64 ABI */
push %r15
push %r14
push %r13
push %r12

/* Mangle function call ABI into TDCALL ABI: */
/* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
xor %eax, %eax

/* Copy hypercall registers from arg struct: */
movq TDX_HYPERCALL_r10(%rdi), %r10
movq TDX_HYPERCALL_r11(%rdi), %r11
movq TDX_HYPERCALL_r12(%rdi), %r12
movq TDX_HYPERCALL_r13(%rdi), %r13
movq TDX_HYPERCALL_r14(%rdi), %r14
movq TDX_HYPERCALL_r15(%rdi), %r15

movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx

/*
* For the idle loop STI needs to be called directly before
* the TDCALL that enters idle (EXIT_REASON_HLT case). STI
* instruction enables interrupts only one instruction later.
* If there is a window between STI and the instruction that
* emulates the HALT state, there is a chance for interrupts to
* happen in this window, which can delay the HLT operation
* indefinitely. Since this is the not the desired result,
* conditionally call STI before TDCALL.
*/
testq $TDX_HCALL_ISSUE_STI, %rsi
jz .Lskip_sti
sti
.Lskip_sti:
tdcall

/*
* TDVMCALL leaf does not suppose to fail. If it fails something
* is horribly wrong with TDX module. Stop the world.
*/
test %rax, %rax
je .Lsuccess
ud2
.Lsuccess:
/* TDVMCALL leaf return code is in R10 */
movq %r10, %rax

/* Copy hypercall result registers to arg struct if needed */
testq $TDX_HCALL_HAS_OUTPUT, %rsi
jz .Lout

movq %r10, TDX_HYPERCALL_r10(%rdi)
movq %r11, TDX_HYPERCALL_r11(%rdi)
movq %r12, TDX_HYPERCALL_r12(%rdi)
movq %r13, TDX_HYPERCALL_r13(%rdi)
movq %r14, TDX_HYPERCALL_r14(%rdi)
movq %r15, TDX_HYPERCALL_r15(%rdi)
.Lout:
/*
* Zero out registers exposed to the VMM to avoid
* speculative execution with VMM-controlled values.
* This needs to include all registers present in
* TDVMCALL_EXPOSE_REGS_MASK (except R12-R15).
* R12-R15 context will be restored.
*/
xor %r10d, %r10d
xor %r11d, %r11d

/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
pop %r12
pop %r13
pop %r14
pop %r15

FRAME_END

retq
SYM_FUNC_END(__tdx_hypercall)
--
Kirill A. Shutemov

2022-02-09 19:21:07

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Wed, Feb 09, 2022, Kirill A. Shutemov wrote:
> On Mon, Feb 07, 2022 at 10:52:19PM +0000, Sean Christopherson wrote:
> .Lskip_sti:
> tdcall
>
> /*
> * TDVMCALL leaf does not suppose to fail. If it fails something
> * is horribly wrong with TDX module. Stop the world.
> */
> test %rax, %rax
> je .Lsuccess
> ud2

If the ud2 or call to an external "do panic" helper is out-of-line, then the happy
path avoids a taken branch. Not a big deal, but it's also trivial to do.

> .Lsuccess:
> /* TDVMCALL leaf return code is in R10 */
> movq %r10, %rax
>
> /* Copy hypercall result registers to arg struct if needed */
> testq $TDX_HCALL_HAS_OUTPUT, %rsi
> jz .Lout
>
> movq %r10, TDX_HYPERCALL_r10(%rdi)
> movq %r11, TDX_HYPERCALL_r11(%rdi)
> movq %r12, TDX_HYPERCALL_r12(%rdi)
> movq %r13, TDX_HYPERCALL_r13(%rdi)
> movq %r14, TDX_HYPERCALL_r14(%rdi)
> movq %r15, TDX_HYPERCALL_r15(%rdi)
> .Lout:
> /*
> * Zero out registers exposed to the VMM to avoid
> * speculative execution with VMM-controlled values.
> * This needs to include all registers present in
> * TDVMCALL_EXPOSE_REGS_MASK (except R12-R15).
> * R12-R15 context will be restored.

This comment block should use the "full" 80 chars.

> */
> xor %r10d, %r10d
> xor %r11d, %r11d
>
> /* Restore callee-saved GPRs as mandated by the x86_64 ABI */
> pop %r12
> pop %r13
> pop %r14
> pop %r15
>
> FRAME_END
>
> retq
> SYM_FUNC_END(__tdx_hypercall)
> --
> Kirill A. Shutemov

2022-02-09 23:57:10

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Wed, Feb 09, 2022 at 06:05:52PM +0000, Sean Christopherson wrote:
> On Wed, Feb 09, 2022, Kirill A. Shutemov wrote:
> > On Mon, Feb 07, 2022 at 10:52:19PM +0000, Sean Christopherson wrote:
> > .Lskip_sti:
> > tdcall
> >
> > /*
> > * TDVMCALL leaf does not suppose to fail. If it fails something
> > * is horribly wrong with TDX module. Stop the world.
> > */
> > test %rax, %rax
> > je .Lsuccess
> > ud2
>
> If the ud2 or call to an external "do panic" helper is out-of-line, then the happy
> path avoids a taken branch. Not a big deal, but it's also trivial to do.

Something like this?

I assume FRAME_END is irrelevent after UD2.

SYM_FUNC_START(__tdx_hypercall)
FRAME_BEGIN

/* Save callee-saved GPRs as mandated by the x86_64 ABI */
push %r15
push %r14
push %r13
push %r12

/* Mangle function call ABI into TDCALL ABI: */
/* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
xor %eax, %eax

/* Copy hypercall registers from arg struct: */
movq TDX_HYPERCALL_r10(%rdi), %r10
movq TDX_HYPERCALL_r11(%rdi), %r11
movq TDX_HYPERCALL_r12(%rdi), %r12
movq TDX_HYPERCALL_r13(%rdi), %r13
movq TDX_HYPERCALL_r14(%rdi), %r14
movq TDX_HYPERCALL_r15(%rdi), %r15

movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx

/*
* For the idle loop STI needs to be called directly before the TDCALL
* that enters idle (EXIT_REASON_HLT case). STI instruction enables
* interrupts only one instruction later. If there is a window between
* STI and the instruction that emulates the HALT state, there is a
* chance for interrupts to happen in this window, which can delay the
* HLT operation indefinitely. Since this is the not the desired
* result, conditionally call STI before TDCALL.
*/
testq $TDX_HCALL_ISSUE_STI, %rsi
jz .Lskip_sti
sti
.Lskip_sti:
tdcall

/*
* TDVMCALL leaf does not suppose to fail. If it fails something
* is horribly wrong with TDX module. Stop the world.
*/
testq %rax, %rax
jne .Lpanic

/* TDVMCALL leaf return code is in R10 */
movq %r10, %rax

/* Copy hypercall result registers to arg struct if needed */
testq $TDX_HCALL_HAS_OUTPUT, %rsi
jz .Lout

movq %r10, TDX_HYPERCALL_r10(%rdi)
movq %r11, TDX_HYPERCALL_r11(%rdi)
movq %r12, TDX_HYPERCALL_r12(%rdi)
movq %r13, TDX_HYPERCALL_r13(%rdi)
movq %r14, TDX_HYPERCALL_r14(%rdi)
movq %r15, TDX_HYPERCALL_r15(%rdi)
.Lout:
/*
* Zero out registers exposed to the VMM to avoid speculative execution
* with VMM-controlled values. This needs to include all registers
* present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
* context will be restored.
*/
xor %r10d, %r10d
xor %r11d, %r11d

/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
pop %r12
pop %r13
pop %r14
pop %r15

FRAME_END

retq
.Lpanic:
ud2
SYM_FUNC_END(__tdx_hypercall)
--
Kirill A. Shutemov

2022-02-10 01:21:55

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCHv2.1 05/29] x86/tdx: Add HLT support for TDX guests

On Thu, Feb 10, 2022, Kirill A. Shutemov wrote:
> On Wed, Feb 09, 2022 at 06:05:52PM +0000, Sean Christopherson wrote:
> > On Wed, Feb 09, 2022, Kirill A. Shutemov wrote:
> > > On Mon, Feb 07, 2022 at 10:52:19PM +0000, Sean Christopherson wrote:
> > > .Lskip_sti:
> > > tdcall
> > >
> > > /*
> > > * TDVMCALL leaf does not suppose to fail. If it fails something
> > > * is horribly wrong with TDX module. Stop the world.
> > > */
> > > test %rax, %rax
> > > je .Lsuccess
> > > ud2
> >
> > If the ud2 or call to an external "do panic" helper is out-of-line, then the happy
> > path avoids a taken branch. Not a big deal, but it's also trivial to do.
>
> Something like this?

Yep.

> I assume FRAME_END is irrelevent after UD2.

Not irrelevant, but we don't want to do FRAME_END in this case. Keeping the current
frame pointer (setup by FRAME_BEGIN, torn down by FRAME_END) will let the unwinder
do its thing when its using frame pointers.