Subject: [RFC v1 02/26] x86/cpufeatures: Add TDX Guest CPU feature

Add CPU feature detection for Trusted Domain Extensions support. The
TDX feature adds capabilities to keep guest register state and memory
isolated from the hypervisor.

On TDX guest platforms, executing CPUID(0x21, 0) returns the
following values in EAX, EBX, ECX and EDX:

EAX: Maximum sub-leaf number: 0
EBX/EDX/ECX: Vendor string:

EBX = "Inte"
EDX = "lTDX"
ECX = "    "

When the above condition is true, set the X86_FEATURE_TDX_GUEST
feature cap bit.
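
For illustration only (not part of this patch): once the cap bit is
forced on, later code can gate TDX-specific paths through the regular
cpufeature API:

        if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
                pr_info("Booting as a TDX guest\n");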

Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/tdx.h | 18 +++++++++++++++++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/head64.c | 3 +++
arch/x86/kernel/tdx.c | 31 ++++++++++++++++++++++++++++++
5 files changed, 54 insertions(+)
create mode 100644 arch/x86/include/asm/tdx.h
create mode 100644 arch/x86/kernel/tdx.c

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 84b887825f12..989e2b302880 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -238,6 +238,7 @@
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Trusted Domain Extensions Guest */

/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
new file mode 100644
index 000000000000..2cc246c0cecf
--- /dev/null
+++ b/arch/x86/include/asm/tdx.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 Intel Corporation */
+#ifndef _ASM_X86_TDX_H
+#define _ASM_X86_TDX_H
+
+#define TDX_CPUID_LEAF_ID 0x21
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+void __init tdx_early_init(void);
+
+#else // !CONFIG_INTEL_TDX_GUEST
+
+static inline void tdx_early_init(void) { };
+
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5eeb808eb024..ba8ee9300f23 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o

obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
+obj-$(CONFIG_INTEL_TDX_GUEST) += tdx.o

obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5e9beb77cafd..75f2401cb5db 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -40,6 +40,7 @@
#include <asm/extable.h>
#include <asm/trapnr.h>
#include <asm/sev-es.h>
+#include <asm/tdx.h>

/*
* Manage page tables very early on.
@@ -491,6 +492,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)

kasan_early_init();

+ tdx_early_init();
+
idt_setup_early_handler();

copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
new file mode 100644
index 000000000000..473b4c1c0920
--- /dev/null
+++ b/arch/x86/kernel/tdx.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright (C) 2020 Intel Corporation */
+
+#include <asm/tdx.h>
+#include <asm/cpufeature.h>
+
+static inline bool cpuid_has_tdx_guest(void)
+{
+ u32 eax, signature[3];
+
+ if (cpuid_eax(0) < TDX_CPUID_LEAF_ID)
+ return false;
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &signature[0],
+ &signature[1], &signature[2]);
+
+ if (memcmp("IntelTDX ", signature, 12))
+ return false;
+
+ return true;
+}
+
+void __init tdx_early_init(void)
+{
+ if (!cpuid_has_tdx_guest())
+ return;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+
+ pr_info("TDX guest is initialized\n");
+}
--
2.25.1

Subject: [RFC v1 03/26] x86/cpufeatures: Add is_tdx_guest() interface

Add a helper function to detect TDX feature support. It will be used
to guard TDX-specific code.
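
For example, a caller would guard TDX-specific paths like this
(illustrative sketch; tdx_setup_shared_memory() is a hypothetical
callee, not part of this series):

        if (is_tdx_guest())
                tdx_setup_shared_memory();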

Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/boot/compressed/tdx.c | 32 +++++++++++++++++++++++++++++++
arch/x86/include/asm/tdx.h | 8 ++++++++
arch/x86/kernel/tdx.c | 6 ++++++
4 files changed, 47 insertions(+)
create mode 100644 arch/x86/boot/compressed/tdx.c

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e0bc3988c3fa..a2554621cefe 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -96,6 +96,7 @@ ifdef CONFIG_X86_64
endif

vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o

vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
new file mode 100644
index 000000000000..0a87c1775b67
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tdx.c - Early boot code for TDX
+ */
+
+#include <asm/tdx.h>
+
+static int __ro_after_init tdx_guest = -1;
+
+static inline bool native_cpuid_has_tdx_guest(void)
+{
+ u32 eax = TDX_CPUID_LEAF_ID, signature[3] = {0};
+
+ if (native_cpuid_eax(0) < TDX_CPUID_LEAF_ID)
+ return false;
+
+ native_cpuid(&eax, &signature[0], &signature[1], &signature[2]);
+
+ if (memcmp("IntelTDX ", signature, 12))
+ return false;
+
+ return true;
+}
+
+bool is_tdx_guest(void)
+{
+ if (tdx_guest < 0)
+ tdx_guest = native_cpuid_has_tdx_guest();
+
+ return !!tdx_guest;
+}
+
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 2cc246c0cecf..0b9d571b1f95 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -7,10 +7,18 @@

#ifdef CONFIG_INTEL_TDX_GUEST

+/* Common API to check TDX support in decompression and common kernel code. */
+bool is_tdx_guest(void);
+
void __init tdx_early_init(void);

#else // !CONFIG_INTEL_TDX_GUEST

+static inline bool is_tdx_guest(void)
+{
+ return false;
+}
+
static inline void tdx_early_init(void) { };

#endif /* CONFIG_INTEL_TDX_GUEST */
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 473b4c1c0920..e44e55d1e519 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -20,6 +20,12 @@ static inline bool cpuid_has_tdx_guest(void)
return true;
}

+bool is_tdx_guest(void)
+{
+ return static_cpu_has(X86_FEATURE_TDX_GUEST);
+}
+EXPORT_SYMBOL_GPL(is_tdx_guest);
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
--
2.25.1

Subject: [RFC v1 04/26] x86/tdx: Get TD execution environment information via TDINFO

From: "Kirill A. Shutemov" <[email protected]>

Per the Guest-Host-Communication Interface (GHCI) for Intel Trust
Domain Extensions (Intel TDX) specification, sec 2.4.2,
TDCALL[TDINFO] provides basic TD execution environment information not
provided by CPUID.

Call TDINFO during early boot so the information can be used in
subsequent system initialization.

The call reports which bit in the pfn is used to indicate that the
page is shared with the host, and the attributes of the TD, such as
debug mode.

We don't save information about the number of CPUs, as there are no
users so far.
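
As a sketch of the intended use of the cached gpa_width (the helper
below is hypothetical, not part of this patch, and assumes the shared
bit is the top bit of the guest physical address space):

        static unsigned long tdx_shared_mask(void)
        {
                /* Assumption: the shared bit is bit (gpa_width - 1) of the GPA */
                return 1UL << (td_info.gpa_width - 1);
        }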

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/tdx.h | 9 +++++++++
arch/x86/kernel/tdx.c | 27 +++++++++++++++++++++++++++
2 files changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 0b9d571b1f95..f8cdc8eb1046 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -7,6 +7,15 @@

#ifdef CONFIG_INTEL_TDX_GUEST

+/*
+ * TDCALL instruction is newly added in TDX architecture,
+ * used by TD for requesting the host VMM to provide
+ * (untrusted) services.
+ */
+#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"
+
+#define TDINFO 1
+
/* Common API to check TDX support in decompression and common kernel code. */
bool is_tdx_guest(void);

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index e44e55d1e519..13303bfdfdd1 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -3,6 +3,14 @@

#include <asm/tdx.h>
#include <asm/cpufeature.h>
+#include <linux/cpu.h>
+#include <asm/tdx.h>
+#include <asm/vmx.h>
+
+static struct {
+ unsigned int gpa_width;
+ unsigned long attributes;
+} td_info __ro_after_init;

static inline bool cpuid_has_tdx_guest(void)
{
@@ -26,6 +34,23 @@ bool is_tdx_guest(void)
}
EXPORT_SYMBOL_GPL(is_tdx_guest);

+static void tdx_get_info(void)
+{
+ register long rcx asm("rcx");
+ register long rdx asm("rdx");
+ register long r8 asm("r8");
+ long ret;
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=c"(rcx), "=r"(rdx), "=r"(r8)
+ : "a"(TDINFO)
+ : "r9", "r10", "r11", "memory");
+ BUG_ON(ret);
+
+ td_info.gpa_width = rcx & GENMASK(5, 0);
+ td_info.attributes = rdx;
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -33,5 +58,7 @@ void __init tdx_early_init(void)

setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

+ tdx_get_info();
+
pr_info("TDX guest is initialized\n");
}
--
2.25.1

Subject: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

From: "Kirill A. Shutemov" <[email protected]>

The TDX module injects a #VE exception into the guest TD in cases of
disallowed instructions, disallowed MSR accesses and a subset of CPUID
leaves. Also, it's theoretically possible for the CPU to inject #VE
on an EPT violation, but the TDX module makes sure this does not
happen as long as all memory used is properly accepted using
TDCALLs. You can find more details in the Guest-Host-Communication
Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX)
specification, sec 2.3.

Add basic infrastructure to handle #VE. If there is no handler for a
given #VE, it is an unexpected event (fault case), so treat it as a
general protection fault and handle it via do_general_protection().

TDCALL[TDGETVEINFO] provides information about #VE such as exit reason.

More details on cases where #VE exceptions are allowed/not-allowed:

The #VE exception does not occur in the paranoid entry paths, like NMIs.
While other operations during an NMI might cause #VE, these are in the
NMI code that can handle nesting, so there is no concern about
reentrancy. This is similar to how #PF is handled in NMIs.

The #VE exception also cannot happen in entry/exit code with the
wrong gs, such as the SWAPGS code, so its entry point does not
need "paranoid" handling.

Any memory access can cause #VE if it causes an EPT
violation.  However, the VMM is only in direct control of some of the
EPT tables.  The Secure EPT tables are controlled by the TDX module,
which guarantees no EPT violations will result in #VE for the guest
once the memory has been accepted.
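
The resulting control flow in the exception handler reduces to the
following shape (a simplified sketch of the code added below):

        struct ve_info ve;

        if (!tdx_get_ve_info(&ve) &&
            !tdx_handle_virtualization_exception(regs, &ve))
                return;                         /* handled, IP advanced */
        do_general_protection(regs, 0);         /* unhandled: treat as #GP(0) */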

Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/idtentry.h | 4 ++
arch/x86/include/asm/tdx.h | 14 +++++++
arch/x86/kernel/idt.c | 6 +++
arch/x86/kernel/tdx.c | 31 ++++++++++++++
arch/x86/kernel/traps.c | 73 ++++++++++++++++++++++-----------
5 files changed, 105 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 247a60a47331..a2cbb68f9ae8 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -615,6 +615,10 @@ DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
#endif

+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception);
+#endif
+
/* Device interrupts common/spurious */
DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index f8cdc8eb1046..90eb61b07d1f 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -15,6 +15,7 @@
#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"

#define TDINFO 1
+#define TDGETVEINFO 3

/* Common API to check TDX support in decompression and common kernel code. */
bool is_tdx_guest(void);
@@ -32,4 +33,17 @@ static inline void tdx_early_init(void) { };

#endif /* CONFIG_INTEL_TDX_GUEST */

+struct ve_info {
+ unsigned int exit_reason;
+ unsigned long exit_qual;
+ unsigned long gla;
+ unsigned long gpa;
+ unsigned int instr_len;
+ unsigned int instr_info;
+};
+
+unsigned long tdx_get_ve_info(struct ve_info *ve);
+int tdx_handle_virtualization_exception(struct pt_regs *regs,
+ struct ve_info *ve);
+
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ee1a283f8e96..546b6b636c7d 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -64,6 +64,9 @@ static const __initconst struct idt_data early_idts[] = {
*/
INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
};

/*
@@ -87,6 +90,9 @@ static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
INTG(X86_TRAP_AC, asm_exc_alignment_check),
INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif

#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 13303bfdfdd1..ae2d5c847700 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -62,3 +62,34 @@ void __init tdx_early_init(void)

pr_info("TDX guest is initialized\n");
}
+
+unsigned long tdx_get_ve_info(struct ve_info *ve)
+{
+ register long r8 asm("r8");
+ register long r9 asm("r9");
+ register long r10 asm("r10");
+ unsigned long ret;
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=c"(ve->exit_reason), "=d"(ve->exit_qual),
+ "=r"(r8), "=r"(r9), "=r"(r10)
+ : "a"(TDGETVEINFO)
+ :);
+
+ ve->gla = r8;
+ ve->gpa = r9;
+ ve->instr_len = r10 & UINT_MAX;
+ ve->instr_info = r10 >> 32;
+ return ret;
+}
+
+int tdx_handle_virtualization_exception(struct pt_regs *regs,
+ struct ve_info *ve)
+{
+ /*
+ * TODO: Add handler support for various #VE exit
+ * reasons
+ */
+ pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
+ return -EFAULT;
+}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7f5aec758f0e..ba98253b47cd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -61,6 +61,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/tdx.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -527,30 +528,14 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,

#define GPFSTR "general protection fault"

-DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
+static void do_general_protection(struct pt_regs *regs, long error_code)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
- struct task_struct *tsk;
+ struct task_struct *tsk = current;
unsigned long gp_addr;
int ret;

- cond_local_irq_enable(regs);
-
- if (static_cpu_has(X86_FEATURE_UMIP)) {
- if (user_mode(regs) && fixup_umip_exception(regs))
- goto exit;
- }
-
- if (v8086_mode(regs)) {
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- local_irq_disable();
- return;
- }
-
- tsk = current;
-
if (user_mode(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -560,11 +545,11 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)

show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
- goto exit;
+ return;
}

if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
+ return;

tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -576,11 +561,11 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
if (!preemptible() &&
kprobe_running() &&
kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
+ return;

ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
if (ret == NOTIFY_STOP)
- goto exit;
+ return;

if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
@@ -601,8 +586,27 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
gp_addr = 0;

die_addr(desc, regs, error_code, gp_addr);
+}

-exit:
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
+{
+ cond_local_irq_enable(regs);
+
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs)) {
+ cond_local_irq_disable(regs);
+ return;
+ }
+ }
+
+ if (v8086_mode(regs)) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ local_irq_disable();
+ return;
+ }
+
+ do_general_protection(regs, error_code);
cond_local_irq_disable(regs);
}

@@ -1138,6 +1142,29 @@ DEFINE_IDTENTRY(exc_device_not_available)
}
}

+#ifdef CONFIG_INTEL_TDX_GUEST
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+ struct ve_info ve;
+ int ret;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ /* Consume #VE info before re-enabling interrupts */
+ ret = tdx_get_ve_info(&ve);
+ cond_local_irq_enable(regs);
+ if (!ret)
+ ret = tdx_handle_virtualization_exception(regs, &ve);
+ /*
+ * If #VE exception handler could not handle it successfully, treat
+ * it as #GP(0) and handle it.
+ */
+ if (ret)
+ do_general_protection(regs, 0);
+ cond_local_irq_disable(regs);
+}
+#endif
+
#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
--
2.25.1

Subject: [RFC v1 01/26] x86/paravirt: Introduce CONFIG_PARAVIRT_XL

From: "Kirill A. Shutemov" <[email protected]>

Split off the halt paravirt calls from CONFIG_PARAVIRT_XXL into
a separate config option. It provides a middle ground for
not-so-deeply paravirtualized environments.

CONFIG_PARAVIRT_XL will be used by TDX, which needs a couple of
paravirt calls that were hidden under CONFIG_PARAVIRT_XXL, while the
rest of that config would be bloat for TDX.
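
For example, a TDX guest config would then only need to select the
lighter option (hypothetical Kconfig fragment; the actual
INTEL_TDX_GUEST entry is introduced later in the series):

        config INTEL_TDX_GUEST
                bool "Intel Trusted Domain Extensions Guest Support"
                select PARAVIRT_XL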

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/Kconfig | 4 +++
arch/x86/boot/compressed/misc.h | 1 +
arch/x86/include/asm/irqflags.h | 42 +++++++++++++++------------
arch/x86/include/asm/paravirt.h | 22 +++++++-------
arch/x86/include/asm/paravirt_types.h | 3 +-
arch/x86/kernel/paravirt.c | 4 ++-
arch/x86/mm/mem_encrypt_identity.c | 1 +
7 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b6dd10b162a..8fe91114bfee 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -776,8 +776,12 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.

+config PARAVIRT_XL
+ bool
+
config PARAVIRT_XXL
bool
+ select PARAVIRT_XL

config PARAVIRT_DEBUG
bool "paravirt-ops debugging"
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 901ea5ebec22..4b84abe43765 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
* paravirt and debugging variants are added.)
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XL
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2dfc8d380dab..299c9b1ed857 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -68,11 +68,33 @@ static inline __cpuidle void native_halt(void)

#endif

-#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_PARAVIRT_XL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline __cpuidle void arch_safe_halt(void)
+{
+ native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline __cpuidle void halt(void)
+{
+ native_halt();
+}
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT_XL */
+
+#ifndef CONFIG_PARAVIRT_XXL
+#ifndef __ASSEMBLY__

static __always_inline unsigned long arch_local_save_flags(void)
{
@@ -94,24 +116,6 @@ static __always_inline void arch_local_irq_enable(void)
native_irq_enable();
}

-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static inline __cpuidle void arch_safe_halt(void)
-{
- native_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static inline __cpuidle void halt(void)
-{
- native_halt();
-}
-
/*
* For spinlocks, etc:
*/
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f8dce11d2bc1..700b94abfd1b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -84,6 +84,18 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
PVOP_VCALL1(mmu.exit_mmap, mm);
}

+#ifdef CONFIG_PARAVIRT_XL
+static inline void arch_safe_halt(void)
+{
+ PVOP_VCALL0(irq.safe_halt);
+}
+
+static inline void halt(void)
+{
+ PVOP_VCALL0(irq.halt);
+}
+#endif
+
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
@@ -145,16 +157,6 @@ static inline void __write_cr4(unsigned long x)
PVOP_VCALL1(cpu.write_cr4, x);
}

-static inline void arch_safe_halt(void)
-{
- PVOP_VCALL0(irq.safe_halt);
-}
-
-static inline void halt(void)
-{
- PVOP_VCALL0(irq.halt);
-}
-
static inline void wbinvd(void)
{
PVOP_VCALL0(cpu.wbinvd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b6b02b7c19cc..634482a0a60d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -190,7 +190,8 @@ struct pv_irq_ops {
struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
-
+#endif
+#ifdef CONFIG_PARAVIRT_XL
void (*safe_halt)(void);
void (*halt)(void);
#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee9..85714a6389d6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -327,9 +327,11 @@ struct paravirt_patch_template pv_ops = {
.irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+#ifdef CONFIG_PARAVIRT_XL
.irq.safe_halt = native_safe_halt,
.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
+#endif /* CONFIG_PARAVIRT_XL */

/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb_local,
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..20d0cb116557 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -24,6 +24,7 @@
* be extended when new paravirt and debugging variants are added.)
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XL
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS

--
2.25.1

Subject: [RFC v1 17/26] x86/boot: Avoid unnecessary #VE during boot process

From: Sean Christopherson <[email protected]>

Skip writing EFER during secondary_startup_64() if the current value is
also the desired value. This avoids a #VE when running as a TDX guest,
as the TDX-Module does not allow writes to EFER (even when writing the
current, fixed value).

Also, preserve CR4.MCE instead of clearing it during boot to avoid a #VE
when running as a TDX guest. The TDX-Module (effectively part of the
hypervisor) requires CR4.MCE to be set at all times and injects a #VE
if the guest attempts to clear CR4.MCE.
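
In C terms, the EFER change amounts to the following (sketch only; the
real change below is in assembly, and nx_supported here stands in for
the NX CPUID check done in the asm):

        u64 efer, want;

        rdmsrl(MSR_EFER, efer);
        want = efer | EFER_SCE;
        if (nx_supported)
                want |= EFER_NX;
        if (want != efer)       /* skip the WRMSR, and the #VE, if unchanged */
                wrmsrl(MSR_EFER, want);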

Signed-off-by: Sean Christopherson <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/boot/compressed/head_64.S | 5 ++++-
arch/x86/kernel/head_64.S | 13 +++++++++++--
2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 37c2f37d4a0d..2d79e5f97360 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -622,7 +622,10 @@ SYM_CODE_START(trampoline_32bit_src)
popl %ecx

/* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
+ movl %cr4, %eax
+ /* Clearing CR4.MCE will #VE on TDX guests. Leave it alone. */
+ andl $X86_CR4_MCE, %eax
+ orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04bddaaba8e2..92c77cf75542 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -141,7 +141,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
1:

/* Enable PAE mode, PGE and LA57 */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %cr4, %rcx
+ /* Clearing CR4.MCE will #VE on TDX guests. Leave it alone. */
+ andl $X86_CR4_MCE, %ecx
+ orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
@@ -229,13 +232,19 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
+ movl %eax, %edx
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */

+ /* Skip the WRMSR if the current value matches the desired value. */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
--
2.25.1

Subject: [RFC v1 06/26] x86/tdx: Add HLT support for TDX guest

From: "Kirill A. Shutemov" <[email protected]>

Per the Guest-Host-Communication Interface (GHCI) for Intel Trust
Domain Extensions (Intel TDX) specification, sec 3.8,
TDVMCALL[Instruction.HLT] provides the HLT operation. Use it to
implement halt() and safe_halt() paravirtualization calls.

The same TDVMCALL is used to handle the #VE exception due to
EXIT_REASON_HLT.
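
All TDVMCALL wrappers in this series follow one register convention,
which the HLT wrapper below instantiates (summary sketch, per the GHCI
spec as used in this series):

        /*
         * RAX = TDVMCALL (0)                  - TDCALL leaf number
         * RCX = bitmap of exposed GPRs        - BIT(n) passes Rn to the VMM
         * R10 = TDVMCALL_STANDARD or _VENDOR  - also carries the return status
         * R11 = function (VMX exit reason or hypercall nr)
         * R12-R15 = arguments/results
         */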

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/tdx.h | 5 ++++
arch/x86/kernel/tdx.c | 61 ++++++++++++++++++++++++++++++++++----
2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 90eb61b07d1f..b98de067257b 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -14,9 +14,14 @@
*/
#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"

+#define TDVMCALL 0
#define TDINFO 1
#define TDGETVEINFO 3

+/* TDVMCALL R10 Input */
+#define TDVMCALL_STANDARD 0
+#define TDVMCALL_VENDOR 1
+
/* Common API to check TDX support in decompression and common kernel code. */
bool is_tdx_guest(void);

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index ae2d5c847700..25dd33bc2e49 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -51,6 +51,45 @@ static void tdx_get_info(void)
td_info.attributes = rdx;
}

+static __cpuidle void tdx_halt(void)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_HLT;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10 and R11 down to the VMM */
+ rcx = BIT(10) | BIT(11);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11)
+ : );
+
+ /* It should never fail */
+ BUG_ON(ret || r10);
+}
+
+static __cpuidle void tdx_safe_halt(void)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_HLT;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10 and R11 down to the VMM */
+ rcx = BIT(10) | BIT(11);
+
+ /* Enable interrupts next to the TDVMCALL to avoid performance degradation */
+ asm volatile("sti\n\t" TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11)
+ : );
+
+ /* It should never fail */
+ BUG_ON(ret || r10);
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -60,6 +99,9 @@ void __init tdx_early_init(void)

tdx_get_info();

+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
pr_info("TDX guest is initialized\n");
}

@@ -86,10 +128,17 @@ unsigned long tdx_get_ve_info(struct ve_info *ve)
int tdx_handle_virtualization_exception(struct pt_regs *regs,
struct ve_info *ve)
{
- /*
- * TODO: Add handler support for various #VE exit
- * reasons
- */
- pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
- return -EFAULT;
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ tdx_halt();
+ break;
+ default:
+ pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
+ return -EFAULT;
+ }
+
+ /* After successful #VE handling, move the IP */
+ regs->ip += ve->instr_len;
+
+ return 0;
}
--
2.25.1

Subject: [RFC v1 07/26] x86/tdx: Wire up KVM hypercalls

From: "Kirill A. Shutemov" <[email protected]>

KVM hypercalls have to be wrapped into vendor-specific TDVMCALLs.
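
With this in place, existing callers work unchanged in a TDX guest;
for example (sketch, using the existing KVM_HC_KICK_CPU hypercall):

        /* Transparently routed through tdx_kvm_hypercall2() under TDX */
        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);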

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/kvm_para.h | 21 ++++++
arch/x86/include/asm/tdx.h | 8 +++
arch/x86/kernel/tdx-kvm.c | 116 ++++++++++++++++++++++++++++++++
arch/x86/kernel/tdx.c | 4 ++
4 files changed, 149 insertions(+)
create mode 100644 arch/x86/kernel/tdx-kvm.c

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 338119852512..2fa85481520b 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -6,6 +6,7 @@
#include <asm/alternative.h>
#include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h>
+#include <asm/tdx.h>

extern void kvmclock_init(void);

@@ -34,6 +35,10 @@ static inline bool kvm_check_and_clear_guest_paused(void)
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
+
+ if (is_tdx_guest())
+ return tdx_kvm_hypercall0(nr);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr)
@@ -44,6 +49,10 @@ static inline long kvm_hypercall0(unsigned int nr)
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
+
+ if (is_tdx_guest())
+ return tdx_kvm_hypercall1(nr, p1);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1)
@@ -55,6 +64,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
+
+ if (is_tdx_guest())
+ return tdx_kvm_hypercall2(nr, p1, p2);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2)
@@ -66,6 +79,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
+
+ if (is_tdx_guest())
+ return tdx_kvm_hypercall3(nr, p1, p2, p3);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
@@ -78,6 +95,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p4)
{
long ret;
+
+ if (is_tdx_guest())
+ return tdx_kvm_hypercall4(nr, p1, p2, p3, p4);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index b98de067257b..8c3e5af88643 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -51,4 +51,12 @@ unsigned long tdx_get_ve_info(struct ve_info *ve);
int tdx_handle_virtualization_exception(struct pt_regs *regs,
struct ve_info *ve);

+long tdx_kvm_hypercall0(unsigned int nr);
+long tdx_kvm_hypercall1(unsigned int nr, unsigned long p1);
+long tdx_kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2);
+long tdx_kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3);
+long tdx_kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4);
+
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/kernel/tdx-kvm.c b/arch/x86/kernel/tdx-kvm.c
new file mode 100644
index 000000000000..323d43fcb338
--- /dev/null
+++ b/arch/x86/kernel/tdx-kvm.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+long tdx_kvm_hypercall0(unsigned int nr)
+{
+ register long r10 asm("r10") = TDVMCALL_VENDOR;
+ register long r11 asm("r11") = nr;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10 and R11 down to the VMM */
+ rcx = BIT(10) | BIT(11);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11)
+ : "memory");
+
+ BUG_ON(ret);
+ return r10;
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall0);
+
+long tdx_kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ register long r10 asm("r10") = TDVMCALL_VENDOR;
+ register long r11 asm("r11") = nr;
+ register long r12 asm("r12") = p1;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11 and R12 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12)
+ : "memory");
+
+ BUG_ON(ret);
+ return r10;
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall1);
+
+long tdx_kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2)
+{
+ register long r10 asm("r10") = TDVMCALL_VENDOR;
+ register long r11 asm("r11") = nr;
+ register long r12 asm("r12") = p1;
+ register long r13 asm("r13") = p2;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12 and R13 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13)
+ : "memory");
+
+ BUG_ON(ret);
+ return r10;
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall2);
+
+long tdx_kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3)
+{
+ register long r10 asm("r10") = TDVMCALL_VENDOR;
+ register long r11 asm("r11") = nr;
+ register long r12 asm("r12") = p1;
+ register long r13 asm("r13") = p2;
+ register long r14 asm("r14") = p3;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12, R13 and R14 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13), "r"(r14)
+ : "memory");
+
+ BUG_ON(ret);
+ return r10;
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall3);
+
+long tdx_kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4)
+{
+ register long r10 asm("r10") = TDVMCALL_VENDOR;
+ register long r11 asm("r11") = nr;
+ register long r12 asm("r12") = p1;
+ register long r13 asm("r13") = p2;
+ register long r14 asm("r14") = p3;
+ register long r15 asm("r15") = p4;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12, R13, R14 and R15 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14) | BIT(15);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13), "r"(r14), "r"(r15)
+ : "memory");
+
+ BUG_ON(ret);
+ return r10;
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall4);
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 25dd33bc2e49..bbefe639a2ed 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -7,6 +7,10 @@
#include <asm/tdx.h>
#include <asm/vmx.h>

+#ifdef CONFIG_KVM_GUEST
+#include "tdx-kvm.c"
+#endif
+
static struct {
unsigned int gpa_width;
unsigned long attributes;
--
2.25.1

Subject: [RFC v1 08/26] x86/tdx: Add MSR support for TDX guest

From: "Kirill A. Shutemov" <[email protected]>

Operations on context-switched MSRs can be run natively. The rest of
the MSRs should be handled through TDVMCALLs.

TDVMCALL[Instruction.RDMSR] and TDVMCALL[Instruction.WRMSR] provide
MSR operations.

You can find RDMSR and WRMSR details in Guest-Host-Communication
Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX)
specification, sec 3.10, 3.11.

Also, since the CSTAR MSR is not used for the SYSCALL instruction on
Intel CPUs, ignore accesses to it. Ignoring the MSR keeps callers
compatible: there is no need to wrap them in !is_tdx_guest() checks.
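
The resulting flow for a non-context-switched MSR looks like this
(sketch):

        /*
         * rdmsr(MSR_IA32_MPERF)
         *   -> #VE with exit_reason == EXIT_REASON_MSR_READ
         *   -> tdx_read_msr_safe() issues TDVMCALL[Instruction.RDMSR]
         *   -> result lands in regs->ax/regs->dx, IP is advanced
         */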

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/kernel/tdx.c | 94 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index bbefe639a2ed..5d961263601e 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -94,6 +94,84 @@ static __cpuidle void tdx_safe_halt(void)
BUG_ON(ret || r10);
}

+static bool tdx_is_context_switched_msr(unsigned int msr)
+{
+ /* XXX: Update the list of context-switched MSRs */
+
+ switch (msr) {
+ case MSR_EFER:
+ case MSR_IA32_CR_PAT:
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_STAR:
+ case MSR_LSTAR:
+ case MSR_SYSCALL_MASK:
+ case MSR_IA32_XSS:
+ case MSR_TSC_AUX:
+ case MSR_IA32_BNDCFGS:
+ return true;
+ }
+ return false;
+}
+
+static u64 tdx_read_msr_safe(unsigned int msr, int *err)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_MSR_READ;
+ register long r12 asm("r12") = msr;
+ register long rcx asm("rcx");
+ long ret;
+
+ WARN_ON_ONCE(tdx_is_context_switched_msr(msr));
+
+ if (msr == MSR_CSTAR)
+ return 0;
+
+ /* Allow to pass R10, R11 and R12 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12)
+ : );
+
+ /* XXX: Better error handling needed? */
+ *err = (ret || r10) ? -EIO : 0;
+
+ return r11;
+}
+
+static int tdx_write_msr_safe(unsigned int msr, unsigned int low,
+ unsigned int high)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_MSR_WRITE;
+ register long r12 asm("r12") = msr;
+ register long r13 asm("r13") = (u64)high << 32 | low;
+ register long rcx asm("rcx");
+ long ret;
+
+ WARN_ON_ONCE(tdx_is_context_switched_msr(msr));
+
+ if (msr == MSR_CSTAR)
+ return 0;
+
+ /* Allow to pass R10, R11, R12 and R13 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12), "=r"(r13)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13)
+ : );
+
+ return ret || r10 ? -EIO : 0;
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -132,17 +210,31 @@ unsigned long tdx_get_ve_info(struct ve_info *ve)
int tdx_handle_virtualization_exception(struct pt_regs *regs,
struct ve_info *ve)
{
+ unsigned long val;
+ int ret = 0;
+
switch (ve->exit_reason) {
case EXIT_REASON_HLT:
tdx_halt();
break;
+ case EXIT_REASON_MSR_READ:
+ val = tdx_read_msr_safe(regs->cx, &ret);
+ if (!ret) {
+ regs->ax = val & UINT_MAX;
+ regs->dx = val >> 32;
+ }
+ break;
+ case EXIT_REASON_MSR_WRITE:
+ ret = tdx_write_msr_safe(regs->cx, regs->ax, regs->dx);
+ break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
}

/* After successful #VE handling, move the IP */
- regs->ip += ve->instr_len;
+ if (!ret)
+ regs->ip += ve->instr_len;

- return 0;
+ return ret;
}
--
2.25.1

Subject: [RFC v1 09/26] x86/tdx: Handle CPUID via #VE

From: "Kirill A. Shutemov" <[email protected]>

TDX has three classes of CPUID leaves: some CPUID leaves are always
handled by the CPU, others are handled by the TDX module, and some
others are handled by the VMM. Since the VMM cannot directly intercept
the instruction, those leaves are reflected with a #VE exception to
the guest, which then converts them into a TDCALL to the VMM, or
handles them directly.

The TDX module EAS has a full list of CPUID leaves which are handled
natively or by the TDX module, in sec 16.2. Only unknown CPUID leaves
are handled by the #VE method. In practice this typically only applies
to hypervisor-specific CPUID leaves unknown to the native CPU.

Therefore there is no risk of triggering this in early CPUID code,
which runs before the #VE handler is set up, because it will never
access those exotic CPUID leaves.
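
For instance, probing the hypervisor signature leaf takes the #VE path
(sketch):

        unsigned int eax, ebx, ecx, edx;

        /* 0x40000000 is unknown to the CPU/TDX module -> #VE -> TDVMCALL */
        cpuid_count(0x40000000, 0, &eax, &ebx, &ecx, &edx);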

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/kernel/tdx.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 5d961263601e..e98058c048b5 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -172,6 +172,35 @@ static int tdx_write_msr_safe(unsigned int msr, unsigned int low,
return ret || r10 ? -EIO : 0;
}

+static void tdx_handle_cpuid(struct pt_regs *regs)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_CPUID;
+ register long r12 asm("r12") = regs->ax;
+ register long r13 asm("r13") = regs->cx;
+ register long r14 asm("r14");
+ register long r15 asm("r15");
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12, R13, R14 and R15 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14) | BIT(15);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12), "=r"(r13),
+ "=r"(r14), "=r"(r15)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13)
+ : );
+
+ regs->ax = r12;
+ regs->bx = r13;
+ regs->cx = r14;
+ regs->dx = r15;
+
+ WARN_ON(ret || r10);
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -227,6 +256,9 @@ int tdx_handle_virtualization_exception(struct pt_regs *regs,
case EXIT_REASON_MSR_WRITE:
ret = tdx_write_msr_safe(regs->cx, regs->ax, regs->dx);
break;
+ case EXIT_REASON_CPUID:
+ tdx_handle_cpuid(regs);
+ break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
--
2.25.1

Subject: [RFC v1 10/26] x86/io: Allow to override inX() and outX() implementation

From: "Kirill A. Shutemov" <[email protected]>

Allow overriding the implementation of the port IO helpers. TDX code
will provide an implementation that redirects the helpers to paravirt
calls.
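
A user of the hooks defines __out()/__in() before this header's
BUILDIO() expansions are seen; shape only (my_outb etc. are
hypothetical names, and the real TDX override appears in the next
patch):

        #define __out(bwl, bw) \
                asm volatile("call my_out" #bwl : : "a"(value), "d"(port))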

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/io.h | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d726459d08e5..ef7a686a55a9 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -271,18 +271,26 @@ static inline bool sev_key_active(void) { return false; }

#endif /* CONFIG_AMD_MEM_ENCRYPT */

+#ifndef __out
+#define __out(bwl, bw) \
+ asm volatile("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port))
+#endif
+
+#ifndef __in
+#define __in(bwl, bw) \
+ asm volatile("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port))
+#endif
+
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
- asm volatile("out" #bwl " %" #bw "0, %w1" \
- : : "a"(value), "Nd"(port)); \
+ __out(bwl, bw); \
} \
\
static inline unsigned type in##bwl(int port) \
{ \
unsigned type value; \
- asm volatile("in" #bwl " %w1, %" #bw "0" \
- : "=a"(value) : "Nd"(port)); \
+ __in(bwl, bw); \
return value; \
} \
\
--
2.25.1

Subject: [RFC v1 11/26] x86/tdx: Handle port I/O

From: "Kirill A. Shutemov" <[email protected]>

Unroll string operations and handle port I/O through TDVMCALLs.
Also handle #VE due to I/O operations with the same TDVMCALLs.

Decompression code uses port IO for earlyprintk. We must use
paravirt calls there too if we want to allow earlyprintk.

Decompression code cannot deal with alternatives: use branches
instead to implement the inX() and outX() helpers.
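
As a worked example of the TDVMCALL parameter layout used below
(sketch): an outb(val, 0x3f8) for earlyprintk becomes

        /*
         * R11 = EXIT_REASON_IO_INSTRUCTION (30)
         * R12 = 1      - access size in bytes
         * R13 = 1      - direction: write (0 for read)
         * R14 = 0x3f8  - port number
         * R15 = val    - data to write; reads return the data in R11
         */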

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/boot/compressed/tdx_io.S | 9 ++
arch/x86/include/asm/asm-prototypes.h | 1 +
arch/x86/include/asm/io.h | 5 +-
arch/x86/include/asm/tdx.h | 62 +++++++++--
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/tdx.c | 72 +++++++++++++
arch/x86/kernel/tdx_io.S | 143 ++++++++++++++++++++++++++
8 files changed, 284 insertions(+), 11 deletions(-)
create mode 100644 arch/x86/boot/compressed/tdx_io.S
create mode 100644 arch/x86/kernel/tdx_io.S

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index a2554621cefe..54da333adc4e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -97,6 +97,7 @@ endif

vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx_io.o

vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
diff --git a/arch/x86/boot/compressed/tdx_io.S b/arch/x86/boot/compressed/tdx_io.S
new file mode 100644
index 000000000000..67498f67cb18
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx_io.S
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <asm/export.h>
+
+/* Do not export symbols in decompression code */
+#undef EXPORT_SYMBOL
+#define EXPORT_SYMBOL(sym)
+
+#include "../../kernel/tdx_io.S"
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 51e2bf27cc9b..6bc97aa39a21 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -6,6 +6,7 @@
#include <asm/page.h>
#include <asm/checksum.h>
#include <asm/mce.h>
+#include <asm/tdx.h>

#include <asm-generic/asm-prototypes.h>

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ef7a686a55a9..30a3b30395ad 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -43,6 +43,7 @@
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
+#include <asm/tdx.h>

#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -309,7 +310,7 @@ static inline unsigned type in##bwl##_p(int port) \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (sev_key_active() || is_tdx_guest()) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
@@ -325,7 +326,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (sev_key_active() || is_tdx_guest()) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 8c3e5af88643..b46ae140e39b 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -5,7 +5,16 @@

#define TDX_CPUID_LEAF_ID 0x21

-#ifdef CONFIG_INTEL_TDX_GUEST
+#define TDVMCALL 0
+#define TDINFO 1
+#define TDGETVEINFO 3
+
+/* TDVMCALL R10 Input */
+#define TDVMCALL_STANDARD 0
+#define TDVMCALL_VENDOR 1
+
+#ifndef __ASSEMBLY__
+#include <asm/cpufeature.h>

/*
* TDCALL instruction is newly added in TDX architecture,
@@ -14,19 +23,55 @@
*/
#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"

-#define TDVMCALL 0
-#define TDINFO 1
-#define TDGETVEINFO 3
-
-/* TDVMCALL R10 Input */
-#define TDVMCALL_STANDARD 0
-#define TDVMCALL_VENDOR 1
+#ifdef CONFIG_INTEL_TDX_GUEST

/* Common API to check TDX support in decompression and common kernel code. */
bool is_tdx_guest(void);

void __init tdx_early_init(void);

+/* Decompression code doesn't know how to handle alternatives */
+#ifdef BOOT_COMPRESSED_MISC_H
+#define __out(bwl, bw) \
+do { \
+ if (is_tdx_guest()) { \
+ asm volatile("call tdx_out" #bwl : : \
+ "a"(value), "d"(port)); \
+ } else { \
+ asm volatile("out" #bwl " %" #bw "0, %w1" : : \
+ "a"(value), "Nd"(port)); \
+ } \
+} while (0)
+#define __in(bwl, bw) \
+do { \
+ if (is_tdx_guest()) { \
+ asm volatile("call tdx_in" #bwl : \
+ "=a"(value) : "d"(port)); \
+ } else { \
+ asm volatile("in" #bwl " %w1, %" #bw "0" : \
+ "=a"(value) : "Nd"(port)); \
+ } \
+} while (0)
+#else
+#define __out(bwl, bw) \
+ alternative_input("out" #bwl " %" #bw "1, %w2", \
+ "call tdx_out" #bwl, X86_FEATURE_TDX_GUEST, \
+ "a"(value), "d"(port))
+
+#define __in(bwl, bw) \
+ alternative_io("in" #bwl " %w2, %" #bw "0", \
+ "call tdx_in" #bwl, X86_FEATURE_TDX_GUEST, \
+ "=a"(value), "d"(port))
+#endif
+
+void tdx_outb(unsigned char value, unsigned short port);
+void tdx_outw(unsigned short value, unsigned short port);
+void tdx_outl(unsigned int value, unsigned short port);
+
+unsigned char tdx_inb(unsigned short port);
+unsigned short tdx_inw(unsigned short port);
+unsigned int tdx_inl(unsigned short port);
+
#else // !CONFIG_INTEL_TDX_GUEST

static inline bool is_tdx_guest(void)
@@ -59,4 +104,5 @@ long tdx_kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2,
long tdx_kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2,
unsigned long p3, unsigned long p4);

+#endif
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index ba8ee9300f23..c1ec77df3213 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -128,7 +128,7 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o

obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
-obj-$(CONFIG_INTEL_TDX_GUEST) += tdx.o
+obj-$(CONFIG_INTEL_TDX_GUEST) += tdx.o tdx_io.o

obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index e98058c048b5..3846d2807a7a 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -201,6 +201,75 @@ static void tdx_handle_cpuid(struct pt_regs *regs)
WARN_ON(ret || r10);
}

+static void tdx_out(int size, unsigned int value, int port)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_IO_INSTRUCTION;
+ register long r12 asm("r12") = size;
+ register long r13 asm("r13") = 1;
+ register long r14 asm("r14") = port;
+ register long r15 asm("r15") = value;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12, R13, R14 and R15 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14) | BIT(15);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12), "=r"(r13),
+ "=r"(r14), "=r"(r15)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13), "r"(r14), "r"(r15)
+ : );
+
+ WARN_ON(ret || r10);
+}
+
+static unsigned int tdx_in(int size, int port)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_IO_INSTRUCTION;
+ register long r12 asm("r12") = size;
+ register long r13 asm("r13") = 0;
+ register long r14 asm("r14") = port;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Allow to pass R10, R11, R12, R13 and R14 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12), "=r"(r13),
+ "=r"(r14)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13), "r"(r14)
+ : );
+
+ WARN_ON(ret || r10);
+
+ return r11;
+}
+
+static void tdx_handle_io(struct pt_regs *regs, u32 exit_qual)
+{
+ bool string = exit_qual & 16;
+ int out, size, port;
+
+ /* I/O strings ops are unrolled at build time. */
+ BUG_ON(string);
+
+ out = (exit_qual & 8) ? 0 : 1;
+ size = (exit_qual & 7) + 1;
+ port = exit_qual >> 16;
+
+ if (out) {
+ tdx_out(size, regs->ax, port);
+ } else {
+ regs->ax &= ~GENMASK(8 * size - 1, 0);
+ regs->ax |= tdx_in(size, port) & GENMASK(8 * size - 1, 0);
+ }
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -259,6 +328,9 @@ int tdx_handle_virtualization_exception(struct pt_regs *regs,
case EXIT_REASON_CPUID:
tdx_handle_cpuid(regs);
break;
+ case EXIT_REASON_IO_INSTRUCTION:
+ tdx_handle_io(regs, ve->exit_qual);
+ break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
diff --git a/arch/x86/kernel/tdx_io.S b/arch/x86/kernel/tdx_io.S
new file mode 100644
index 000000000000..00ccbc9711fe
--- /dev/null
+++ b/arch/x86/kernel/tdx_io.S
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+#include <asm/tdx.h>
+
+#define TDCALL .byte 0x66,0x0f,0x01,0xcc
+
+#define EXIT_REASON_IO_INSTRUCTION 30
+
+SYM_FUNC_START(tdx_outb)
+ push %r15
+ push %r12
+
+ xor %r15, %r15
+ mov %al, %r15b
+ mov $1, %r12
+ jmp 1f
+
+SYM_FUNC_START(tdx_outw)
+ push %r15
+ push %r12
+
+ xor %r15, %r15
+ mov %ax, %r15w
+ mov $2, %r12
+ jmp 1f
+
+SYM_FUNC_START(tdx_outl)
+ push %r15
+ push %r12
+
+ xor %r15, %r15
+ mov %eax, %r15d
+ mov $4, %r12
+1:
+ push %rax
+ push %rcx
+ push %r10
+ push %r11
+ push %r13
+ push %r14
+
+ mov $TDVMCALL, %rax
+ mov $TDVMCALL_STANDARD, %r10
+ mov $EXIT_REASON_IO_INSTRUCTION, %r11
+ mov $1, %r13
+ xor %r14, %r14
+ mov %dx, %r14w
+ /* Allow to pass R10, R11, R12, R13, R14 and R15 down to the VMM */
+ mov $0xfc00, %rcx
+
+ TDCALL
+
+ /* Panic if TDVMCALL reports failure */
+ test %rax, %rax
+ jnz 1f
+
+ /* Panic if TDVMCALL reports failure */
+ test %r10, %r10
+ jnz 1f
+
+ pop %r14
+ pop %r13
+ pop %r11
+ pop %r10
+ pop %rcx
+ pop %rax
+
+ pop %r12
+ pop %r15
+ ret
+1:
+ ud2
+SYM_FUNC_END(tdx_outb)
+SYM_FUNC_END(tdx_outw)
+SYM_FUNC_END(tdx_outl)
+EXPORT_SYMBOL(tdx_outb)
+EXPORT_SYMBOL(tdx_outw)
+EXPORT_SYMBOL(tdx_outl)
+
+SYM_FUNC_START(tdx_inb)
+ push %r12
+ mov $1, %r12
+ jmp 1f
+
+SYM_FUNC_START(tdx_inw)
+ push %r12
+ mov $2, %r12
+ jmp 1f
+
+SYM_FUNC_START(tdx_inl)
+ push %r12
+
+ mov $4, %r12
+1:
+ push %r11
+ push %rax
+ push %rcx
+ push %r10
+ push %r13
+ push %r14
+
+ mov $TDVMCALL, %rax
+ mov $TDVMCALL_STANDARD, %r10
+ mov $EXIT_REASON_IO_INSTRUCTION, %r11
+ mov $0, %r13
+ xor %r14, %r14
+ mov %dx, %r14w
+
+ /* Allow to pass R10, R11, R12, R13 and R14 down to the VMM */
+ mov $0x7c00, %rcx
+
+ TDCALL
+
+ /* Panic if TDVMCALL reports failure */
+ test %rax, %rax
+ jnz 1f
+
+ /* Panic if TDVMCALL reports failure */
+ test %r10, %r10
+ jnz 1f
+
+ pop %r14
+ pop %r13
+ pop %r10
+ pop %rcx
+ pop %rax
+
+ mov %r11d, %eax
+
+ pop %r11
+ pop %r12
+ ret
+1:
+ ud2
+SYM_FUNC_END(tdx_inb)
+SYM_FUNC_END(tdx_inw)
+SYM_FUNC_END(tdx_inl)
+EXPORT_SYMBOL(tdx_inb)
+EXPORT_SYMBOL(tdx_inw)
+EXPORT_SYMBOL(tdx_inl)
--
2.25.1

Subject: [RFC v1 12/26] x86/tdx: Handle in-kernel MMIO

From: "Kirill A. Shutemov" <[email protected]>

Handle #VE due to MMIO operations. MMIO triggers #VE with EPT_VIOLATION
exit reason.

For now we only handle the subset of instructions that the kernel uses
for MMIO operations. A user-space access triggers SIGBUS.
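
A one-byte MMIO write then ends up as follows (sketch):

        /*
         * movb %al, (%rdx) against an MMIO mapping:
         *   #VE with exit_reason == EXIT_REASON_EPT_VIOLATION
         *   insn decode: opcode 0x88 -> size = 1, write = true
         *   tdx_mmio(1, true, ve->gpa, regs->ax & 0xff)
         *   regs->ip += insn.length
         */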

Signed-off-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/kernel/tdx.c | 120 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 120 insertions(+)

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 3846d2807a7a..eff58329751e 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -6,6 +6,8 @@
#include <linux/cpu.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
+#include <asm/insn.h>
+#include <linux/sched/signal.h> /* force_sig_fault() */

#ifdef CONFIG_KVM_GUEST
#include "tdx-kvm.c"
@@ -270,6 +272,121 @@ static void tdx_handle_io(struct pt_regs *regs, u32 exit_qual)
}
}

+static unsigned long tdx_mmio(int size, bool write, unsigned long addr,
+ unsigned long val)
+{
+ register long r10 asm("r10") = TDVMCALL_STANDARD;
+ register long r11 asm("r11") = EXIT_REASON_EPT_VIOLATION;
+ register long r12 asm("r12") = size;
+ register long r13 asm("r13") = write;
+ register long r14 asm("r14") = addr;
+ register long r15 asm("r15") = val;
+ register long rcx asm("rcx");
+ long ret;
+
+ /* Pass R10, R11, R12, R13, R14 and R15 down to the VMM */
+ rcx = BIT(10) | BIT(11) | BIT(12) | BIT(13) | BIT(14) | BIT(15);
+
+ asm volatile(TDCALL
+ : "=a"(ret), "=r"(r10), "=r"(r11), "=r"(r12), "=r"(r13),
+ "=r"(r14), "=r"(r15)
+ : "a"(TDVMCALL), "r"(rcx), "r"(r10), "r"(r11), "r"(r12),
+ "r"(r13), "r"(r14), "r"(r15)
+ : );
+
+ WARN_ON(ret || r10);
+
+ return r11;
+}
+
+static inline void *get_reg_ptr(struct pt_regs *regs, struct insn *insn)
+{
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+ };
+ int regno;
+
+ regno = X86_MODRM_REG(insn->modrm.value);
+ if (X86_REX_R(insn->rex_prefix.value))
+ regno += 8;
+
+ return (void *)regs + regoff[regno];
+}
+
+static int tdx_handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+{
+ int size;
+ bool write;
+ unsigned long *reg;
+ struct insn insn;
+ unsigned long val = 0;
+
+ /*
+ * User mode would mean the kernel exposed a device directly
+ * to ring3, which shouldn't happen except for things like
+ * DPDK.
+ */
+ if (user_mode(regs)) {
+ pr_err("Unexpected user-mode MMIO access.\n");
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) ve->gla);
+ return 0;
+ }
+
+ kernel_insn_init(&insn, (void *) regs->ip, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+ insn_get_opcode(&insn);
+
+ write = ve->exit_qual & 0x2;
+
+ size = insn.opnd_bytes;
+ switch (insn.opcode.bytes[0]) {
+ /* MOV r/m8 r8 */
+ case 0x88:
+ /* MOV r8 r/m8 */
+ case 0x8A:
+ /* MOV r/m8 imm8 */
+ case 0xC6:
+ size = 1;
+ break;
+ }
+
+ if (inat_has_immediate(insn.attr)) {
+ BUG_ON(!write);
+ val = insn.immediate.value;
+ tdx_mmio(size, write, ve->gpa, val);
+ return insn.length;
+ }
+
+ BUG_ON(!inat_has_modrm(insn.attr));
+
+ reg = get_reg_ptr(regs, &insn);
+
+ if (write) {
+ memcpy(&val, reg, size);
+ tdx_mmio(size, write, ve->gpa, val);
+ } else {
+ val = tdx_mmio(size, write, ve->gpa, val);
+ memset(reg, 0, size);
+ memcpy(reg, &val, size);
+ }
+ return insn.length;
+}
+
void __init tdx_early_init(void)
{
if (!cpuid_has_tdx_guest())
@@ -331,6 +448,9 @@ int tdx_handle_virtualization_exception(struct pt_regs *regs,
case EXIT_REASON_IO_INSTRUCTION:
tdx_handle_io(regs, ve->exit_qual);
break;
+ case EXIT_REASON_EPT_VIOLATION:
+ ve->instr_len = tdx_handle_mmio(regs, ve);
+ break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
--
2.25.1

Subject: [RFC v1 13/26] x86/tdx: Handle MWAIT, MONITOR and WBINVD

In non-root TDX guest mode, the MWAIT, MONITOR and WBINVD instructions
are not supported. So handle the #VE exceptions they raise as no-ops.

Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/tdx.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index eff58329751e..8d1d7555fb56 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -451,6 +451,23 @@ int tdx_handle_virtualization_exception(struct pt_regs *regs,
case EXIT_REASON_EPT_VIOLATION:
ve->instr_len = tdx_handle_mmio(regs, ve);
break;
+ /*
+ * Per Guest-Host-Communication Interface (GHCI) for Intel Trust
+ * Domain Extensions (Intel TDX) specification, sec 2.4,
+ * some instructions that unconditionally cause #VE (such as WBINVD,
+ * MONITOR, MWAIT) do not have corresponding TDCALL
+ * [TDG.VP.VMCALL <Instruction>] leaves, since the TD has been designed
+ * with no deterministic way to confirm the result of those operations
+ * performed by the host VMM. In those cases, the goal is for the TD
+ * #VE handler to increment the RIP appropriately based on the VE
+ * information provided via TDCALL.
+ */
+ case EXIT_REASON_WBINVD:
+ pr_warn_once("WBINVD #VE Exception\n");
+ fallthrough;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ /* Handle as nops. */
+ break;
default:
pr_warn("Unexpected #VE: %d\n", ve->exit_reason);
return -EFAULT;
--
2.25.1
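
For reference, the "increment the RIP" step that the comment in the
hunk above alludes to happens in the #VE handler once this function
returns, using the instruction length reported via TDCALL. A minimal
sketch of the idea (illustrative, not the exact code from this series):

        /* Skip the instruction that triggered the #VE. */
        regs->ip += ve->instr_len;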

Subject: [RFC v1 14/26] ACPI: tables: Add multiprocessor wake-up support

As per the Guest-Host Communication Interface (GHCI) Specification
for Intel TDX, sec 4.1, a new sub-structure, the multiprocessor
wake-up structure, is added to the ACPI Multiple APIC Description
Table (MADT) to describe the mailbox. If the platform firmware
produces the multiprocessor wake-up structure, then the BSP in the OS
may use this new mailbox-based mechanism to wake up the APs.

Add ACPI MADT wake table parsing support, and if the MADT wake table
is present, update apic->wakeup_secondary_cpu with the new API, which
uses the MADT wake mailbox to wake up APs.

Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/apic.h | 3 ++
arch/x86/kernel/acpi/boot.c | 56 +++++++++++++++++++++++++++++++++
arch/x86/kernel/apic/probe_32.c | 8 +++++
arch/x86/kernel/apic/probe_64.c | 8 +++++
drivers/acpi/tables.c | 9 ++++++
include/acpi/actbl2.h | 21 ++++++++++++-
6 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 34cb3c159481..63f970c61cbe 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -497,6 +497,9 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
}

+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+
extern int default_apic_id_valid(u32 apicid);
extern int default_acpi_madt_oem_check(char *, char *);
extern void default_setup_apic_routing(void);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7bdc0239a943..37ada1908fb7 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -65,6 +65,9 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif

+static struct acpi_madt_mp_wake_mailbox *acpi_mp_wake_mailbox;
+static u64 acpi_mp_wake_mailbox_paddr;
+
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -329,6 +332,29 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}

+static void acpi_mp_wake_mailbox_init(void)
+{
+ if (acpi_mp_wake_mailbox)
+ return;
+
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox), MEMREMAP_WB);
+}
+
+static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+{
+ acpi_mp_wake_mailbox_init();
+
+ if (!acpi_mp_wake_mailbox)
+ return -EINVAL;
+
+ WRITE_ONCE(acpi_mp_wake_mailbox->apic_id, apicid);
+ WRITE_ONCE(acpi_mp_wake_mailbox->wakeup_vector, start_ip);
+ WRITE_ONCE(acpi_mp_wake_mailbox->command, ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ return 0;
+}
+
#endif /*CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_IO_APIC
@@ -1086,6 +1112,30 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
+
+static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_mp_wake *mp_wake = NULL;
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ return -ENODEV;
+
+ mp_wake = (struct acpi_madt_mp_wake *)header;
+ if (BAD_MADT_ENTRY(mp_wake, end))
+ return -EINVAL;
+
+ if (acpi_mp_wake_mailbox)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ acpi_wake_cpu_handler_update(acpi_wakeup_cpu);
+
+ return 0;
+}
#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_X86_IO_APIC
@@ -1284,6 +1334,12 @@ static void __init acpi_process_madt(void)

smp_found_config = 1;
}
+
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MP_WAKE,
+ acpi_parse_mp_wake, 1);
}
if (error == -EINVAL) {
/*
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index a61f642b1b90..d450014841b2 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -207,3 +207,11 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
return 0;
}
+
+void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
+ (*drv)->wakeup_secondary_cpu = handler;
+}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c46720f185c0..986dbb68d3c4 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,3 +50,11 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
return 0;
}
+
+void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
+ (*drv)->wakeup_secondary_cpu = handler;
+}
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index e48690a006a4..5e38748c5db1 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -207,6 +207,15 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header)
}
break;

+ case ACPI_MADT_TYPE_MP_WAKE:
+ {
+ struct acpi_madt_mp_wake *p =
+ (struct acpi_madt_mp_wake *)header;
+ pr_debug("MP Wake (version[%d] mailbox_address[%llx])\n",
+ p->version, p->mailbox_address);
+ }
+ break;
+
default:
pr_warn("Found unsupported MADT entry (type = 0x%x)\n",
header->type);
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index ec66779cb193..be953b638499 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -517,7 +517,8 @@ enum acpi_madt_type {
ACPI_MADT_TYPE_GENERIC_MSI_FRAME = 13,
ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR = 14,
ACPI_MADT_TYPE_GENERIC_TRANSLATOR = 15,
- ACPI_MADT_TYPE_RESERVED = 16 /* 16 and greater are reserved */
+ ACPI_MADT_TYPE_MP_WAKE = 16,
+ ACPI_MADT_TYPE_RESERVED = 17 /* 17 and greater are reserved */
};

/*
@@ -724,6 +725,24 @@ struct acpi_madt_generic_translator {
u32 reserved2;
};

+/* 16: MP Wake (ACPI 6.?) */
+
+struct acpi_madt_mp_wake {
+ struct acpi_subtable_header header;
+ u16 version;
+ u32 reserved2;
+ u64 mailbox_address;
+};
+
+struct acpi_madt_mp_wake_mailbox {
+ u16 command;
+ u16 flags;
+ u32 apic_id;
+ u64 wakeup_vector;
+};
+
+#define ACPI_MP_WAKE_COMMAND_WAKEUP 1
+
/*
* Common flags fields for MADT subtables
*/
--
2.25.1
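
To make the handshake concrete: acpi_wakeup_cpu() above fills in the
mailbox, and the parked AP is expected to poll it. A rough sketch of
the firmware-side loop implied by the structure layout (an illustration
inferred from the fields above, not code from the patch or a quote of
the GHCI spec):

/* Runs on each parked AP, polling the mailbox at mailbox_address. */
static void ap_wait_for_wakeup(struct acpi_madt_mp_wake_mailbox *mb,
                               u32 my_apic_id)
{
        for (;;) {
                if (READ_ONCE(mb->command) == ACPI_MP_WAKE_COMMAND_WAKEUP &&
                    READ_ONCE(mb->apic_id) == my_apic_id) {
                        u64 vector = READ_ONCE(mb->wakeup_vector);

                        /* Ack, so the OS can reuse the mailbox. */
                        WRITE_ONCE(mb->command, 0);
                        jump_to_vector(vector); /* hypothetical handoff */
                }
                cpu_relax();
        }
}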

Subject: [RFC v1 15/26] x86/boot: Add a trampoline for APs booting in 64-bit mode

From: Sean Christopherson <[email protected]>

Add a trampoline for booting APs in 64-bit mode via a software handoff
with BIOS, and use the new trampoline for the ACPI MP wake protocol used
by TDX.

Extend the real mode IDT pointer by four bytes to support LIDT in 64-bit
mode. For the GDT pointer, create a new entry as the existing storage
for the pointer occupies the zero entry in the GDT itself.

Reported-by: Kai Huang <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
---
arch/x86/include/asm/realmode.h | 1 +
arch/x86/kernel/smpboot.c | 5 +++
arch/x86/realmode/rm/header.S | 1 +
arch/x86/realmode/rm/trampoline_64.S | 49 +++++++++++++++++++++++-
arch/x86/realmode/rm/trampoline_common.S | 5 ++-
5 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..5066c8b35e7c 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -25,6 +25,7 @@ struct real_mode_header {
u32 sev_es_trampoline_start;
#endif
#ifdef CONFIG_X86_64
+ u32 trampoline_start64;
u32 trampoline_pgd;
#endif
/* ACPI S3 wakeup */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ca66af96a54..11dd0deb4810 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1035,6 +1035,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
unsigned long boot_error = 0;
unsigned long timeout;

+#ifdef CONFIG_X86_64
+ if (is_tdx_guest())
+ start_ip = real_mode_header->trampoline_start64;
+#endif
+
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 8c1db5bf5d78..2eb62be6d256 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header)
.long pa_sev_es_trampoline_start
#endif
#ifdef CONFIG_X86_64
+ .long pa_trampoline_start64
.long pa_trampoline_pgd;
#endif
/* ACPI S3 wakeup */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 84c5d1b33d10..12b734b1da8b 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -143,13 +143,20 @@ SYM_CODE_START(startup_32)
movl %eax, %cr3

# Set up EFER
+ movl $MSR_EFER, %ecx
+ rdmsr
+ cmp pa_tr_efer, %eax
+ jne .Lwrite_efer
+ cmp pa_tr_efer + 4, %edx
+ je .Ldone_efer
+.Lwrite_efer:
movl pa_tr_efer, %eax
movl pa_tr_efer + 4, %edx
- movl $MSR_EFER, %ecx
wrmsr

+.Ldone_efer:
# Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+ movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0

/*
@@ -161,6 +168,19 @@ SYM_CODE_START(startup_32)
ljmpl $__KERNEL_CS, $pa_startup_64
SYM_CODE_END(startup_32)

+SYM_CODE_START(pa_trampoline_compat)
+ /*
+ * In compatibility mode. Prep ESP and DX for startup_32, then disable
+ * paging and complete the switch to legacy 32-bit mode.
+ */
+ movl $rm_stack_end, %esp
+ movw $__KERNEL_DS, %dx
+
+ movl $(X86_CR0_NE | X86_CR0_PE), %eax
+ movl %eax, %cr0
+ ljmpl $__KERNEL32_CS, $pa_startup_32
+SYM_CODE_END(pa_trampoline_compat)
+
.section ".text64","ax"
.code64
.balign 4
@@ -169,6 +189,20 @@ SYM_CODE_START(startup_64)
jmpq *tr_start(%rip)
SYM_CODE_END(startup_64)

+SYM_CODE_START(trampoline_start64)
+ /*
+ * APs start here on a direct transfer from 64-bit BIOS with identity
+ * mapped page tables. Load the kernel's GDT in order to gear down to
+ * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load
+ * segment registers. Load the zero IDT so any fault triggers a
+ * shutdown instead of jumping back into BIOS.
+ */
+ lidt tr_idt(%rip)
+ lgdt tr_gdt64(%rip)
+
+ ljmpl *tr_compat(%rip)
+SYM_CODE_END(trampoline_start64)
+
.section ".rodata","a"
# Duplicate the global descriptor table
# so the kernel can live anywhere
@@ -182,6 +216,17 @@ SYM_DATA_START(tr_gdt)
.quad 0x00cf93000000ffff # __KERNEL_DS
SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end)

+SYM_DATA_START(tr_gdt64)
+ .short tr_gdt_end - tr_gdt - 1 # gdt limit
+ .long pa_tr_gdt
+ .long 0
+SYM_DATA_END(tr_gdt64)
+
+SYM_DATA_START(tr_compat)
+ .long pa_trampoline_compat
+ .short __KERNEL32_CS
+SYM_DATA_END(tr_compat)
+
.bss
.balign PAGE_SIZE
SYM_DATA(trampoline_pgd, .space PAGE_SIZE)
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
index 5033e640f957..506d5897112a 100644
--- a/arch/x86/realmode/rm/trampoline_common.S
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
.section ".rodata","a"
.balign 16
-SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0)
+SYM_DATA_START_LOCAL(tr_idt)
+ .short 0
+ .quad 0
+SYM_DATA_END(tr_idt)
--
2.25.1

2021-02-08 10:11:20

by Peter Zijlstra

Subject: Re: [RFC v1 04/26] x86/tdx: Get TD execution environment information via TDINFO

On Fri, Feb 05, 2021 at 03:38:21PM -0800, Kuppuswamy Sathyanarayanan wrote:
> +/*
> + * TDCALL instruction is newly added in TDX architecture,
> + * used by TD for requesting the host VMM to provide
> + * (untrusted) services.
> + */
> +#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"

This needs a binutils version number.
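
(A minimal sketch of what that could look like once a TDCALL-aware
assembler can be required; the CONFIG_AS_TDCALL capability symbol below
is hypothetical:)

#ifdef CONFIG_AS_TDCALL         /* hypothetical assembler-capability flag */
#define TDCALL "tdcall"
#else
#define TDCALL ".byte 0x66,0x0f,0x01,0xcc"
#endif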

2021-02-08 10:34:11

by Peter Zijlstra

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 05, 2021 at 03:38:22PM -0800, Kuppuswamy Sathyanarayanan wrote:
> From: "Kirill A. Shutemov" <[email protected]>
>
> The TDX module injects #VE exception to the guest TD in cases of
> disallowed instructions, disallowed MSR accesses and subset of CPUID
> leaves. Also, it's theoretically possible for CPU to inject #VE
> exception on EPT violation, but the TDX module makes sure this does
> not happen, as long as all memory used is properly accepted using
> TDCALLs. You can find more details about it in, Guest-Host-Communication
> Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX)
> specification, sec 2.3.
>
> Add basic infrastructure to handle #VE. If there is no handler for a
> given #VE, since it's an unexpected event (fault case), treat it as a
> general protection fault and handle it using do_general_protection()
> call.
>
> TDCALL[TDGETVEINFO] provides information about #VE such as exit reason.
>
> More details on cases where #VE exceptions are allowed/not-allowed:
>
> The #VE exception does not occur in the paranoid entry paths, like NMIs.
> While other operations during an NMI might cause #VE, these are in the
> NMI code that can handle nesting, so there is no concern about
> reentrancy. This is similar to how #PF is handled in NMIs.
>
> The #VE exception also cannot happen in entry/exit code with the
> wrong gs, such as the SWAPGS code, so its entry point does not
> need "paranoid" handling.

All of the above are arranged by using the below secure EPT for init
text and data?

> Any memory access can cause #VE if it causes an EPT
> violation. However, the VMM is only in direct control of some of the
> EPT tables. The Secure EPT tables are controlled by the TDX module
> which guarantees no EPT violations will result in #VE for the guest,
> once the memory has been accepted.

Which is supposedly then set up to avoid #VE during the syscall gap,
yes? Which then results in #VE not having to be IST.

> +#ifdef CONFIG_INTEL_TDX_GUEST
> +DEFINE_IDTENTRY(exc_virtualization_exception)
> +{
> + struct ve_info ve;
> + int ret;
> +
> + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> +
> + /* Consume #VE info before re-enabling interrupts */

So what happens if NMI happens here, and triggers a nested #VE ?

> + ret = tdx_get_ve_info(&ve);
> + cond_local_irq_enable(regs);
> + if (!ret)
> + ret = tdx_handle_virtualization_exception(regs, &ve);
> + /*
> + * If #VE exception handler could not handle it successfully, treat
> + * it as #GP(0) and handle it.
> + */
> + if (ret)
> + do_general_protection(regs, 0);
> + cond_local_irq_disable(regs);
> +}
> +#endif

2021-02-08 18:37:36

by Andi Kleen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

> Which is supposedly then set up to avoid #VE during the syscall gap,
> yes? Which then results in #VE not having to be IST.

Yes that is currently true because all memory is pre-accepted.

If we ever do lazy accept we would need to make sure the memory accessed in
the syscall gap is already accepted, or move over to an IST.

> > +#ifdef CONFIG_INTEL_TDX_GUEST
> > +DEFINE_IDTENTRY(exc_virtualization_exception)
> > +{
> > + struct ve_info ve;
> > + int ret;
> > +
> > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > +
> > + /* Consume #VE info before re-enabling interrupts */
>
> So what happens if NMI happens here, and triggers a nested #VE ?

Yes that's a gap. We should probably bail out and reexecute the original
instruction. The VE handler would need to set a flag for that.

Or alternatively the NMI always gets the VE information and puts
it on some internal stack, but that would seem clunkier.


-Andi

2021-02-08 18:42:19

by Peter Zijlstra

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Mon, Feb 08, 2021 at 08:23:01AM -0800, Andi Kleen wrote:
> > Which is supposedly then set up to avoid #VE during the syscall gap,
> > yes? Which then results in #VE not having to be IST.
>
> Yes that is currently true because all memory is pre-accepted.
>
> If we ever do lazy accept we would need to make sure the memory accessed in
> the syscall gap is already accepted, or move over to an IST.

I think we're going to mandate the entry text/data will have to be
pre-accepted to avoid IST. ISTs really are crap.

> > > +#ifdef CONFIG_INTEL_TDX_GUEST
> > > +DEFINE_IDTENTRY(exc_virtualization_exception)
> > > +{
> > > + struct ve_info ve;
> > > + int ret;
> > > +
> > > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > > +
> > > + /* Consume #VE info before re-enabling interrupts */
> >
> > So what happens if NMI happens here, and triggers a nested #VE ?
>
> Yes that's a gap. We should probably bail out and reexecute the original
> instruction. The VE handler would need to set a flag for that.
>
> Or alternatively the NMI always gets the VE information and puts
> it on some internal stack, but that would seem clunkier.

The same is possible with MCE and #DB I imagine.

2021-02-08 18:46:20

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Mon, Feb 08, 2021, Peter Zijlstra wrote:
> On Mon, Feb 08, 2021 at 08:23:01AM -0800, Andi Kleen wrote:
> > > > +#ifdef CONFIG_INTEL_TDX_GUEST
> > > > +DEFINE_IDTENTRY(exc_virtualization_exception)
> > > > +{
> > > > + struct ve_info ve;
> > > > + int ret;
> > > > +
> > > > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > > > +
> > > > + /* Consume #VE info before re-enabling interrupts */
> > >
> > > So what happens if NMI happens here, and triggers a nested #VE ?
> >
> > Yes that's a gap. We should probably bail out and reexecute the original
> > instruction. The VE handler would need to set a flag for that.

No, NMI cannot happen here. The TDX-Module "blocks" NMIs until the #VE info is
consumed by the guest.

> > Or alternatively the NMI always gets the VE information and puts
> > it on some internal stack, but that would seem clunkier.
>
> The same is possible with MCE and #DB I imagine.

The MCE "architecture" for a TDX guest is rather stupid. The guest is required
to keep CR4.MCE=1, but at least for TDX 1.0 the VMM is not allowed to inject #MC.
So, for better or worse, #MC is a non-issue.

#VE->#DB->#VE would be an issue, presumably this needs to be noinstr (or whatever
it is that prevents #DBs on functions).

2021-02-08 18:48:38

by Andi Kleen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

> > > So what happens if NMI happens here, and triggers a nested #VE ?
> >
> > Yes that's a gap. We should probably bail out and reexecute the original
> > instruction. The VE handler would need to set a flag for that.
> >
> > Or alternatively the NMI always gets the VE information and puts
> > it on some internal stack, but that would seem clunkier.
>
> The same is possible with MCE and #DB I imagine.

I don't think there are currently any plans to inject #MC into TDX guests. It's
doubtful this could be done securely.

#DB is trickier because it will happen every time, so simply reexecuting
won't work. I guess it would need the #VE info stack, or some care in the
kprobes/kernel debugger code to ensure it cannot happen. I think I would
prefer the latter.

-Andi

2021-02-08 18:57:48

by Peter Zijlstra

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Mon, Feb 08, 2021 at 08:46:23AM -0800, Sean Christopherson wrote:
> On Mon, Feb 08, 2021, Peter Zijlstra wrote:
> > On Mon, Feb 08, 2021 at 08:23:01AM -0800, Andi Kleen wrote:
> > > > > +#ifdef CONFIG_INTEL_TDX_GUEST
> > > > > +DEFINE_IDTENTRY(exc_virtualization_exception)
> > > > > +{
> > > > > + struct ve_info ve;
> > > > > + int ret;
> > > > > +
> > > > > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > > > > +
> > > > > + /* Consume #VE info before re-enabling interrupts */
> > > >
> > > > So what happens if NMI happens here, and triggers a nested #VE ?
> > >
> > > Yes that's a gap. We should probably bail out and reexecute the original
> > > instruction. The VE handler would need to set a flag for that.
>
> No, NMI cannot happen here. The TDX-Module "blocks" NMIs until the #VE info is
> consumed by the guest.

'cute', might be useful to have that mentioned somewhere.

> > > Or alternatively the NMI always gets the VE information and puts
> > > it on some internal stack, but that would seem clunkier.
> >
> > The same is possible with MCE and #DB I imagine.
>
> The MCE "architecture" for a TDX guest is rather stupid. The guest is required
> to keep CR4.MCE=1, but at least for TDX 1.0 the VMM is not allowed to inject #MC.
> So, for better or worse, #MC is a non-issue.
>
> #VE->#DB->#VE would be an issue, presumably this needs to be noinstr (or whatever
> it is that prevents #DBs on functions).

Ah, it is that already of course, so yeah #DB can't happen here.

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest



On 2/8/21 8:59 AM, Peter Zijlstra wrote:
> 'cute', might be useful to have that mentioned somewhere.
We will add a note for it in the comments.

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

Subject: Re: [RFC v1 04/26] x86/tdx: Get TD execution environment information via TDINFO



On 2/8/21 2:00 AM, Peter Zijlstra wrote:
> This needs a binutils version number.
Yes, we will add it in the next version.

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

2021-02-12 19:23:57

by Dave Hansen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> More details on cases where #VE exceptions are allowed/not-allowed:
>
> The #VE exception does not occur in the paranoid entry paths, like NMIs.
> While other operations during an NMI might cause #VE, these are in the
> NMI code that can handle nesting, so there is no concern about
> reentrancy. This is similar to how #PF is handled in NMIs.
>
> The #VE exception also cannot happen in entry/exit code with the
> wrong gs, such as the SWAPGS code, so its entry point does not
> need "paranoid" handling.

Considering:

https://lore.kernel.org/lkml/20200825171903.GA20660@sjchrist-ice/

I would suggest revisiting this part of the changelog.

2021-02-12 19:49:18

by Andy Lutomirski

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 5, 2021 at 3:39 PM Kuppuswamy Sathyanarayanan
<[email protected]> wrote:
>
> From: "Kirill A. Shutemov" <[email protected]>
>
> The TDX module injects #VE exception to the guest TD in cases of
> disallowed instructions, disallowed MSR accesses and subset of CPUID
> leaves. Also, it's theoretically possible for CPU to inject #VE
> exception on EPT violation, but the TDX module makes sure this does
> not happen, as long as all memory used is properly accepted using
> TDCALLs.

By my very cursory reading of the TDX arch specification 9.8.2,
"Secure" EPT violations don't send #VE. But the docs are quite
unclear, or at least the docs I found are. What happens if the guest
attempts to access a secure GPA that is not ACCEPTed? For example,
suppose the VMM does TDH.MEM.PAGE.REMOVE on a secure address and the
guest accesses it, via instruction fetch or data access. What
happens?

2021-02-12 20:09:51

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021, Andy Lutomirski wrote:
> On Fri, Feb 5, 2021 at 3:39 PM Kuppuswamy Sathyanarayanan
> <[email protected]> wrote:
> >
> > From: "Kirill A. Shutemov" <[email protected]>
> >
> > The TDX module injects #VE exception to the guest TD in cases of
> > disallowed instructions, disallowed MSR accesses and subset of CPUID
> > leaves. Also, it's theoretically possible for CPU to inject #VE
> > exception on EPT violation, but the TDX module makes sure this does
> > not happen, as long as all memory used is properly accepted using
> > TDCALLs.
>
> By my very cursory reading of the TDX arch specification 9.8.2,
> "Secure" EPT violations don't send #VE. But the docs are quite
> unclear, or at least the docs I found are.

The version I have also states that SUPPRESS_VE is always set. So either there
was a change in direction, or the public docs need to be updated. Lazy accept
requires a #VE, either from hardware or from the module. The latter would
require walking the Secure EPT tables on every EPT violation...

> What happens if the guest attempts to access a secure GPA that is not
> ACCEPTed? For example, suppose the VMM does THH.MEM.PAGE.REMOVE on a secure
> address and the guest accesses it, via instruction fetch or data access.
> What happens?

Well, as currently written in the spec, it will generate an EPT violation and
the host will have no choice but to kill the guest.

2021-02-12 20:20:05

by Dave Hansen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On 2/12/21 12:06 PM, Sean Christopherson wrote:
>> What happens if the guest attempts to access a secure GPA that is not
>> ACCEPTed? For example, suppose the VMM does TDH.MEM.PAGE.REMOVE on a secure
>> address and the guest accesses it, via instruction fetch or data access.
>> What happens?
> Well, as currently written in the spec, it will generate an EPT violation and
> the host will have no choice but to kill the guest.

That's actually perfect behavior from my perspective. Host does
something stupid. Host gets left holding the pieces. No enabling to do
in the guest.

This doesn't *preclude* the possibility that the VMM and guest could
establish a protocol to remove guest pages. It just means that the host
can't go it alone and that if they guest and host get out of sync, the
guest dies.

In other words, I think I'm rooting for the docs, as written. :)

2021-02-12 20:21:58

by Andy Lutomirski

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest


> On Feb 12, 2021, at 12:06 PM, Sean Christopherson <[email protected]> wrote:
>
> On Fri, Feb 12, 2021, Andy Lutomirski wrote:
>>> On Fri, Feb 5, 2021 at 3:39 PM Kuppuswamy Sathyanarayanan
>>> <[email protected]> wrote:
>>>
>>> From: "Kirill A. Shutemov" <[email protected]>
>>>
>>> The TDX module injects #VE exception to the guest TD in cases of
>>> disallowed instructions, disallowed MSR accesses and subset of CPUID
>>> leaves. Also, it's theoretically possible for CPU to inject #VE
>>> exception on EPT violation, but the TDX module makes sure this does
>>> not happen, as long as all memory used is properly accepted using
>>> TDCALLs.
>>
>> By my very cursory reading of the TDX arch specification 9.8.2,
>> "Secure" EPT violations don't send #VE. But the docs are quite
>> unclear, or at least the docs I found are.
>
> The version I have also states that SUPPRESS_VE is always set. So either there
> was a change in direction, or the public docs need to be updated. Lazy accept
> requires a #VE, either from hardware or from the module. The latter would
> require walking the Secure EPT tables on every EPT violation...
>
>> What happens if the guest attempts to access a secure GPA that is not
>> ACCEPTed? For example, suppose the VMM does TDH.MEM.PAGE.REMOVE on a secure
>> address and the guest accesses it, via instruction fetch or data access.
>> What happens?
>
> Well, as currently written in the spec, it will generate an EPT violation and
> the host will have no choice but to kill the guest.

Or page the page back in and try again?

In regular virt guests, if the host pages out a guest page, it's the host's job to put it back when needed. In paravirt, a well-designed async page fault protocol can sometimes let the guest do useful work when this happens. If a guest (or bare metal) has its memory hot-removed (via balloon or whatever) and the kernel messes up and accesses removed memory, the guest (or bare metal) is toast.

I don’t see why TDX needs to be any different.

2021-02-12 20:40:02

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021, Dave Hansen wrote:
> On 2/12/21 12:06 PM, Sean Christopherson wrote:
> >> What happens if the guest attempts to access a secure GPA that is not
> >> ACCEPTed? For example, suppose the VMM does TDH.MEM.PAGE.REMOVE on a secure
> >> address and the guest accesses it, via instruction fetch or data access.
> >> What happens?
> > Well, as currently written in the spec, it will generate an EPT violation and
> > the host will have no choice but to kill the guest.
>
> That's actually perfect behavior from my perspective. Host does
> something stupid. Host gets left holding the pieces. No enabling to do
> in the guest.
>
> This doesn't *preclude* the possibility that the VMM and guest could
> establish a protocol to remove guest pages. It just means that the host
> can't go it alone and that if they guest and host get out of sync, the
> guest dies.
>
> In other words, I think I'm rooting for the docs, as written. :)

I tentatively agree that the host should not be able to remove pages without
guest approval, but that's not the only use case for #VE on EPT violations.
It's not even really an intended use case.

There needs to be a mechanism for lazy/deferred/on-demand acceptance of pages.
E.g. pre-accepting every page in a VM with hundreds of GB of memory will be
ridiculously slow.

#VE is the best option to do that:

- Relatively sane re-entrancy semantics.
- Hardware accelerated.
- Doesn't require stealing an IRQ from the guest.

2021-02-12 20:47:23

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021, Andy Lutomirski wrote:
>
> > On Feb 12, 2021, at 12:06 PM, Sean Christopherson <[email protected]> wrote:
> >
> > On Fri, Feb 12, 2021, Andy Lutomirski wrote:
> >>> On Fri, Feb 5, 2021 at 3:39 PM Kuppuswamy Sathyanarayanan
> >>> <[email protected]> wrote:
> >>>
> >>> From: "Kirill A. Shutemov" <[email protected]>
> >>>
> >>> The TDX module injects #VE exception to the guest TD in cases of
> >>> disallowed instructions, disallowed MSR accesses and subset of CPUID
> >>> leaves. Also, it's theoretically possible for CPU to inject #VE
> >>> exception on EPT violation, but the TDX module makes sure this does
> >>> not happen, as long as all memory used is properly accepted using
> >>> TDCALLs.
> >>
> >> By my very cursory reading of the TDX arch specification 9.8.2,
> >> "Secure" EPT violations don't send #VE. But the docs are quite
> >> unclear, or at least the docs I found are.
> >
> > The version I have also states that SUPPRESS_VE is always set. So either there
> > was a change in direction, or the public docs need to be updated. Lazy accept
> > requires a #VE, either from hardware or from the module. The latter would
> > require walking the Secure EPT tables on every EPT violation...
> >
> >> What happens if the guest attempts to access a secure GPA that is not
> >> ACCEPTed? For example, suppose the VMM does TDH.MEM.PAGE.REMOVE on a secure
> >> address and the guest accesses it, via instruction fetch or data access.
> >> What happens?
> >
> > Well, as currently written in the spec, it will generate an EPT violation and
> > the host will have no choice but to kill the guest.
>
> Or page the page back in and try again?

The intended use isn't for swapping a page or migrating a page. Those flows
have dedicated APIs, and do not _remove_ a page.

E.g. the KVM RFC patches already support zapping Secure EPT entries if NUMA
balancing kicks in. But, in TDX terminology, that is a BLOCK/UNBLOCK operation.

Removal is for converting a private page to a shared page, and for paravirt
memory ballooning.

> In regular virt guests, if the host pages out a guest page, it’s the host’s
> job to put it back when needed. In paravirt, a well designed async of
> protocol can sometimes let the guest to useful work when this happens. If a
> guest (or bare metal) has its memory hot removed (via balloon or whatever)
> and the kernel messes up and accesses removed memory, the guest (or bare
> metal) is toast.
>
> I don’t see why TDX needs to be any different.

The REMOVE API isn't intended for swap. In fact, it can't be used for swap. If
a page is removed, its contents are lost. Because the original contents are
lost, the guest is required to re-accept the page so that the host can't
silently get the guest to consume a zero page that the guest thinks has valid
data.

For swap, the contents are preserved, and so explicit re-acceptance is not
required. From the guest's perspective, it's really just a high-latency memory
access.

2021-02-12 20:49:43

by Dave Hansen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On 2/12/21 12:37 PM, Sean Christopherson wrote:
> There needs to be a mechanism for lazy/deferred/on-demand acceptance of pages.
> E.g. pre-accepting every page in a VM with hundreds of GB of memory will be
> ridiculously slow.
>
> #VE is the best option to do that:
>
> - Relatively sane re-entrancy semantics.
> - Hardware accelerated.
> - Doesn't require stealing an IRQ from the guest.

TDX already provides a basic environment for the guest when it starts
up. The guest has some known, good memory. The guest also has a very,
very clear understanding of which physical pages it uses and when. It's
staged, of course, as decompression happens and the guest comes up.

But, the guest still knows which guest physical pages it accesses and
when. It doesn't need on-demand faulting in of non-accepted pages. It
can simply decline to expose non-accepted pages to the wider system
before they've been accepted.

It would be nuts to merrily free non-accepted pages into the page
allocator and handle the #VE fallout as they're touched from
god-knows-where.

I don't see *ANY* case for #VE to occur inside the guest kernel, outside
of *VERY* narrow places like copy_from_user(). Period. #VE from ring-0
is not OK.

So, no, #VE is not the best option. No #VE's in the first place is the
best option.

2021-02-12 20:58:08

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021, Dave Hansen wrote:
> On 2/12/21 12:37 PM, Sean Christopherson wrote:
> > There needs to be a mechanism for lazy/deferred/on-demand acceptance of pages.
> > E.g. pre-accepting every page in a VM with hundreds of GB of memory will be
> > ridiculously slow.
> >
> > #VE is the best option to do that:
> >
> > - Relatively sane re-entrancy semantics.
> > - Hardware accelerated.
> > - Doesn't require stealing an IRQ from the guest.
>
> TDX already provides a basic environment for the guest when it starts
> up. The guest has some known, good memory. The guest also has a very,
> very clear understanding of which physical pages it uses and when. It's
> staged, of course, as decompression happens and the guest comes up.
>
> But, the guest still knows which guest physical pages it accesses and
> when. It doesn't need on-demand faulting in of non-accepted pages. It
> can simply decline to expose non-accepted pages to the wider system
> before they've been accepted.
>
> It would be nuts to merrily free non-accepted pages into the page
> allocator and handle the #VE fallout as they're touched from
> god-knows-where.
>
> I don't see *ANY* case for #VE to occur inside the guest kernel, outside
> of *VERY* narrow places like copy_from_user(). Period. #VE from ring-0
> is not OK.
>
> So, no, #VE is not the best option. No #VE's in the first place is the
> best option.

Ah, I see what you're thinking.

Treating an EPT #VE as fatal was also considered as an option. IIUC it was
thought that finding every nook and cranny that could access a page, without
forcing the kernel to pre-accept huge swaths of memory, would be very difficult.
It'd be wonderful if that's not the case.

2021-02-12 21:09:06

by Dave Hansen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On 2/12/21 12:54 PM, Sean Christopherson wrote:
> Ah, I see what you're thinking.
>
> Treating an EPT #VE as fatal was also considered as an option. IIUC it was
> thought that finding every nook and cranny that could access a page, without
> forcing the kernel to pre-accept huge swaths of memory, would be very difficult.
> It'd be wonderful if that's not the case.

We have to manually set up the page table entries for every physical
page of memory (except for the hard-coded early stuff below 8MB or
whatever). We *KNOW*, 100% before physical memory is accessed.

There aren't nooks and crannies where memory is accessed. There are a
few, very well-defined choke points which must be crossed before memory
is accessed. Page table creation, bootmem and the core page allocator
come to mind.

If Linux doesn't have a really good handle on which physical pages are
accessed when, we've got bigger problems on our hands. Remember, we
even have debugging mechanisms that unmap pages from the kernel when
they're in the allocator. We know so well that nobody is accessing
those physical addresses that we even tell hypervisors they can toss the
page contents and remove the physical backing (guest free page hinting).

2021-02-12 21:39:59

by Sean Christopherson

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021, Dave Hansen wrote:
> On 2/12/21 12:54 PM, Sean Christopherson wrote:
> > Ah, I see what you're thinking.
> >
> > Treating an EPT #VE as fatal was also considered as an option. IIUC it was
> > thought that finding every nook and cranny that could access a page, without
> > forcing the kernel to pre-accept huge swaths of memory, would be very difficult.
> > It'd be wonderful if that's not the case.
>
> We have to manually set up the page table entries for every physical
> page of memory (except for the hard-coded early stuff below 8MB or
> whatever). We *KNOW*, 100% before physical memory is accessed.
>
> There aren't nooks and crannies where memory is accessed. There are a
> few, very well-defined choke points which must be crossed before memory
> is accessed. Page table creation, bootmem and the core page allocator
> come to mind.

Heh, for me, that's two places too many beyond my knowledge domain to feel
comfortable putting a stake in the ground saying #VE isn't necessary.

Joking aside, I agree that treating EPT #VEs as fatal would be ideal, but from a
TDX architecture perspective, when considering all possible kernels, drivers,
configurations, etc..., it's risky to say that there will _never_ be a scenario
that "requires" #VE.

What about adding a property to the TD, e.g. via a flag set during TD creation,
that controls whether unaccepted accesses cause #VE or are, for all intents and
purposes, fatal? That would allow Linux to pursue treating EPT #VEs for private
GPAs as fatal, but would give us a safety net and not prevent others from utilizing
#VEs.

I suspect it would also be helpful for debug, e.g. if the kernel manages to do
something stupid and maps memory it hasn't accepted, in which case debugging a
#VE in the guest is likely easier than an opaque EPT violation in the host.

> If Linux doesn't have a really good handle on which physical pages are
> accessed when, we've got bigger problems on our hands. Remember, we
> even have debugging mechanisms that unmap pages from the kernel when
> they're in the allocator. We know so well that nobody is accessing
> those physical addresses that we even tell hypervisors they can toss the
> page contents and remove the physical backing (guest free page hinting).

2021-02-12 21:50:33

by Andy Lutomirski

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021 at 1:37 PM Sean Christopherson <[email protected]> wrote:
>
> On Fri, Feb 12, 2021, Dave Hansen wrote:
> > On 2/12/21 12:54 PM, Sean Christopherson wrote:
> > > Ah, I see what you're thinking.
> > >
> > > Treating an EPT #VE as fatal was also considered as an option. IIUC it was
> > > thought that finding every nook and cranny that could access a page, without
> > > forcing the kernel to pre-accept huge swaths of memory, would be very difficult.
> > > It'd be wonderful if that's not the case.
> >
> > We have to manually set up the page table entries for every physical
> > page of memory (except for the hard-coded early stuff below 8MB or
> > whatever). We *KNOW*, 100% before physical memory is accessed.
> >
> > There aren't nooks and crannies where memory is accessed. There are a
> > few, very well-defined choke points which must be crossed before memory
> > is accessed. Page table creation, bootmem and the core page allocator
> > come to mind.
>
> Heh, for me, that's two places too many beyond my knowledge domain to feel
> comfortable putting a stake in the ground saying #VE isn't necessary.
>
> Joking aside, I agree that treating EPT #VEs as fatal would be ideal, but from a
> TDX architecture perspective, when considering all possible kernels, drivers,
> configurations, etc..., it's risky to say that there will _never_ be a scenario
> that "requires" #VE.
>
> What about adding a property to the TD, e.g. via a flag set during TD creation,
> that controls whether unaccepted accesses cause #VE or are, for all intents and
> purposes, fatal? That would allow Linux to pursue treating EPT #VEs for private
> GPAs as fatal, but would give us a safety net and not prevent others from utilizing
> #VEs.

That seems reasonable.

2021-02-12 21:50:42

by Dave Hansen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On 2/12/21 1:47 PM, Andy Lutomirski wrote:
>> What about adding a property to the TD, e.g. via a flag set during TD creation,
>> that controls whether unaccepted accesses cause #VE or are, for all intents and
>> purposes, fatal? That would allow Linux to pursue treating EPT #VEs for private
>> GPAs as fatal, but would give us a safety net and not prevent others from utilizing
>> #VEs.
> That seems reasonable.

Ditto.

We first need to double check to see if the docs are right, though.

2021-02-14 19:42:15

by Andi Kleen

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Fri, Feb 12, 2021 at 01:48:36PM -0800, Dave Hansen wrote:
> On 2/12/21 1:47 PM, Andy Lutomirski wrote:
> >> What about adding a property to the TD, e.g. via a flag set during TD creation,
> >> that controls whether unaccepted accesses cause #VE or are, for all intents and
> >> purposes, fatal? That would allow Linux to pursue treating EPT #VEs for private
> >> GPAs as fatal, but would give us a safety net and not prevent others from utilizing
> >> #VEs.
> > That seems reasonable.
>
> Ditto.
>
> We first need to double check to see if the docs are right, though.

I confirmed with the TDX module owners that #VE can only happen for:
- unaccepted pages
- instructions like MSR access or CPUID
- specific instructions that are not in the syscall gap

Also if there are future asynchronous #VEs they would only happen
with IF=1, which would also protect the gap.

So no need to make #VE an IST.

-Andi

2021-02-14 20:40:19

by Andy Lutomirski

Subject: Re: [RFC v1 05/26] x86/traps: Add #VE support for TDX guest

On Sun, Feb 14, 2021 at 11:33 AM Andi Kleen <[email protected]> wrote:
>
> On Fri, Feb 12, 2021 at 01:48:36PM -0800, Dave Hansen wrote:
> > On 2/12/21 1:47 PM, Andy Lutomirski wrote:
> > >> What about adding a property to the TD, e.g. via a flag set during TD creation,
> > >> that controls whether unaccepted accesses cause #VE or are, for all intents and
> > >> purposes, fatal? That would allow Linux to pursue treating EPT #VEs for private
> > >> GPAs as fatal, but would give us a safety net and not prevent others from utilizing
> > >> #VEs.
> > > That seems reasonable.
> >
> > Ditto.
> >
> > We first need to double check to see if the docs are right, though.
>
> I confirmed with the TDX module owners that #VE can only happen for:
> - unaccepted pages

Can the hypervisor cause an already-accepted secure-EPT page to
transition to the unaccepted state? If so, NAK. Sorry, upstream
Linux does not need yet more hacks to make it kind-of-sort-of work on
the broken x86 exception architecture, especially for a feature that
is marketed for security.

As I understand it, the entire point of the TDX modular design is to
make it possible to fix at least some amount of architectural error
without silicon revisions. If it is indeed the case that an access to
an unaccepted secure-EPT page will cause #VE, then Intel needs to take
the following actions:

1. Update the documentation to make the behavior comprehensible to
mere mortals. Right now, this information appears to exist in the
form of emails and is, as far as I can tell, not present in the
documentation in a way that we can understand. Keep in mind that this
discussion includes a number of experts on the software aspects of the
x86 architecture, and the fact that none of us who don't work for
Intel can figure out, authoritatively, what the spec is trying to tell
us should be a huge red flag.

2. Fix the architecture. Barring some unexpected discovery, some
highly compelling reason, or a design entailing a number of
compromises that will, frankly, be rather embarrassing, upstream Linux
will not advertise itself as a secure implementation of a TDX guest
with the architecture in its current state. If you would like Linux
to print a giant message along the lines of "WARNING: The TDX
architecture is defective and, as a result, your system is vulnerable
to compromise by a malicious hypervisor that uses the
TDH.MEM.PAGE.REMOVE operation. The advertised security properties of
the Intel TDX architecture are not available. Use TDX at your own
risk.", we could consider that. I think it would look pretty bad.

3. Engage with the ISV community, including Linux, to meaningfully
review new Intel designs for software usability. Meaningful review
does not mean that you send us a spec, we tell you that it's broken,
and you ship it anyway. Meaningful review also means that the
questions that the software people ask you need to be answered in a
public, authoritative location, preferably the primary spec publicly
available at Intel's website. Emails don't count for this purpose.

There is no particular shortage of CVEs of varying degrees of severity
due to nonsensical warts in the x86 architecture causing CPL 0 kernels
to malfunction and become subject to privilege escalation. We are
telling you loud and clear that the current TDX architecture appears
to be a minefield and that it is *specifically* vulnerable to an
attack in which a page accessed early in SYSCALL path (or late in the
SYSRET path) causes #VE. You need to take this seriously.

--Andy

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

Hi All,

On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> Hi All,
>
> NOTE: This series is not ready for wide public review. It is being
> specifically posted so that Peter Z and other experts on the entry
> code can look for problems with the new exception handler (#VE).
> That's also why x86@ is not being spammed.

We are currently working on a solution to fix the issues raised in the
"Add #VE support for TDX guest" patch. While we fix that issue, I would
like to know whether there are issues in the other patches in this
series. So, if possible, can you please review the other patches in the
series and let us know your comments?

If you want me to rebase the series on top of v5.12-rcX kernel and repost it,
please let me know.

>
> Intel's Trust Domain Extensions (TDX) protect guest VMs from malicious
> hosts and some physical attacks. This series adds the bare-minimum
> support to run a TDX guest. The host-side support will be submitted
> separately. Also support for advanced TD guest features like attestation
> or debug-mode will be submitted separately. Also, at this point it is not
> secure with some known holes in drivers, and also hasn’t been fully audited
> and fuzzed yet.
>
> TDX has a lot of similarities to SEV. It enhances confidentiality
> of guest memory and state (like registers) and includes a new exception
> (#VE) for the same basic reasons as SEV-ES. Like SEV-SNP (not merged
> yet), TDX limits the host's ability to effect changes in the guest
> physical address space.
>
> In contrast to the SEV code in the kernel, TDX guest memory is integrity
> protected and isolated; the host is prevented from accessing guest
> memory (even ciphertext).
>
> The TDX architecture also includes a new CPU mode called
> Secure-Arbitration Mode (SEAM). The software (TDX module) running in this
> mode arbitrates interactions between host and guest and implements many of
> the guarantees of the TDX architecture.
>
> Some of the key differences between a TD and a regular VM are:
>
> 1. Multi CPU bring-up is done using the ACPI MADT wake-up table.
> 2. A new #VE exception handler is added. The TDX module injects #VE exception
> to the guest TD in cases of instructions that need to be emulated, disallowed
> MSR accesses, subset of CPUID leaves, etc.
> 3. By default memory is marked as private, and TD will selectively share it with
> VMM based on need.
> 4. Remote attestation is supported to enable a third party (either the owner of
> the workload or a user of the services provided by the workload) to establish
> that the workload is running on an Intel-TDX-enabled platform located within a
> TD prior to providing that workload data.
>
> You can find TDX related documents in the following link.
>
> https://software.intel.com/content/www/br/pt/develop/articles/intel-trust-domain-extensions.html
>
> This RFC series has been reviewed by Dave Hansen.
>
> Kirill A. Shutemov (16):
> x86/paravirt: Introduce CONFIG_PARAVIRT_XL
> x86/tdx: Get TD execution environment information via TDINFO
> x86/traps: Add #VE support for TDX guest
> x86/tdx: Add HLT support for TDX guest
> x86/tdx: Wire up KVM hypercalls
> x86/tdx: Add MSR support for TDX guest
> x86/tdx: Handle CPUID via #VE
> x86/io: Allow to override inX() and outX() implementation
> x86/tdx: Handle port I/O
> x86/tdx: Handle in-kernel MMIO
> x86/mm: Move force_dma_unencrypted() to common code
> x86/tdx: Exclude Shared bit from __PHYSICAL_MASK
> x86/tdx: Make pages shared in ioremap()
> x86/tdx: Add helper to do MapGPA TDVMCALL
> x86/tdx: Make DMA pages shared
> x86/kvm: Use bounce buffers for TD guest
>
> Kuppuswamy Sathyanarayanan (6):
> x86/cpufeatures: Add TDX Guest CPU feature
> x86/cpufeatures: Add is_tdx_guest() interface
> x86/tdx: Handle MWAIT, MONITOR and WBINVD
> ACPI: tables: Add multiprocessor wake-up support
> x86/topology: Disable CPU hotplug support for TDX platforms.
> x86/tdx: Introduce INTEL_TDX_GUEST config option
>
> Sean Christopherson (4):
> x86/boot: Add a trampoline for APs booting in 64-bit mode
> x86/boot: Avoid #VE during compressed boot for TDX platforms
> x86/boot: Avoid unnecessary #VE during boot process
> x86/tdx: Forcefully disable legacy PIC for TDX guests
>
> arch/x86/Kconfig | 28 +-
> arch/x86/boot/compressed/Makefile | 2 +
> arch/x86/boot/compressed/head_64.S | 10 +-
> arch/x86/boot/compressed/misc.h | 1 +
> arch/x86/boot/compressed/pgtable.h | 2 +-
> arch/x86/boot/compressed/tdx.c | 32 ++
> arch/x86/boot/compressed/tdx_io.S | 9 +
> arch/x86/include/asm/apic.h | 3 +
> arch/x86/include/asm/asm-prototypes.h | 1 +
> arch/x86/include/asm/cpufeatures.h | 1 +
> arch/x86/include/asm/idtentry.h | 4 +
> arch/x86/include/asm/io.h | 25 +-
> arch/x86/include/asm/irqflags.h | 42 +-
> arch/x86/include/asm/kvm_para.h | 21 +
> arch/x86/include/asm/paravirt.h | 22 +-
> arch/x86/include/asm/paravirt_types.h | 3 +-
> arch/x86/include/asm/pgtable.h | 3 +
> arch/x86/include/asm/realmode.h | 1 +
> arch/x86/include/asm/tdx.h | 114 +++++
> arch/x86/kernel/Makefile | 1 +
> arch/x86/kernel/acpi/boot.c | 56 +++
> arch/x86/kernel/apic/probe_32.c | 8 +
> arch/x86/kernel/apic/probe_64.c | 8 +
> arch/x86/kernel/head64.c | 3 +
> arch/x86/kernel/head_64.S | 13 +-
> arch/x86/kernel/idt.c | 6 +
> arch/x86/kernel/paravirt.c | 4 +-
> arch/x86/kernel/pci-swiotlb.c | 2 +-
> arch/x86/kernel/smpboot.c | 5 +
> arch/x86/kernel/tdx-kvm.c | 116 +++++
> arch/x86/kernel/tdx.c | 560 +++++++++++++++++++++++
> arch/x86/kernel/tdx_io.S | 143 ++++++
> arch/x86/kernel/topology.c | 3 +-
> arch/x86/kernel/traps.c | 73 ++-
> arch/x86/mm/Makefile | 2 +
> arch/x86/mm/ioremap.c | 8 +-
> arch/x86/mm/mem_encrypt.c | 74 ---
> arch/x86/mm/mem_encrypt_common.c | 83 ++++
> arch/x86/mm/mem_encrypt_identity.c | 1 +
> arch/x86/mm/pat/set_memory.c | 23 +-
> arch/x86/realmode/rm/header.S | 1 +
> arch/x86/realmode/rm/trampoline_64.S | 49 +-
> arch/x86/realmode/rm/trampoline_common.S | 5 +-
> drivers/acpi/tables.c | 9 +
> include/acpi/actbl2.h | 21 +-
> 45 files changed, 1444 insertions(+), 157 deletions(-)
> create mode 100644 arch/x86/boot/compressed/tdx.c
> create mode 100644 arch/x86/boot/compressed/tdx_io.S
> create mode 100644 arch/x86/include/asm/tdx.h
> create mode 100644 arch/x86/kernel/tdx-kvm.c
> create mode 100644 arch/x86/kernel/tdx.c
> create mode 100644 arch/x86/kernel/tdx_io.S
> create mode 100644 arch/x86/mm/mem_encrypt_common.c
>

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

2021-04-01 19:59:58

by Dave Hansen

Subject: Re: [RFC v1 12/26] x86/tdx: Handle in-kernel MMIO

On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> From: "Kirill A. Shutemov" <[email protected]>
>
> Handle #VE due to MMIO operations. MMIO triggers #VE with EPT_VIOLATION
> exit reason.
>
> For now we only handle the subset of instructions that the kernel uses
> for MMIO operations. User-space access triggers SIGBUS.
..
> +	case EXIT_REASON_EPT_VIOLATION:
> +		ve->instr_len = tdx_handle_mmio(regs, ve);
> +		break;

Is MMIO literally the only thing that can cause an EPT violation for TDX
guests?

Forget userspace for a minute. #VE's from userspace are annoying, but
fine. We can't control what userspace does. If an action it takes
causes a #VE in the TDX architecture, tough cookies, the kernel must
handle it and try to recover or kill the app.

The kernel is very different. We know in advance (must know,
actually...) which instructions might cause exceptions of any kind.
That's why we have exception tables and copy_to/from_user(). That's why
we can handle kernel page faults on userspace, but not inside spinlocks.
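
For illustration, this is roughly the classic exception-table pattern
(a simplified sketch, not the real get_user(); real code also zeroes
the output on fault):

	/*
	 * Sketch: a load from a userspace pointer that is *expected*
	 * to possibly fault. If the MOV at label 1 faults, the fixup
	 * at label 3 sets -EFAULT and resumes at label 2 instead of
	 * oopsing.
	 */
	static inline int sketch_get_user(int *val, const int __user *ptr)
	{
		int ret = 0;

		asm volatile("1:	movl (%2), %0\n"
			     "2:\n"
			     ".section .fixup,\"ax\"\n"
			     "3:	movl $-14, %1\n"	/* -EFAULT */
			     "	jmp 2b\n"
			     ".previous\n"
			     _ASM_EXTABLE(1b, 3b)
			     : "=r" (*val), "+r" (ret)
			     : "r" (ptr));
		return ret;
	}

The kernel can enumerate every such site; there is no equivalent
annotation for an unexpected #VE.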

Binary-dependent OSes are also very different. It's going to be natural
for them to want to take existing, signed drivers and use them in TDX
guests. They might want to do something like this.

But for an OS where we have source for the *ENTIRE* thing, and where we
have a chokepoint for MMIO accesses (arch/x86/include/asm/io.h), it
seems like an *AWFUL* idea to:
1. Have the kernel set up special mappings for I/O memory
2. Kernel generates special instructions to access that memory
3. Kernel faults on that memory
4. Kernel cracks its own special instructions to see what they were
doing
5. Kernel calls up to host to do the MMIO

Instead of doing 2/3/4, why not just have #2 call up to the host
directly? This patch seems a very slow, roundabout way to do
paravirtualized MMIO.

BTW, there's already some SEV special-casing in io.h.
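
A minimal sketch of what that direct call could look like in the io.h
accessors, assuming a tdg_mmio_read() TDVMCALL wrapper (hypothetical
name):

	/* Sketch: paravirtualized MMIO read, no #VE round trip. */
	static inline u32 __readl(const volatile void __iomem *addr)
	{
		u32 val;

		if (is_tdx_guest())
			/* One direct call to the host; steps 3 and 4 vanish. */
			return tdg_mmio_read((unsigned long)addr, 4);

		asm volatile("movl %1, %0" : "=r" (val)
			     : "m" (*(volatile u32 __iomem *)addr));
		return val;
	}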

2021-04-01 21:12:12

by Dave Hansen

Subject: Re: [RFC v1 03/26] x86/cpufeatures: Add is_tdx_guest() interface

On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> +bool is_tdx_guest(void)
> +{
> +	return static_cpu_has(X86_FEATURE_TDX_GUEST);
> +}

Why do you need is_tdx_guest() as opposed to calling
cpu_feature_enabled(X86_FEATURE_TDX_GUEST) everywhere?

Subject: Re: [RFC v1 03/26] x86/cpufeatures: Add is_tdx_guest() interface



On 4/1/21 2:08 PM, Dave Hansen wrote:
> On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
>> +bool is_tdx_guest(void)
>> +{
>> +	return static_cpu_has(X86_FEATURE_TDX_GUEST);
>> +}
>
> Why do you need is_tdx_guest() as opposed to calling
> cpu_feature_enabled(X86_FEATURE_TDX_GUEST) everywhere?

is_tdx_guest() is also implemented/used in compressed
code (which uses native_cpuid calls). I don't think
we can use cpu_feature_enabled(X86_FEATURE_TDX_GUEST) in
compressed code, right? Also, is_tdx_guest() looks easy
to read and use.
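
For reference, the compressed-boot variant boils down to a raw CPUID
check, roughly (a simplified sketch):

	/* Sketch: TDX detection without the cpufeature machinery. */
	bool is_tdx_guest(void)
	{
		u32 eax = TDX_CPUID_LEAF_ID;
		u32 sig[3] = { 0 };

		/* EBX/EDX/ECX return the "IntelTDX    " vendor string. */
		native_cpuid(&eax, &sig[0], &sig[2], &sig[1]);

		return !memcmp("IntelTDX    ", sig, 12);
	}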

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

2021-04-01 21:20:53

by Dave Hansen

Subject: Re: [RFC v1 03/26] x86/cpufeatures: Add is_tdx_guest() interface

On 4/1/21 2:15 PM, Kuppuswamy, Sathyanarayanan wrote:
> On 4/1/21 2:08 PM, Dave Hansen wrote:
>> On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
>>> +bool is_tdx_guest(void)
>>> +{
>>> +    return static_cpu_has(X86_FEATURE_TDX_GUEST);
>>> +}
>>
>> Why do you need is_tdx_guest() as opposed to calling
>> cpu_feature_enabled(X86_FEATURE_TDX_GUEST) everywhere?
>
> is_tdx_guest() is also implemented/used in compressed
> code (which uses native_cpuid calls). I don't think
> we can use cpu_feature_enabled(X86_FEATURE_TDX_GUEST) in
> compressed code, right? Also, is_tdx_guest() looks easy
> to read and use.

OK, but how many of the is_tdx_guest() uses are in the compressed code?
Why has its use spread beyond that?

Subject: Re: [RFC v1 03/26] x86/cpufeatures: Add is_tdx_guest() interface



On 4/1/21 2:19 PM, Dave Hansen wrote:
> On 4/1/21 2:15 PM, Kuppuswamy, Sathyanarayanan wrote:
>> On 4/1/21 2:08 PM, Dave Hansen wrote:
>>> On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
>>>> +bool is_tdx_guest(void)
>>>> +{
>>>> +    return static_cpu_has(X86_FEATURE_TDX_GUEST);
>>>> +}
>>>
>>> Why do you need is_tdx_guest() as opposed to calling
>>> cpu_feature_enabled(X86_FEATURE_TDX_GUEST) everywhere?
>>
>> is_tdx_guest() is also implemented/used in compressed
>> code (which uses native_cpuid calls). I don't think
>> we can use cpu_feature_enabled(X86_FEATURE_TDX_GUEST) in
>> compressed code, right? Also, is_tdx_guest() looks easy
>> to read and use.
>
> OK, but how many of the is_tdx_guest() uses are in the compressed code?
> Why has its use spread beyond that?
It's only used for handling in/out instructions in compressed code. But this
code is shared with the in/out handling in non-compressed code.

#define __out(bwl, bw)						\
do {								\
	if (is_tdx_guest()) {					\
		asm volatile("call tdg_out" #bwl : :		\
			     "a"(value), "d"(port));		\
	} else {						\
		asm volatile("out" #bwl " %" #bw "0, %w1" : :	\
			     "a"(value), "Nd"(port));		\
	}							\
} while (0)

#define __in(bwl, bw)						\
do {								\
	if (is_tdx_guest()) {					\
		asm volatile("call tdg_in" #bwl :		\
			     "=a"(value) : "d"(port));		\
	} else {						\
		asm volatile("in" #bwl " %w1, %" #bw "0" :	\
			     "=a"(value) : "Nd"(port));		\
	}							\
} while (0)


--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

2021-04-01 22:29:56

by Sean Christopherson

Subject: Re: [RFC v1 12/26] x86/tdx: Handle in-kernel MMIO

On Thu, Apr 01, 2021, Dave Hansen wrote:
> On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> > From: "Kirill A. Shutemov" <[email protected]>
> >
> > Handle #VE due to MMIO operations. MMIO triggers #VE with EPT_VIOLATION
> > exit reason.
> >
> > For now we only handle the subset of instructions that the kernel uses
> > for MMIO operations. User-space access triggers SIGBUS.
> ..
> > +	case EXIT_REASON_EPT_VIOLATION:
> > +		ve->instr_len = tdx_handle_mmio(regs, ve);
> > +		break;
>
> Is MMIO literally the only thing that can cause an EPT violation for TDX
> guests?

Any EPT Violation, or specifically EPT Violation #VE? Any memory access can
cause an EPT violation, but the VMM will get the ones that lead to VM-Exit. The
guest will only get the ones that cause #VE.

Assuming you're asking about #VE... No, any shared memory access can take a #VE
since the VMM controls the shared EPT tables and can clear the SUPPRESS_VE bit
at any time. But, if the VMM is friendly, #VE should be limited to MMIO.

There's also the unaccepted private memory case, but if Linux gets an option to
opt out of that, then #VE is limited to shared memory.

> Forget userspace for a minute. #VE's from userspace are annoying, but
> fine. We can't control what userspace does. If an action it takes
> causes a #VE in the TDX architecture, tough cookies, the kernel must
> handle it and try to recover or kill the app.
>
> The kernel is very different. We know in advance (must know,
> actually...) which instructions might cause exceptions of any kind.
> That's why we have exception tables and copy_to/from_user(). That's why
> we can handle kernel page faults on userspace, but not inside spinlocks.
>
> Binary-dependent OSes are also very different. It's going to be natural
> for them to want to take existing, signed drivers and use them in TDX
> guests. They might want to do something like this.
>
> But for an OS where we have source for the *ENTIRE* thing, and where we
> have a chokepoint for MMIO accesses (arch/x86/include/asm/io.h), it
> seems like an *AWFUL* idea to:
> 1. Have the kernel set up special mappings for I/O memory
> 2. Kernel generates special instructions to access that memory
> 3. Kernel faults on that memory
> 4. Kernel cracks its own special instructions to see what they were
> doing
> 5. Kernel calls up to host to do the MMIO
>
> Instead of doing 2/3/4, why not just have #2 call up to the host
> directly? This patch seems a very slow, roundabout way to do
> paravirtualized MMIO.
>
> BTW, there's already some SEV special-casing in io.h.

I implemented #2 a while back for build_mmio_{read,write}(); I'm guessing the
code is floating around somewhere. The gotcha is that there are nasty little
pieces of the kernel that don't use the helpers provided by io.h, e.g. the I/O
APIC code likes to access MMIO via a struct overlay, so the compiler is free to
use any instruction that satisfies the constraint.
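
For illustration, the struct-overlay pattern looks roughly like this
(hypothetical device; simplified):

	struct hyp_dev_regs {
		u32 index;
		u32 data;
	};

	static u32 hyp_dev_read(volatile struct hyp_dev_regs __iomem *regs,
				u32 reg)
	{
		writel(reg, &regs->index);	/* io.h helper: known instruction */
		/*
		 * Raw dereference: the compiler may emit any instruction
		 * that satisfies the access, so a #VE handler that cracks
		 * instructions has to decode whatever it chose.
		 */
		return regs->data;
	}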

The I/O APIC can and should be forced off, but dollars to donuts says there are
more special snowflakes lying in wait. If the kernel uses an allowlist for
drivers, then in theory it should be possible to hunt down all offenders. But
I think we'll want fallback logic to handle kernel MMIO #VEs, especially if the
kernel needs ISA cracking logic for userspace. Without fallback logic, any MMIO
#VE from the kernel would be fatal, which is too harsh IMO since the behavior
isn't so obviously wrong, e.g. versus the split lock #AC purge where there's no
legitimate reason for the kernel to generate a split lock.

2021-04-01 22:55:00

by Dave Hansen

Subject: Re: [RFC v1 12/26] x86/tdx: Handle in-kernel MMIO

On 4/1/21 3:26 PM, Sean Christopherson wrote:
> On Thu, Apr 01, 2021, Dave Hansen wrote:
>> On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
>>> From: "Kirill A. Shutemov" <[email protected]>
>>>
>>> Handle #VE due to MMIO operations. MMIO triggers #VE with EPT_VIOLATION
>>> exit reason.
>>>
>>> For now we only handle the subset of instructions that the kernel uses
>>> for MMIO operations. User-space access triggers SIGBUS.
>> ..
>>> +	case EXIT_REASON_EPT_VIOLATION:
>>> +		ve->instr_len = tdx_handle_mmio(regs, ve);
>>> +		break;
>>
>> Is MMIO literally the only thing that can cause an EPT violation for TDX
>> guests?
>
> Any EPT Violation, or specifically EPT Violation #VE? Any memory access can
> cause an EPT violation, but the VMM will get the ones that lead to VM-Exit. The
> guest will only get the ones that cause #VE.

I'll rephrase: Is MMIO literally the only thing that can cause us to get
into the EXIT_REASON_EPT_VIOLATION case of the switch() here?

> Assuming you're asking about #VE... No, any shared memory access can take a #VE
> since the VMM controls the shared EPT tables and can clear the SUPPRESS_VE bit
> at any time. But, if the VMM is friendly, #VE should be limited to MMIO.

OK, but what are we doing in the case of unfriendly VMMs? What does
*this* code do as-is, and where do we want to take it?

From the _looks_ of this patch, tdx_handle_mmio() is the be-all-end-all
solution to all EXIT_REASON_EPT_VIOLATION events.

>> But for an OS where we have source for the *ENTIRE* thing, and where we
>> have a chokepoint for MMIO accesses (arch/x86/include/asm/io.h), it
>> seems like an *AWFUL* idea to:
>> 1. Have the kernel set up special mappings for I/O memory
>> 2. Kernel generates special instructions to access that memory
>> 3. Kernel faults on that memory
>> 4. Kernel cracks its own special instructions to see what they were
>> doing
>> 5. Kernel calls up to host to do the MMIO
>>
>> Instead of doing 2/3/4, why not just have #2 call up to the host
>> directly? This patch seems a very slow, roundabout way to do
>> paravirtualized MMIO.
>>
>> BTW, there's already some SEV special-casing in io.h.
>
> I implemented #2 a while back for build_mmio_{read,write}(), I'm guessing the
> code is floating around somewhere. The gotcha is that there are nasty little
> pieces of the kernel that don't use the helpers provided by io.h, e.g. the I/O
> APIC code likes to access MMIO via a struct overlay, so the compiler is free to
> use any instruction that satisfies the constraint.

So, there aren't an infinite number of these. It's also 100% possible
to add some tooling to the kernel today to help you find these. You
could also have added tooling to KVM hosts to help find these.

Folks are *also* saying that we'll need a driver audit just to trust
that drivers aren't vulnerable to attacks from devices or from the host.
This can quite easily be a part of that effort.

> The I/O APIC can and should be forced off, but dollars to donuts says there are
> more special snowflakes lying in wait. If the kernel uses an allowlist for
> drivers, then in theory it should be possible to hunt down all offenders. But
> I think we'll want fallback logic to handle kernel MMIO #VEs, especially if the
> kernel needs ISA cracking logic for userspace. Without fallback logic, any MMIO
> #VE from the kernel would be fatal, which is too harsh IMO since the behavior
> isn't so obviously wrong, e.g. versus the split lock #AC purge where there's no
> legitimate reason for the kernel to generate a split lock.

I'll buy that this patch is convenient for *debugging*. It helped folks
bootstrap the TDX support and get it going.

IMNSHO, if a driver causes a #VE, it's a bug. Just like if it goes off
the rails and touches bad memory and #GP's or #PF's.

Are there any printk's in the #VE handler? Guess what those do. Print
to the console. Guess what consoles do. MMIO. You can't get away from
doing audits of the console drivers. Sure, you can go make #VE special,
like NMIs, but that's not going to be fun. At least the guest doesn't
have to deal with the fatality of a nested #VE, but it's still fatal.

I just don't like us pretending that we're Windows and have no control
over the code we run and throwing up our hands.

2021-04-02 00:04:49

by Dave Hansen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

On 2/5/21 3:38 PM, Kuppuswamy Sathyanarayanan wrote:
> Intel's Trust Domain Extensions (TDX) protect guest VMs from malicious
> hosts and some physical attacks. This series adds the bare-minimum
> support to run a TDX guest. The host-side support will be submitted
> separately, as will support for advanced TD guest features like
> attestation and debug mode. Also, at this point it is not secure, with
> some known holes in drivers, and it hasn't been fully audited and
> fuzzed yet.

I want to hear a lot more about this driver model.

I've heard things like "we need to harden the drivers" or "we need to do
audits" and that drivers might be "whitelisted".

What are we talking about specifically? Which drivers? How many
approximately? Just virtio? Are there any "real" hardware drivers
involved like how QEMU emulates an e1000 or rtl8139 device? What about
the APIC or HPET?

How broadly across the kernel is this going to go?

Without something concrete, it's really hard to figure out if we should
go full-blown paravirtualized MMIO, or do something like the #VE
trapping that's in this series currently.

2021-04-02 02:52:18

by Andi Kleen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

> I've heard things like "we need to harden the drivers" or "we need to do
> audits" and that drivers might be "whitelisted".

The basic driver allow listing patches are already in the repository,
but not currently posted or complete:

https://github.com/intel/tdx/commits/guest

>
> What are we talking about specifically? Which drivers? How many
> approximately? Just virtio?

Right now just virtio, later other drivers that hypervisors need.

> Are there any "real" hardware drivers
> involved like how QEMU emulates an e1000 or rtl8139 device?

Not currently (but some later hypervisor might rely on one of those)

> What about
> the APIC or HPET?

No IO-APIC, but the local APIC. No HPET.

>
> How broadly across the kernel is this going to go?

Not very broadly for drivers.

>
> Without something concrete, it's really hard to figure out if we should
> go full-blown paravirtualized MMIO, or do something like the #VE
> trapping that's in this series currently.

As Sean says, the concern about MMIO is less about drivers (which should
be generally ok if they work on other architectures that require MMIO
magic), and more about other odd code that only ran on x86 before.

I really don't understand your crusade against #VE. It really
isn't that bad if we can avoid the few corner cases.

For me it would seem wrong to force all MMIO for all drivers to some
complicated paravirt construct, blowing up code side everywhere
and adding complicated self modifying code, when it's only needed for very
few drivers. But we also don't want to patch every MMIO to be special cased
even those few drivers.

#VE-based MMIO avoids all that cleanly while being nicely non-intrusive.

-Andi

2021-04-02 15:31:14

by Dave Hansen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

On 4/1/21 7:48 PM, Andi Kleen wrote:
>> I've heard things like "we need to harden the drivers" or "we need to do
>> audits" and that drivers might be "whitelisted".
>
> The basic driver allow listing patches are already in the repository,
> but not currently posted or complete:
>
> https://github.com/intel/tdx/commits/guest

That lists exactly 8 ids:

> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1000 }, /* Virtio NET */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1001 }, /* Virtio block */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1003 }, /* Virtio console */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1009 }, /* Virtio FS */
>
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041 }, /* Virtio 1.0 NET */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042 }, /* Virtio 1.0 block */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1043 }, /* Virtio 1.0 console */
> { PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1049 }, /* Virtio 1.0 FS */

How many places do those 8 drivers touch MMIO?

>> Are there any "real" hardware drivers
>> involved like how QEMU emulates an e1000 or rtl8139 device?
>
> Not currently (but some later hypervisor might rely on one of those)
>
>> What about
>> the APIC or HPET?
>
> No IO-APIC, but the local APIC. No HPET.

Sean seemed worried about other x86-specific oddities. Are there any
more, or is the local APIC the only non-driver MMIO?

>> Without something concrete, it's really hard to figure out if we should
>> go full-blown paravirtualized MMIO, or do something like the #VE
>> trapping that's in this series currently.
>
> As Sean says, the concern about MMIO is less about drivers (which should
> be generally ok if they work on other architectures that require MMIO
> magic), and more about other odd code that only ran on x86 before.
>
> I really don't understand your crusade against #VE. It really
> isn't that bad if we can avoid the few corner cases.

The problem isn't with #VE per se. It's with posting a series that
masquerades as a full solution while *NOT* covering or even enumerating
the corner cases. That's exactly what happened with #VE to start with:
it was implemented in a way that exposed the kernel to #VE during the
syscall gap (and the SWAPGS gap for that matter).

So, I'm pushing for a design that won't have corner cases. If MMIO
itself is disallowed, then we can scream about *any* detected MMIO.
Then, there's no worry about #VE nesting. No #VE, no #VE nesting. We
don't even have to consider if #VE needs NMI-like semantics.

> For me it would seem wrong to force all MMIO for all drivers to some
> complicated paravirt construct, blowing up code side everywhere
> and adding complicated self modifying code, when it's only needed for very
> few drivers. But we also don't want to patch every MMIO to be special cased
> even those few drivers.
>
> #VE based MMIO avoids all that cleanly while being nicely non intrusive.

But, we're not selling used cars here. Using #VE has downsides.
Let's not pretend that it doesn't.

If we go this route, what are the rules and restrictions? Do we have to
say "no MMIO in #VE"?

I'm really the most worried about the console. Consoles and NMIs have
been a nightmare, IIRC. Doesn't this just make it *WORSE* because now
the deepest reaches of the console driver are guaranteed to #VE?

Which brings up another related point: How do you debug TD guests? Does
earlyprintk work?

2021-04-02 21:33:27

by Andi Kleen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

> If we go this route, what are the rules and restrictions? Do we have to
> say "no MMIO in #VE"?

All we have to say is "No MMIO in #VE before getting the TDVEINFO arguments".
After that it can nest without problems.

If you nest before that, the TDX module will cause a triple fault.

The code that cannot do it is a few lines in the early handler which
runs with interrupts off.

The TDX module also makes sure to not inject NMIs while we're in
that region, so NMIs are of no concern.

That was the whole point of avoiding the system call gap problem. We don't
need to make it IST, so it can nest.

I'm not aware of any other special rules.
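
In code terms the rule amounts to roughly this at the top of the #VE
handler (a sketch; function and type names are stand-ins for the
actual TDCALL wrappers):

	DEFINE_IDTENTRY(exc_virtualization_exception)
	{
		struct ve_info ve;

		/*
		 * No MMIO (or anything else that can #VE) is allowed
		 * here: a nested #VE before the TDGETVEINFO call below
		 * would clobber the saved exception info, and the TDX
		 * module escalates that to a triple fault.
		 */
		tdg_get_ve_info(&ve);

		/* From this point on, another #VE can nest safely. */
		tdg_handle_ve(regs, &ve);
	}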

> Which brings up another related point: How do you debug TD guests? Does
> earlyprintk work?

Today it works actually because serial ports are allowed. But I expect it to
be closed eventually because serial code is a lot of code to audit.
But you can always disable the filtering with a command line option and
then it will always work for debugging.

-Andi

2021-04-03 16:32:13

by Dave Hansen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

On 4/2/21 2:32 PM, Andi Kleen wrote:
>> If we go this route, what are the rules and restrictions? Do we have to
>> say "no MMIO in #VE"?
>
> > All we have to say is "No MMIO in #VE before getting the TDVEINFO arguments".
> After that it can nest without problems.

Well, not exactly. You still can't do things that could cause an
unbounded recursive #VE.

It doesn't seem *that* far fetched to think that someone might try to
defer some work or dump data to the console.

> If you nest before that, the TDX module will cause a triple fault.
>
> The code that cannot do it is a few lines in the early handler which
> runs with interrupts off.

>> Which brings up another related point: How do you debug TD guests? Does
>> earlyprintk work?
>
> Today it works actually because serial ports are allowed. But I expect it to
> be closed eventually because serial code is a lot of code to audit.
> But you can always disable the filtering with a command line option and
> then it will always work for debugging.

Do we need a TDX-specific earlyprintk? I would imagine it's pretty easy
to implement.

2021-04-03 17:29:25

by Andi Kleen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

On Sat, Apr 03, 2021 at 09:26:24AM -0700, Dave Hansen wrote:
> On 4/2/21 2:32 PM, Andi Kleen wrote:
> >> If we go this route, what are the rules and restrictions? Do we have to
> >> say "no MMIO in #VE"?
> >
> > All we have to say is "No MMIO in #VE before getting the TDVEINFO arguments".
> > After that it can nest without problems.
>
> Well, not exactly. You still can't do things that could cause an
> unbounded recursive #VE.

> It doesn't seem *that* far fetched to think that someone might try to
> defer some work or dump data to the console.

I believe the main console code has reentry protection.

I'm not sure about early_printk (with keep), but if that's the case
it probably should be fixed anyway. I can take a look at that.

Not sure why deferring something would cause another #VE?


> > If you nest before that, the TDX module will cause a triple fault.
> >
> > The code that cannot do it is a few lines in the early handler which
> > runs with interrupts off.
>
> >> Which brings up another related point: How do you debug TD guests? Does
> >> earlyprintk work?
> >
> > Today it works actually because serial ports are allowed. But I expect it to
> > be closed eventually because serial code is a lot of code to audit.
> > But you can always disable the filtering with a command line option and
> > then it will always work for debugging.
>
> Do we need a TDX-specific earlyprintk? I would imagine it's pretty easy
> to implement.

Don't see a need at this point, the existing mechanisms work.

Maybe if we ever have a problem that only happens in lockdown *and* happens
early, but that's not very likely since lockdown primarily changes code
behavior later.

There are also other debug mechanisms for such cases: if you configure
the TD for debug mode, TDX supports using the gdb stub on the hypervisor.

-Andi

2021-04-04 15:04:17

by Dave Hansen

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

It occurred to me that I've been doing a lot of digging in the TDX spec
lately. I think we can all agree that the "Architecture Specification"
is not the world's easiest, most digestible reading. It's hard to
figure out how Linux relates to the spec.

One bit of Documentation we need for TDX is a description of the memory
states. For instance, it would be nice to spell out the different
classes of memory, how they are selected, who selects them, and who
enforces the selection. What faults are generated on each type and who
can induce those?

For instance:

TD-Private memory is selected by the Shared/Private bit in Present=1
guest PTEs. When the hardware page walker sees that bit, it walks the
secure EPT. The secure EPT entries can only be written by the TDX
module, although they are written at the request of the VMM. The TDX
module enforces rules like ensuring that the memory mapped by secure EPT
is not mapped multiple times. The VMM can remove entries. From the
guest perspective, all private memory accesses are either successful, or
result in a #VE. Private memory access does not cause VMExits.

Would that be useful to folks?
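
As a code-level companion to that paragraph, the guest-side selection
is just a bit in the page-table protections; a sketch, assuming a
tdg_shared_mask() helper for the platform's Shared bit (hypothetical
name):

	/* Sketch: mark a guest mapping as shared with the VMM. */
	static pgprot_t pgprot_tdg_shared(pgprot_t prot)
	{
		return __pgprot(pgprot_val(prot) | tdg_shared_mask());
	}

	/* Private is the default: Present=1 with the Shared bit clear. */
	static pgprot_t pgprot_tdg_private(pgprot_t prot)
	{
		return __pgprot(pgprot_val(prot) & ~tdg_shared_mask());
	}

An actual conversion also has to tell the VMM about it, which is what
the MapGPA TDVMCALL helper in patch 15 of the series is for.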

2021-04-12 17:29:06

by Dan Williams

Subject: Re: [RFC v1 00/26] Add TDX Guest Support

On Sun, Apr 4, 2021 at 8:02 AM Dave Hansen <[email protected]> wrote:
>
> It occurred to me that I've been doing a lot of digging in the TDX spec
> lately. I think we can all agree that the "Architecture Specification"
> is not the world's easiest, most digestible reading. It's hard to
> figure out how Linux relates to the spec.
>
> One bit of Documentation we need for TDX is a description of the memory
> states. For instance, it would be nice to spell out the different
> classes of memory, how they are selected, who selects them, and who
> enforces the selection. What faults are generated on each type and who
> can induce those?
>
> For instance:
>
> TD-Private memory is selected by the Shared/Private bit in Present=1
> guest PTEs. When the hardware page walker sees that bit, it walks the
> secure EPT. The secure EPT entries can only be written by the TDX
> module, although they are written at the request of the VMM. The TDX
> module enforces rules like ensuring that the memory mapped by secure EPT
> is not mapped multiple times. The VMM can remove entries. From the
> guest perspective, all private memory accesses are either successful, or
> result in a #VE. Private memory access does not cause VMExits.
>
> Would that be useful to folks?

That paragraph was useful for me as someone coming in cold to TDX
patch review. +1 for more of that style of commentary.