2022-01-24 19:26:38

by Kirill A. Shutemov

Subject: [PATCHv2 18/29] x86/boot: Avoid #VE during boot for TDX platforms

From: Sean Christopherson <[email protected]>

There are a few MSRs and control register bits that the kernel
normally needs to modify during boot. But, TDX disallows
modification of these registers to help provide consistent security
guarantees. Fortunately, TDX ensures that these are all in the correct
state before the kernel loads, which means the kernel does not need to
modify them.

The conditions to avoid are:

* Any writes to the EFER MSR
* Clearing CR0.NE
* Clearing CR4.MCE

This theoretically makes the guest boot more fragile. If, for instance,
EFER was set up incorrectly and a WRMSR was performed, it would trigger
an early exception panic or, if early exceptions are not yet set up, a
triple fault. However, such a mistake would likely trip up the guest
BIOS long before control reaches the kernel. In any case, these kinds
of problems are unlikely to occur in production environments, and
developers have good debug tools to fix them quickly.

Change the common boot code to work on TDX and non-TDX systems.
This should have no functional effect on non-TDX systems.
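
The EFER changes all follow the same pattern: read the current value and
skip the WRMSR when nothing would change. A minimal C-level sketch of that
pattern (illustrative only -- the real changes below are early assembly,
where helpers like rdmsrl()/wrmsrl() are not available, and the helper
name here is made up):

	/*
	 * Sketch only: set a bit in EFER while skipping the WRMSR when the
	 * bit is already set, so a TDX guest never takes an avoidable #VE.
	 * Needs <asm/msr.h> and <asm/msr-index.h>.
	 */
	static void __init efer_set_bit_if_needed(u64 bit)
	{
		u64 efer;

		rdmsrl(MSR_EFER, efer);
		if (efer & bit)
			return;			/* nothing to do, no WRMSR */
		wrmsrl(MSR_EFER, efer | bit);
	}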

Signed-off-by: Sean Christopherson <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Signed-off-by: Kuppuswamy Sathyanarayanan <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/Kconfig | 1 +
arch/x86/boot/compressed/head_64.S | 25 +++++++++++++++++++++----
arch/x86/boot/compressed/pgtable.h | 2 +-
arch/x86/kernel/head_64.S | 24 ++++++++++++++++++++++--
arch/x86/realmode/rm/trampoline_64.S | 27 +++++++++++++++++++++++----
5 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1491f25c844e..1c59e02792e4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -885,6 +885,7 @@ config INTEL_TDX_GUEST
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
select ARCH_HAS_CC_PLATFORM
+ select X86_MCE
help
Support running as a guest under Intel TDX. Without this support,
the guest kernel can not boot or run under TDX.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fd9441f40457..b576d23d37cb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -643,12 +643,25 @@ SYM_CODE_START(trampoline_32bit_src)
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
wrmsr
- popl %edx
+1: popl %edx
popl %ecx

/* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
+ movl %cr4, %eax
+
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support. Clearing
+ * MCE may fault in some environments (that also force #MC support).
+ * Any machine check that occurs before #MC support is fully configured
+ * will crash the system regardless of the CR4.MCE value set here.
+ */
+ andl $X86_CR4_MCE, %eax
+#endif
+ orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
@@ -662,8 +675,12 @@ SYM_CODE_START(trampoline_32bit_src)
pushl $__KERNEL_CS
pushl %eax

- /* Enable paging again */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ /*
+ * Enable paging again. Keep CR0.NE set, FERR# is no longer used
+ * to handle x87 FPU errors and clearing NE may fault in some
+ * environments.
+ */
+ movl $(X86_CR0_PG | X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0

lret
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 6ff7e81b5628..cc9b2529a086 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0

#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x80

#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9c63fc5988cd..652845cc527e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -141,7 +141,17 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
1:

/* Enable PAE mode, PGE and LA57 */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %cr4, %rcx
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support. Clearing
+ * MCE may fault in some environments (that also force #MC support).
+ * Any machine check that occurs before #MC support is fully configured
+ * will crash the system regardless of the CR4.MCE value set here.
+ */
+ andl $X86_CR4_MCE, %ecx
+#endif
+ orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
@@ -246,13 +256,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
+ */
+ movl %eax, %edx
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */

+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index ae112a91592f..170f248d5769 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -143,13 +143,28 @@ SYM_CODE_START(startup_32)
movl %eax, %cr3

# Set up EFER
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Skip writing to EFER if the register already has desired
+ * value (to avoid #VE for the TDX guest).
+ */
+ cmp pa_tr_efer, %eax
+ jne .Lwrite_efer
+ cmp pa_tr_efer + 4, %edx
+ je .Ldone_efer
+.Lwrite_efer:
movl pa_tr_efer, %eax
movl pa_tr_efer + 4, %edx
- movl $MSR_EFER, %ecx
wrmsr

- # Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+.Ldone_efer:
+ /*
+ * Enable paging and in turn activate Long Mode. Keep CR0.NE set, FERR#
+ * is no longer used to handle x87 FPU errors and clearing NE may fault
+ * in some environments.
+ */
+ movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0

/*
@@ -169,7 +184,11 @@ SYM_CODE_START(pa_trampoline_compat)
movl $rm_stack_end, %esp
movw $__KERNEL_DS, %dx

- movl $X86_CR0_PE, %eax
+ /*
+ * Keep CR0.NE set, FERR# is no longer used to handle x87 FPU errors
+ * and clearing NE may fault in some environments.
+ */
+ movl $(X86_CR0_NE | X86_CR0_PE), %eax
movl %eax, %cr0
ljmpl $__KERNEL32_CS, $pa_startup_32
SYM_CODE_END(pa_trampoline_compat)
--
2.34.1


2022-02-03 14:03:22

by Thomas Gleixner

Subject: Re: [PATCHv2 18/29] x86/boot: Avoid #VE during boot for TDX platforms

On Mon, Jan 24 2022 at 18:02, Kirill A. Shutemov wrote:
>
> Change the common boot code to work on TDX and non-TDX systems.
> This should have no functional effect on non-TDX systems.

Emphasis on should? :)

> --- a/arch/x86/boot/compressed/head_64.S
> +++ b/arch/x86/boot/compressed/head_64.S
> @@ -643,12 +643,25 @@ SYM_CODE_START(trampoline_32bit_src)
> movl $MSR_EFER, %ecx
> rdmsr
> btsl $_EFER_LME, %eax
> + /* Avoid writing EFER if no change was made (for TDX guest) */
> + jc 1f
> wrmsr
> - popl %edx
> +1: popl %edx
> popl %ecx
>
> /* Enable PAE and LA57 (if required) paging modes */

This comment should move after the #endif below and here it wants a
comment which explains why reading cr4 and the following code sequence
is correct. If you write up that comment then you'll figure out that it
is incorrect.

> - movl $X86_CR4_PAE, %eax
> + movl %cr4, %eax

Assume CR4 has X86_CR4_MCE set then how is the below correct when
CONFIG_X86_MCE=n? Not to talk about any other bits which might be set in
CR4 and are only cleared by the CONFIG_X86_MCE dependent 'andl'.

> +#ifdef CONFIG_X86_MCE
> + /*
> + * Preserve CR4.MCE if the kernel will enable #MC support. Clearing
> + * MCE may fault in some environments (that also force #MC support).
> + * Any machine check that occurs before #MC support is fully configured
> + * will crash the system regardless of the CR4.MCE value set here.
> + */
> + andl $X86_CR4_MCE, %eax
> +#endif

So this wants to be

#ifdef CONFIG_X86_MCE
movl %cr4, %eax
andl $X86_CR4_MCE, %eax
#else
movl $0, %eax
#endif

No?
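
Spelled out in C for the CONFIG_X86_MCE=n case (an illustrative sketch,
not actual kernel code; the helper names are made up):

	/* Quoted hunk (CONFIG_X86_MCE=n): %cr4 is read and never masked, so
	 * every bit the firmware happened to leave set leaks into the new
	 * value. */
	static unsigned long cr4_quoted(unsigned long cr4_now)
	{
		return cr4_now | X86_CR4_PAE;
	}

	/* Suggested variant: start from zero, so only the bits explicitly
	 * wanted (PAE here, plus MCE/LA57 where applicable) survive. */
	static unsigned long cr4_suggested(unsigned long cr4_now)
	{
		(void)cr4_now;		/* current value deliberately ignored */
		return X86_CR4_PAE;
	}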

> + orl $X86_CR4_PAE, %eax
> testl %edx, %edx
> jz 1f
> orl $X86_CR4_LA57, %eax
> @@ -662,8 +675,12 @@ SYM_CODE_START(trampoline_32bit_src)
> pushl $__KERNEL_CS
> pushl %eax
>
> - /* Enable paging again */
> - movl $(X86_CR0_PG | X86_CR0_PE), %eax
> + /*
> + * Enable paging again. Keep CR0.NE set, FERR# is no longer used
> + * to handle x87 FPU errors and clearing NE may fault in some
> + * environments.

FERR# is no longer used is really not informative here. The point is
that any x86 CPU which is supported by the kernel requires CR0_NE to be
set. This code was wrong from the very beginning because 64bit CPUs
never supported #FERR. The reason why it exists is Copy&Pasta without
brain applied and the sad fact that the hardware does not enforce it in
native mode for whatever reason. So this wants to be a separate patch
with a coherent comment and changelog.

> + */
> + movl $(X86_CR0_PG | X86_CR0_NE | X86_CR0_PE), %eax
> movl %eax, %cr0
>
> /* Enable PAE mode, PGE and LA57 */
> - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
> + movq %cr4, %rcx

See above...

> +#ifdef CONFIG_X86_MCE
> + /*
> + * Preserve CR4.MCE if the kernel will enable #MC support. Clearing
> + * MCE may fault in some environments (that also force #MC support).
> + * Any machine check that occurs before #MC support is fully configured
> + * will crash the system regardless of the CR4.MCE value set here.
> + */
> + andl $X86_CR4_MCE, %ecx
> +#endif
> + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
> #ifdef CONFIG_X86_5LEVEL
> testl $1, __pgtable_l5_enabled(%rip)
> jz 1f
> @@ -246,13 +256,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
> /* Setup EFER (Extended Feature Enable Register) */
> movl $MSR_EFER, %ecx
> rdmsr
> + /*
> + * Preserve current value of EFER for comparison and to skip
> + * EFER writes if no change was made (for TDX guest)
> + */
> + movl %eax, %edx
> btsl $_EFER_SCE, %eax /* Enable System Call */
> btl $20,%edi /* No Execute supported? */
> jnc 1f
> btsl $_EFER_NX, %eax
> btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
> -1: wrmsr /* Make changes effective */
>
> + /* Avoid writing EFER if no change was made (for TDX guest) */
> +1: cmpl %edx, %eax
> + je 1f
> + xor %edx, %edx
> + wrmsr /* Make changes effective */
> +1:
> /* Setup cr0 */
> movl $CR0_STATE, %eax
> /* Make changes effective */
> diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
> index ae112a91592f..170f248d5769 100644
> --- a/arch/x86/realmode/rm/trampoline_64.S
> +++ b/arch/x86/realmode/rm/trampoline_64.S
> @@ -143,13 +143,28 @@ SYM_CODE_START(startup_32)
> movl %eax, %cr3
>
> # Set up EFER
> + movl $MSR_EFER, %ecx
> + rdmsr
> + /*
> + * Skip writing to EFER if the register already has desired
> + * value (to avoid #VE for the TDX guest).
> + */
> + cmp pa_tr_efer, %eax
> + jne .Lwrite_efer
> + cmp pa_tr_efer + 4, %edx
> + je .Ldone_efer
> +.Lwrite_efer:
> movl pa_tr_efer, %eax
> movl pa_tr_efer + 4, %edx
> - movl $MSR_EFER, %ecx
> wrmsr
>
> - # Enable paging and in turn activate Long Mode
> - movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
> +.Ldone_efer:
> + /*
> + * Enable paging and in turn activate Long Mode. Keep CR0.NE set, FERR#
> + * is no longer used to handle x87 FPU errors and clearing NE may fault
> + * in some environments.
> + */
> + movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_NE | X86_CR0_PE), %eax

See above.

> movl %eax, %cr0
>
> /*
> @@ -169,7 +184,11 @@ SYM_CODE_START(pa_trampoline_compat)
> movl $rm_stack_end, %esp
> movw $__KERNEL_DS, %dx
>
> - movl $X86_CR0_PE, %eax
> + /*
> + * Keep CR0.NE set, FERR# is no longer used to handle x87 FPU errors
> + * and clearing NE may fault in some environments.
> + */
> + movl $(X86_CR0_NE | X86_CR0_PE), %eax

Ditto.

> movl %eax, %cr0
> ljmpl $__KERNEL32_CS, $pa_startup_32
> SYM_CODE_END(pa_trampoline_compat)

Thanks,

tglx

2022-02-13 15:35:25

by Kirill A. Shutemov

Subject: Re: [PATCHv2 18/29] x86/boot: Avoid #VE during boot for TDX platforms

On Wed, Feb 02, 2022 at 01:04:34AM +0100, Thomas Gleixner wrote:
> > + orl $X86_CR4_PAE, %eax
> > testl %edx, %edx
> > jz 1f
> > orl $X86_CR4_LA57, %eax
> > @@ -662,8 +675,12 @@ SYM_CODE_START(trampoline_32bit_src)
> > pushl $__KERNEL_CS
> > pushl %eax
> >
> > - /* Enable paging again */
> > - movl $(X86_CR0_PG | X86_CR0_PE), %eax
> > + /*
> > + * Enable paging again. Keep CR0.NE set, FERR# is no longer used
> > + * to handle x87 FPU errors and clearing NE may fault in some
> > + * environments.
>
> FERR# is no longer used is really not informative here. The point is
> that any x86 CPU which is supported by the kernel requires CR0_NE to be
> set. This code was wrong from the very beginning because 64bit CPUs
> never supported #FERR. The reason why it exists is Copy&Pasta without
> brain applied and the sad fact that the hardware does not enforce it in
> native mode for whatever reason. So this wants to be a separate patch
> with a coherent comment and changelog.

What about the patch below?

Instead of adding CR0.NE there, I used CR0_STATE or kept the existing
value, only modifying the required bit.
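
For reference, CR0_STATE comes from arch/x86/include/asm/processor-flags.h
and already includes NE (quoted from memory, the header is authoritative):

	#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
				 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
				 X86_CR0_PG)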

I'm not familiar with floating-point exception handling. I tried to read
up on it in an attempt to write a coherent commit message. Please correct
me if I got something wrong.

---------------------------------8<----------------------------------------

From: "Kirill A. Shutemov" <[email protected]>
Date: Fri, 11 Feb 2022 14:25:10 +0300
Subject: [PATCH] x86/boot: Set CR0.NE early and keep it set during the boot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TDX guest requires CR0.NE to be set. Clearing the bit triggers #GP(0).

If CR0.NE is 0, the MS-DOS compatibility mode for handling floating-point
exceptions is selected. In this mode, the software exception handler for
floating-point exceptions is invoked externally using the processor’s
FERR#, INTR, and IGNNE# pins.

Using FERR# and IGNNE# to handle floating-point exceptions is deprecated.
CR0.NE=0 also limits newer processors to operating with a single logical
processor active.

The kernel uses the CR0_STATE constant to initialize CR0. It has the NE
bit set. But during early boot the kernel takes a more ad-hoc approach to
setting bits in the register.

Make CR0 initialization consistent, deriving the initial value from
CR0_STATE.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/boot/compressed/head_64.S | 7 ++++---
arch/x86/realmode/rm/trampoline_64.S | 8 ++++----
2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fd9441f40457..d0c3d33f3542 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -289,7 +289,7 @@ SYM_FUNC_START(startup_32)
pushl %eax

/* Enter paged protected Mode, activating Long Mode */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl $CR0_STATE, %eax
movl %eax, %cr0

/* Jump from 32bit compatibility mode into 64bit mode. */
@@ -662,8 +662,9 @@ SYM_CODE_START(trampoline_32bit_src)
pushl $__KERNEL_CS
pushl %eax

- /* Enable paging again */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0

lret
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index ae112a91592f..d380f2d1fd23 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start)
movw $__KERNEL_DS, %dx # Data segment descriptor

# Enable protected mode
- movl $X86_CR0_PE, %eax # protected mode (PE) bit
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
movl %eax, %cr0 # into protected mode

# flush prefetch and jump to startup_32
@@ -148,8 +148,8 @@ SYM_CODE_START(startup_32)
movl $MSR_EFER, %ecx
wrmsr

- # Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+ # Enable paging and in turn activate Long Mode.
+ movl $CR0_STATE, %eax
movl %eax, %cr0

/*
@@ -169,7 +169,7 @@ SYM_CODE_START(pa_trampoline_compat)
movl $rm_stack_end, %esp
movw $__KERNEL_DS, %dx

- movl $X86_CR0_PE, %eax
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
movl %eax, %cr0
ljmpl $__KERNEL32_CS, $pa_startup_32
SYM_CODE_END(pa_trampoline_compat)
--
Kirill A. Shutemov