From: Ard Biesheuvel <[email protected]>
This series removes the last remaining absolute symbol references from
.head.text. Doing so is necessary because code in this section may be
called from a 1:1 mapping of memory, which deviates from the mapping
this code was linked and/or relocated to run at. This is not something
that the toolchains support: even PIC/PIE code is still assumed to
execute from the same mapping that it was relocated to run from by the
startup code or dynamic loader. This means we are basically on our own
here, and need to take measures to ensure that the code behaves as
expected when executed in this manner.
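
To illustrate the problem (foo is a hypothetical stand-in, not a symbol
in this series), a plain address-of expression such as

    extern int foo;
    unsigned long addr = (unsigned long)&foo;

may be lowered by the compiler either as

    movq  $foo, %rax          # absolute: link-time kernel VA

or as

    leaq  foo(%rip), %rax     # RIP-relative: address in the current mapping

and when running from the 1:1 mapping, only the latter yields an address
that is actually mapped at that point.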
Given that the startup code needs to create the kernel virtual mapping
in the page tables, early references to some kernel virtual addresses
are valid even if they cannot be dereferenced yet. To avoid having to
make this distinction at build time, patches #3 and #4 replace such
valid references with RIP-relative references to which an explicit
offset is applied.
Patches #1 and #2 remove some absolute references from .head.text that
don't need to be there in the first place.
Changes since v2:
- Rebase onto v6.10-rc2
- Tweak commit log of patch #3
Changes since v1/RFC:
- rename va_offset to p2v_offset
- take PA of _text in C code directly
Cc: Tom Lendacky <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Brian Gerst <[email protected]>
Ard Biesheuvel (4):
x86/sev: Avoid WARN()s in early boot code
x86/xen/pvh: Move startup code into .ref.text
x86/boot/64: Determine VA/PA offset before entering C code
x86/boot/64: Avoid intentional absolute symbol references in
.head.text
arch/x86/include/asm/setup.h | 2 +-
arch/x86/kernel/head64.c | 38 ++++++++++++--------
arch/x86/kernel/head_64.S | 9 ++++-
arch/x86/kernel/sev.c | 15 +++-----
arch/x86/platform/pvh/head.S | 2 +-
5 files changed, 38 insertions(+), 28 deletions(-)
--
2.45.1.288.g0e0cd299f1-goog
From: Ard Biesheuvel <[email protected]>
Using WARN() before the kernel is even mapped is unlikely to do anything
useful: the string literals are passed using their kernel virtual
addresses, which are not even mapped yet. But even if they were, calling
into the printk machinery from the early 1:1 mapped code is not going to
get very far.
So drop the WARN()s entirely.
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/x86/kernel/sev.c | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 3342ed58e168..33a669e85e5b 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -720,7 +720,7 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
if (op == SNP_PAGE_STATE_SHARED) {
/* Page validation must be rescinded before changing to shared */
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
+ if (ret)
goto e_term;
}
@@ -733,21 +733,16 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
val = sev_es_rd_ghcb_msr();
- if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
- "Wrong PSC response code: 0x%x\n",
- (unsigned int)GHCB_RESP_CODE(val)))
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
goto e_term;
- if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
- "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
- op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
- paddr, GHCB_MSR_PSC_RESP_VAL(val)))
+ if (GHCB_MSR_PSC_RESP_VAL(val))
goto e_term;
if (op == SNP_PAGE_STATE_PRIVATE) {
/* Page validation must be performed after changing to private */
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
+ if (ret)
goto e_term;
}
@@ -780,7 +775,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
}
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
--
2.45.1.288.g0e0cd299f1-goog
From: Ard Biesheuvel <[email protected]>
The Xen PVH startup code does not need to live in .head.text, given that
its entry point is not at a fixed offset but is instead communicated to
the host/VMM via an ELF note.
So move it out of .head.text into another code section. To avoid
spurious warnings about references to .init code, move it into .ref.text
rather than .text. (Note that the ELF note itself is not .init, and so
moving this code into .init.text would result in warnings as well.)
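
For reference, both section markers live in include/linux/init.h,
roughly as

    #define __HEAD    .section  ".head.text","ax"
    #define __REF     .section  ".ref.text", "ax"

so this change only selects a different output section; modpost
deliberately exempts .ref.text from its section mismatch warnings.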
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/x86/platform/pvh/head.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index f7235ef87bc3..0cf6008e834b 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -20,7 +20,7 @@
#include <asm/nospec-branch.h>
#include <xen/interface/elfnote.h>
- __HEAD
+ __REF
/*
* Entry point for PVH guests.
--
2.45.1.288.g0e0cd299f1-goog
From: Ard Biesheuvel <[email protected]>
Implicit absolute symbol references (e.g., taking the address of a
global variable) must be avoided in the C code that runs from the early
1:1 mapping of the kernel, given that this practice violates
assumptions on the part of the toolchain: RIP-relative and absolute
references are expected to produce the same values, and so the compiler
is free to choose either. The code, however, currently assumes that
RIP-relative references are never emitted here.

So an explicit physical-to-virtual offset needs to be used to derive
the kernel virtual addresses of _text and _end, rather than simply
taking their addresses and assuming that the compiler will not choose
to emit a RIP-relative reference in this particular case.
Currently, phys_base is already used to perform such calculations, but
it is derived from the kernel virtual address of _text, which is taken
using an implicit absolute symbol reference. So instead, derive this
physical-to-virtual offset in asm code, using the kernel VA of common_startup_64
(which we already keep in a global variable for other reasons), and pass
it to the C startup code.
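
For the record, the arithmetic behind the load_delta change below:
writing PA(x) and VA(x) for the physical and kernel virtual addresses
of a symbol x, the offset passed in is

    p2v_offset = PA(common_startup_64) - VA(common_startup_64)

and since all kernel symbols share the same displacement, the old
expression

    load_delta = PA(_text) - (VA(_text) - __START_KERNEL_map)

reduces to

    load_delta = __START_KERNEL_map + p2v_offset

without needing the absolute (virtual) address of _text.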
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/x86/include/asm/setup.h | 2 +-
arch/x86/kernel/head64.c | 8 +++++---
arch/x86/kernel/head_64.S | 9 ++++++++-
3 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index e61e68d71cba..aca18be5a228 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
-extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
+extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a817ed0724d1..81696a4967e6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -138,12 +138,14 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF().
+ * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
+ * by subtracting p2v_offset from the RIP-relative address.
*/
-unsigned long __head __startup_64(unsigned long physaddr,
+unsigned long __head __startup_64(unsigned long p2v_offset,
struct boot_params *bp)
{
pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
+ unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
unsigned long pgtable_flags;
unsigned long load_delta;
pgdval_t *pgd;
@@ -163,7 +165,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
- load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+ load_delta = __START_KERNEL_map + p2v_offset;
RIP_REL_REF(phys_base) = load_delta;
/* Is the address not 2M aligned? */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 330922b328bf..d0e494607acc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -100,13 +100,20 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Sanitize CPU configuration */
call verify_cpu
+ /*
+ * Use the 1:1 physical and kernel virtual addresses of
+ * common_startup_64 to determine the physical-to-virtual offset, and
+ * pass it as the first argument to __startup_64().
+ */
+ leaq common_startup_64(%rip), %rdi
+ subq 0f(%rip), %rdi
+
/*
* Perform pagetable fixups. Additionally, if SME is active, encrypt
* the kernel and retrieve the modifier (SME encryption mask if SME
* is active) to be added to the initial pgdir entry that will be
* programmed into CR3.
*/
- leaq _text(%rip), %rdi
movq %r15, %rsi
call __startup_64
--
2.45.1.288.g0e0cd299f1-goog
From: Ard Biesheuvel <[email protected]>
The code in .head.text executes from a 1:1 mapping and cannot generally
refer to global variables using their kernel virtual addresses. However,
there are some occurrences of such references that are valid: the kernel
virtual addresses of _text and _end are needed to populate the page
tables correctly, and some other section markers are used in a similar
way.
To avoid having to make exceptions to the rule that .head.text must not
contain any absolute symbol references, derive these kernel virtual
addresses from the 1:1 mapped physical addresses, which can be
determined safely using RIP-relative references via RIP_REL_REF().
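
As a sketch of the resulting pattern (using the names from the hunks
below):

    /* physical address of _text, via a guaranteed RIP-relative access */
    unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);

    /* kernel virtual address, without an absolute symbol reference */
    unsigned long va_text = physaddr - p2v_offset;

RIP_REL_REF() is defined in arch/x86/include/asm/asm.h, roughly as

    #define RIP_REL_REF(var)  (*(typeof(&(var)))rip_rel_ptr(&(var)))

where rip_rel_ptr() forces a leaq <var>(%rip) instruction via inline
asm.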
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/x86/kernel/head64.c | 30 ++++++++++++--------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 81696a4967e6..c0f20962f9b1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,9 +91,11 @@ static inline bool check_la57_support(void)
return true;
}
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
{
- unsigned long vaddr, vaddr_end;
+ unsigned long paddr, paddr_end;
int i;
/* Encrypt the kernel and related (if SME is active) */
@@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* attribute.
*/
if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
+ paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
+ paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
/*
* On SNP, transition the page to shared in the RMP table so that
* it is consistent with the page table attribute change.
@@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* mapping (kernel .text). PVALIDATE, by way of
* early_snp_set_memory_shared(), requires a valid virtual
* address but the kernel is currently running off of the identity
- * mapping so use __pa() to get a *currently* valid virtual address.
+ * mapping so use the PA to get a *currently* valid virtual address.
*/
- early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
- i = pmd_index(vaddr);
+ i = pmd_index(paddr - p2v_offset);
pmd[i] -= sme_get_me_mask();
}
}
@@ -146,6 +148,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
{
pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
+ unsigned long va_text, va_end;
unsigned long pgtable_flags;
unsigned long load_delta;
pgdval_t *pgd;
@@ -172,6 +175,9 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
if (load_delta & ~PMD_MASK)
for (;;);
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
+
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
@@ -232,7 +238,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
- for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT);
pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
@@ -257,11 +263,11 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
/* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ for (i = 0; i < pmd_index(va_text); i++)
pmd[i] &= ~_PAGE_PRESENT;
/* fixup pages that are part of the kernel image */
- for (; i <= pmd_index((unsigned long)_end); i++)
+ for (; i <= pmd_index(va_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
@@ -269,7 +275,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
- return sme_postprocess_startup(bp, pmd);
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
}
/* Wipe all early page tables except for the kernel symbol map */
--
2.45.1.288.g0e0cd299f1-goog