Hi all,
This is v7 of the nVHE hypervisor stack enhancements. This version is based
on 5.18-rc1 and drops the hypervisor stack unwinding and overflow-stack
patches. These require further discussion and will be resent separately.
Previous versions can be found at:
v6: https://lore.kernel.org/r/[email protected]/
v5: https://lore.kernel.org/r/[email protected]/
v4: https://lore.kernel.org/r/[email protected]/
v3: https://lore.kernel.org/r/[email protected]/
v2: https://lore.kernel.org/r/[email protected]/
v1: https://lore.kernel.org/r/[email protected]/
Thanks,
Kalesh
-----
This series is based on 5.18-rc1 and adds stack guard pages to the nVHE and
pKVM hypervisors, as well as symbolization of hypervisor addresses.
The guard page stack overflow detection is based on the technique used by
arm64 VMAP_STACK, i.e. the stack is aligned such that the 'stack shift' bit
of any valid SP is 1. The 'stack shift' bit can be tested in the exception
entry to detect overflow without corrupting GPRs.
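For illustration, a minimal C sketch of the invariant (hyp_sp_overflowed()
is a hypothetical helper, not part of this series; BIT() is the usual
kernel macro):

  /*
   * The stack and guard page live in a 2 * PAGE_SIZE aligned private
   * VA range: the lower page is left unbacked as the guard page, the
   * higher page is the stack, so any valid SP has the PAGE_SHIFT bit
   * set.
   */
  static inline bool hyp_sp_overflowed(unsigned long sp)
  {
          /* An overflow walks SP down into the guard page, clearing the bit. */
          return !(sp & BIT(PAGE_SHIFT));
  }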
Kalesh Singh (6):
KVM: arm64: Introduce hyp_alloc_private_va_range()
KVM: arm64: Introduce pkvm_alloc_private_va_range()
KVM: arm64: Add guard pages for KVM nVHE hypervisor stack
KVM: arm64: Add guard pages for pKVM (protected nVHE) hypervisor stack
KVM: arm64: Detect and handle hypervisor stack overflows
KVM: arm64: Symbolize the nVHE HYP addresses
arch/arm64/include/asm/kvm_asm.h | 1 +
arch/arm64/include/asm/kvm_mmu.h | 4 ++
arch/arm64/kvm/arm.c | 39 ++++++++++++--
arch/arm64/kvm/handle_exit.c | 13 ++---
arch/arm64/kvm/hyp/include/nvhe/mm.h | 6 ++-
arch/arm64/kvm/hyp/nvhe/host.S | 24 +++++++++
arch/arm64/kvm/hyp/nvhe/hyp-main.c | 18 ++++++-
arch/arm64/kvm/hyp/nvhe/mm.c | 78 ++++++++++++++++++----------
arch/arm64/kvm/hyp/nvhe/setup.c | 31 +++++++++--
arch/arm64/kvm/hyp/nvhe/switch.c | 7 ++-
arch/arm64/kvm/mmu.c | 70 ++++++++++++++++---------
scripts/kallsyms.c | 2 +-
12 files changed, 223 insertions(+), 70 deletions(-)
base-commit: 3123109284176b1532874591f7c81f3837bbdc17
--
2.35.1.1178.g4f1659d476-goog
The hypervisor stacks (for both nVHE Hyp mode and nVHE protected mode)
are aligned such that any valid stack address has PAGE_SHIFT bit as 1.
This allows us to conveniently check for overflow in the exception entry
without corrupting any GPRs. We won't recover from a stack overflow, so
panic the hypervisor.
Signed-off-by: Kalesh Singh <[email protected]>
Tested-by: Fuad Tabba <[email protected]>
Reviewed-by: Fuad Tabba <[email protected]>
---
Changes in v7:
- Add Fuad's Reviewed-by and Tested-by tags.
Changes in v5:
- Valid stack addresses now have PAGE_SHIFT bit as 1 instead of 0
Changes in v3:
- Remove test_sp_overflow macro, per Mark
- Add asmlinkage attribute for hyp_panic, hyp_panic_bad_stack, per Ard
arch/arm64/kvm/hyp/nvhe/host.S | 24 ++++++++++++++++++++++++
arch/arm64/kvm/hyp/nvhe/switch.c | 7 ++++++-
2 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 3d613e721a75..be6d844279b1 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -153,6 +153,18 @@ SYM_FUNC_END(__host_hvc)
.macro invalid_host_el2_vect
.align 7
+
+ /*
+ * Test whether the SP has overflowed, without corrupting a GPR.
+ * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
+ * of SP should always be 1.
+ */
+ add sp, sp, x0 // sp' = sp + x0
+ sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
+ tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
+ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
+ sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
+
/* If a guest is loaded, panic out of it. */
stp x0, x1, [sp, #-16]!
get_loaded_vcpu x0, x1
@@ -165,6 +177,18 @@ SYM_FUNC_END(__host_hvc)
* been partially clobbered by __host_enter.
*/
b hyp_panic
+
+.L__hyp_sp_overflow\@:
+ /*
+ * Reset SP to the top of the stack, to allow handling the hyp_panic.
+ * This corrupts the stack but is ok, since we won't be attempting
+ * any unwinding here.
+ */
+ ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
+ mov sp, x0
+
+ bl hyp_panic_bad_stack
+ ASM_BUG()
.endm
.macro invalid_host_el1_vect
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 6410d21d8695..703a5d3f611b 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -347,7 +347,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return exit_code;
}
-void __noreturn hyp_panic(void)
+asmlinkage void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
@@ -369,6 +369,11 @@ void __noreturn hyp_panic(void)
unreachable();
}
+asmlinkage void __noreturn hyp_panic_bad_stack(void)
+{
+ hyp_panic();
+}
+
asmlinkage void kvm_unexpected_el2_exception(void)
{
return __kvm_unexpected_el2_exception();
--
2.35.1.1178.g4f1659d476-goog
hyp_alloc_private_va_range() can be used to reserve private VA ranges
in the nVHE hypervisor. Allocations are aligned based on the order of
the requested size.
This will be used to implement stack guard pages for KVM nVHE hypervisor
(nVHE Hyp mode / not pKVM), in a subsequent patch in the series.
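As a usage sketch (hypothetical caller inside some function; this mirrors
how the stack patches later in the series use the allocator):

  unsigned long hyp_va;
  int err;

  /*
   * Reserving two pages returns a base aligned to
   * PAGE_SIZE << get_order(size), i.e. a 2 * PAGE_SIZE boundary here.
   */
  err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_va);
  if (err)
          return err;

  /* [hyp_va, hyp_va + 2 * PAGE_SIZE) is now reserved below io_map_base. */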
Signed-off-by: Kalesh Singh <[email protected]>
Tested-by: Fuad Tabba <[email protected]>
Reviewed-by: Fuad Tabba <[email protected]>
---
Changes in v7:
- Add Fuad's Reviewed-by and Tested-by tags.
Changes in v6:
- Update kernel-doc for hyp_alloc_private_va_range()
and add return description, per Stephen
- Update hyp_alloc_private_va_range() to return an int error code,
per Stephen
- Replace IS_ERR() checks with IS_ERR_VALUE() check, per Stephen
- Clean up goto, per Stephen
Changes in v5:
- Align private allocations based on the order of their size, per Marc
Changes in v4:
- Handle null ptr in hyp_alloc_private_va_range() and replace
IS_ERR_OR_NULL checks in callers with IS_ERR checks, per Fuad
- Fix kernel-doc comments format, per Fuad
Changes in v3:
- Handle null ptr in IS_ERR_OR_NULL checks, per Mark
arch/arm64/include/asm/kvm_mmu.h | 1 +
arch/arm64/kvm/mmu.c | 66 +++++++++++++++++++++-----------
2 files changed, 45 insertions(+), 22 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 74735a864eee..a50cbb5ba402 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -154,6 +154,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
int kvm_share_hyp(void *from, void *to);
void kvm_unshare_hyp(void *from, void *to);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 0d19259454d8..3d3efea4e991 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -457,23 +457,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr,
- enum kvm_pgtable_prot prot)
+
+/**
+ * hyp_alloc_private_va_range - Allocates a private VA range.
+ * @size: The size of the VA range to reserve.
+ * @haddr: The hypervisor virtual start address of the allocation.
+ *
+ * The private virtual address (VA) range is allocated below io_map_base
+ * and aligned based on the order of @size.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
unsigned long base;
int ret = 0;
- if (!kvm_host_owns_hyp_mappings()) {
- base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
- phys_addr, size, prot);
- if (IS_ERR_OR_NULL((void *)base))
- return PTR_ERR((void *)base);
- *haddr = base;
-
- return 0;
- }
-
mutex_lock(&kvm_hyp_pgd_mutex);
/*
@@ -484,30 +483,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
*
* The allocated size is always a multiple of PAGE_SIZE.
*/
- size = PAGE_ALIGN(size + offset_in_page(phys_addr));
- base = io_map_base - size;
+ base = io_map_base - PAGE_ALIGN(size);
+
+ /* Align the allocation based on the order of its size */
+ base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
/*
* Verify that BIT(VA_BITS - 1) hasn't been flipped by
* allocating the new area, as it would indicate we've
* overflowed the idmap/IO address range.
*/
- if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
ret = -ENOMEM;
else
- io_map_base = base;
+ *haddr = io_map_base = base;
mutex_unlock(&kvm_hyp_pgd_mutex);
+ return ret;
+}
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
+{
+ unsigned long addr;
+ int ret = 0;
+
+ if (!kvm_host_owns_hyp_mappings()) {
+ addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+ phys_addr, size, prot);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+ *haddr = addr;
+
+ return 0;
+ }
+
+ size += offset_in_page(phys_addr);
+ ret = hyp_alloc_private_va_range(size, &addr);
if (ret)
- goto out;
+ return ret;
- ret = __create_hyp_mappings(base, size, phys_addr, prot);
+ ret = __create_hyp_mappings(addr, size, phys_addr, prot);
if (ret)
- goto out;
+ return ret;
- *haddr = base + offset_in_page(phys_addr);
-out:
+ *haddr = addr + offset_in_page(phys_addr);
return ret;
}
--
2.35.1.1178.g4f1659d476-goog
Reintroduce the __kvm_nvhe_ symbols in kallsyms, ignoring the local
symbols in this namespace. The local symbols are not informative and
can cause aliasing issues when symbolizing the addresses.
With the necessary symbols now in kallsyms we can symbolize nVHE
addresses using the %p print format specifier:
[ 98.916444][ T426] kvm [426]: nVHE hyp panic at: [<ffffffc0096156fc>] __kvm_nvhe_overflow_stack+0x8/0x34!
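For reference, the intended effect of the new prefix rule, with purely
illustrative symbol names:

  __kvm_nvhe_$x.42        -> ignored (compiler-generated local symbol)
  __kvm_nvhe_hyp_panic    -> kept, so addresses like the one above symbolize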
Signed-off-by: Kalesh Singh <[email protected]>
Tested-by: Fuad Tabba <[email protected]>
Reviewed-by: Fuad Tabba <[email protected]>
---
Changes in v6:
- Add Fuad's Reviewed-by and Tested-by tags.
Changes in v2:
- Fix printk warnings - %p expects (void *)
arch/arm64/kvm/handle_exit.c | 13 +++++--------
scripts/kallsyms.c | 2 +-
2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 97fe14aab1a3..a377b871bf58 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -295,13 +295,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
u64 elr_in_kimg = __phys_to_kimg(elr_phys);
u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
u64 mode = spsr & PSR_MODE_MASK;
+ u64 panic_addr = elr_virt + hyp_offset;
- /*
- * The nVHE hyp symbols are not included by kallsyms to avoid issues
- * with aliasing. That means that the symbols cannot be printed with the
- * "%pS" format specifier, so fall back to the vmlinux address if
- * there's no better option.
- */
if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
kvm_err("Invalid host exception to nVHE hyp!\n");
} else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
@@ -321,9 +316,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
if (file)
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
- kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
+ kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
+ (void *)panic_addr);
} else {
- kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
+ kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
+ (void *)panic_addr);
}
/*
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 8caabddf817c..ad2c93640a92 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -111,7 +111,7 @@ static bool is_ignored_symbol(const char *name, char type)
".L", /* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
"__crc_", /* modversions */
"__efistub_", /* arm64 EFI stub namespace */
- "__kvm_nvhe_", /* arm64 non-VHE KVM namespace */
+ "__kvm_nvhe_$", /* arm64 local symbols in non-VHE KVM namespace */
"__AArch64ADRPThunk_", /* arm64 lld */
"__ARMV5PILongThunk_", /* arm lld */
"__ARMV7PILongThunk_",
--
2.35.1.1178.g4f1659d476-goog
On Fri, 08 Apr 2022 21:03:24 +0100,
Kalesh Singh <[email protected]> wrote:
>
> hyp_alloc_private_va_range() can be used to reserve private VA ranges
> in the nVHE hypervisor. Allocations are aligned based on the order of
> the requested size.
>
> This will be used to implement stack guard pages for KVM nVHE hypervisor
> (nVHE Hyp mode / not pKVM), in a subsequent patch in the series.
>
> Signed-off-by: Kalesh Singh <[email protected]>
> Tested-by: Fuad Tabba <[email protected]>
> Reviewed-by: Fuad Tabba <[email protected]>
> ---
>
> Changes in v7:
> - Add Fuad's Reviewed-by and Tested-by tags.
>
> Changes in v6:
> - Update kernel-doc for hyp_alloc_private_va_range()
> and add return description, per Stephen
> - Update hyp_alloc_private_va_range() to return an int error code,
> per Stephen
> - Replace IS_ERR() checks with IS_ERR_VALUE() check, per Stephen
> - Clean up goto, per Stephen
>
> Changes in v5:
> - Align private allocations based on the order of their size, per Marc
>
> Changes in v4:
> - Handle null ptr in hyp_alloc_private_va_range() and replace
> IS_ERR_OR_NULL checks in callers with IS_ERR checks, per Fuad
> - Fix kernel-doc comments format, per Fuad
>
> Changes in v3:
> - Handle null ptr in IS_ERR_OR_NULL checks, per Mark
>
>
> arch/arm64/include/asm/kvm_mmu.h | 1 +
> arch/arm64/kvm/mmu.c | 66 +++++++++++++++++++++-----------
> 2 files changed, 45 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 74735a864eee..a50cbb5ba402 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -154,6 +154,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
> int kvm_share_hyp(void *from, void *to);
> void kvm_unshare_hyp(void *from, void *to);
> int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
> +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
> int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
> void __iomem **kaddr,
> void __iomem **haddr);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 0d19259454d8..3d3efea4e991 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -457,23 +457,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
> return 0;
> }
>
> -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> - unsigned long *haddr,
> - enum kvm_pgtable_prot prot)
> +
> +/**
> + * hyp_alloc_private_va_range - Allocates a private VA range.
> + * @size: The size of the VA range to reserve.
> + * @haddr: The hypervisor virtual start address of the allocation.
> + *
> + * The private virtual address (VA) range is allocated below io_map_base
> + * and aligned based on the order of @size.
> + *
> + * Return: 0 on success or negative error code on failure.
> + */
> +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
> {
> unsigned long base;
> int ret = 0;
>
> - if (!kvm_host_owns_hyp_mappings()) {
> - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> - phys_addr, size, prot);
> - if (IS_ERR_OR_NULL((void *)base))
> - return PTR_ERR((void *)base);
> - *haddr = base;
> -
> - return 0;
> - }
> -
> mutex_lock(&kvm_hyp_pgd_mutex);
>
> /*
> @@ -484,30 +483,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> *
> * The allocated size is always a multiple of PAGE_SIZE.
> */
> - size = PAGE_ALIGN(size + offset_in_page(phys_addr));
> - base = io_map_base - size;
> + base = io_map_base - PAGE_ALIGN(size);
> +
> + /* Align the allocation based on the order of its size */
> + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
>
> /*
> * Verify that BIT(VA_BITS - 1) hasn't been flipped by
> * allocating the new area, as it would indicate we've
> * overflowed the idmap/IO address range.
> */
> - if ((base ^ io_map_base) & BIT(VA_BITS - 1))
> + if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
I don't get this '!base' check. Why isn't it encompassed by the
'VA_BITS - 1' flip check?
> ret = -ENOMEM;
> else
> - io_map_base = base;
> + *haddr = io_map_base = base;
>
> mutex_unlock(&kvm_hyp_pgd_mutex);
>
> + return ret;
> +}
> +
> +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> + unsigned long *haddr,
> + enum kvm_pgtable_prot prot)
> +{
> + unsigned long addr;
> + int ret = 0;
> +
> + if (!kvm_host_owns_hyp_mappings()) {
> + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> + phys_addr, size, prot);
> + if (IS_ERR_VALUE(addr))
> + return addr;
> + *haddr = addr;
> +
> + return 0;
> + }
> +
> + size += offset_in_page(phys_addr);
This hardly makes any sense on its own. I get it that it is still
doing the right thing as hyp_alloc_private_va_range() will fix it up,
but I'd rather you keep the PAGE_ALIGN() here, even if it ends up
being duplicated.
> + ret = hyp_alloc_private_va_range(size, &addr);
> if (ret)
> - goto out;
> + return ret;
>
> - ret = __create_hyp_mappings(base, size, phys_addr, prot);
> + ret = __create_hyp_mappings(addr, size, phys_addr, prot);
> if (ret)
> - goto out;
> + return ret;
>
> - *haddr = base + offset_in_page(phys_addr);
> -out:
> + *haddr = addr + offset_in_page(phys_addr);
> return ret;
> }
>
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
Map the stack pages in the flexible private VA range and allocate
guard pages below the stack as unbacked VA space. The stack is aligned
so that any valid stack address has PAGE_SHIFT bit as 1 - this is used
for overflow detection (implemented in a subsequent patch in the series).
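The resulting per-CPU private VA layout, as a sketch (matching the hunk
below):

  hyp_addr + 2 * PAGE_SIZE   <- params->stack_hyp_va (stack top, grows down)
  hyp_addr + PAGE_SIZE       <- stack page, mapped (PAGE_SHIFT bit == 1)
  hyp_addr                   <- guard page, unbacked (PAGE_SHIFT bit == 0)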
Signed-off-by: Kalesh Singh <[email protected]>
Tested-by: Fuad Tabba <[email protected]>
Reviewed-by: Fuad Tabba <[email protected]>
---
Changes in v7:
- Add Fuad's Reviewed-by and Tested-by tags.
Changes in v6:
- Update call to pkvm_alloc_private_va_range() (return val and params)
Changes in v5:
- Use a single allocation for stack and guard pages to ensure they
are contiguous, per Marc
Changes in v4:
- Replace IS_ERR_OR_NULL check with IS_ERR check now that
pkvm_alloc_private_va_range() returns an error for null
pointer, per Fuad
Changes in v3:
- Handle null ptr in IS_ERR_OR_NULL checks, per Mark
arch/arm64/kvm/hyp/nvhe/setup.c | 31 ++++++++++++++++++++++++++++---
1 file changed, 28 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 27af337f9fea..e8d4ea2fcfa0 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -99,17 +99,42 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
return ret;
for (i = 0; i < hyp_nr_cpus; i++) {
+ struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
+ unsigned long hyp_addr;
+
start = (void *)kern_hyp_va(per_cpu_base[i]);
end = start + PAGE_ALIGN(hyp_percpu_size);
ret = pkvm_create_mappings(start, end, PAGE_HYP);
if (ret)
return ret;
- end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
- start = end - PAGE_SIZE;
- ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ /*
+ * Allocate a contiguous HYP private VA range for the stack
+ * and guard page. The allocation is also aligned based on
+ * the order of its size.
+ */
+ ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+ if (ret)
+ return ret;
+
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ hyp_spin_lock(&pkvm_pgd_lock);
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE,
+ PAGE_SIZE, params->stack_pa, PAGE_HYP);
+ hyp_spin_unlock(&pkvm_pgd_lock);
if (ret)
return ret;
+
+ /* Update stack_hyp_va to end of the stack's private VA range */
+ params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
}
/*
--
2.35.1.1178.g4f1659d476-goog
On Fri, 08 Apr 2022 21:03:29 +0100,
Kalesh Singh <[email protected]> wrote:
>
> Reintroduce the __kvm_nvhe_ symbols in kallsyms, ignoring the local
> symbols in this namespace. The local symbols are not informative and
> can cause aliasing issues when symbolizing the addresses.
>
> With the necessary symbols now in kallsyms we can symbolize nVHE
> addresses using the %p print format specifier:
>
> [ 98.916444][ T426] kvm [426]: nVHE hyp panic at: [<ffffffc0096156fc>] __kvm_nvhe_overflow_stack+0x8/0x34!
>
> Signed-off-by: Kalesh Singh <[email protected]>
> Tested-by: Fuad Tabba <[email protected]>
> Reviewed-by: Fuad Tabba <[email protected]>
> ---
>
> Changes in v6:
> - Add Fuad's Reviewed-by and Tested-by tags.
>
> Changes in v2:
> - Fix printk warnings - %p expects (void *)
>
>
> arch/arm64/kvm/handle_exit.c | 13 +++++--------
> scripts/kallsyms.c | 2 +-
> 2 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> index 97fe14aab1a3..a377b871bf58 100644
> --- a/arch/arm64/kvm/handle_exit.c
> +++ b/arch/arm64/kvm/handle_exit.c
> @@ -295,13 +295,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
> u64 elr_in_kimg = __phys_to_kimg(elr_phys);
> u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
> u64 mode = spsr & PSR_MODE_MASK;
> + u64 panic_addr = elr_virt + hyp_offset;
>
> - /*
> - * The nVHE hyp symbols are not included by kallsyms to avoid issues
> - * with aliasing. That means that the symbols cannot be printed with the
> - * "%pS" format specifier, so fall back to the vmlinux address if
> - * there's no better option.
> - */
> if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
> kvm_err("Invalid host exception to nVHE hyp!\n");
> } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
> @@ -321,9 +316,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
> if (file)
> kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
> else
> - kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
> + kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
> + (void *)panic_addr);
> } else {
> - kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
> + kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
> + (void *)panic_addr);
> }
>
> /*
> diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
> index 8caabddf817c..ad2c93640a92 100644
> --- a/scripts/kallsyms.c
> +++ b/scripts/kallsyms.c
> @@ -111,7 +111,7 @@ static bool is_ignored_symbol(const char *name, char type)
> ".L", /* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
> "__crc_", /* modversions */
> "__efistub_", /* arm64 EFI stub namespace */
> - "__kvm_nvhe_", /* arm64 non-VHE KVM namespace */
> + "__kvm_nvhe_$", /* arm64 local symbols in non-VHE KVM namespace */
> "__AArch64ADRPThunk_", /* arm64 lld */
> "__ARMV5PILongThunk_", /* arm lld */
> "__ARMV7PILongThunk_",
If you are sanitising this, shouldn't you also apply the same rules
as the rest of the kernel for non-'__kvm_nvhe_' symbols? For example,
I see a long list of .L* symbols:
0000000000000000 r __kvm_nvhe_.L144721
0000000000000090 r __kvm_nvhe_.L144721
00000000000000b4 r __kvm_nvhe_.L144721
00000000000004b0 r __kvm_nvhe_.L144721
000000000000051c r __kvm_nvhe_.L144721
0000000000000650 r __kvm_nvhe_.L144721
0000000000000694 r __kvm_nvhe_.L144721
0000000000000761 r __kvm_nvhe_.L144721
00000000000007a7 r __kvm_nvhe_.L144721
00000000000007c7 r __kvm_nvhe_.L144721
0000000000000814 r __kvm_nvhe_.L144721
0000000000000b08 r __kvm_nvhe_.L144721
0000000000003200 r __kvm_nvhe_.L144721
(83 of them in total on a local build) that I think should also be
trimmed.
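One way to cover those as well, purely as a sketch, would be another entry
in the same ignored prefix list:

	"__kvm_nvhe_.L",	/* arm64 local labels in non-VHE KVM namespace */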
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
On Mon, Apr 18, 2022 at 3:09 AM Marc Zyngier <[email protected]> wrote:
>
> On Fri, 08 Apr 2022 21:03:28 +0100,
> Kalesh Singh <[email protected]> wrote:
> >
> > The hypervisor stacks (for both nVHE Hyp mode and nVHE protected mode)
> > are aligned such that any valid stack address has PAGE_SHIFT bit as 1.
> > This allows us to conveniently check for overflow in the exception entry
> > without corrupting any GPRs. We won't recover from a stack overflow so
> > panic the hypervisor.
> >
> > Signed-off-by: Kalesh Singh <[email protected]>
> > Tested-by: Fuad Tabba <[email protected]>
> > Reviewed-by: Fuad Tabba <[email protected]>
> > ---
> >
> > Changes in v7:
> > - Add Fuad's Reviewed-by and Tested-by tags.
> >
> > Changes in v5:
> > - Valid stack addresses now have PAGE_SHIFT bit as 1 instead of 0
> >
> > Changes in v3:
> > - Remove test_sp_overflow macro, per Mark
> > - Add asmlinkage attribute for hyp_panic, hyp_panic_bad_stack, per Ard
> >
> >
> > arch/arm64/kvm/hyp/nvhe/host.S | 24 ++++++++++++++++++++++++
> > arch/arm64/kvm/hyp/nvhe/switch.c | 7 ++++++-
> > 2 files changed, 30 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
> > index 3d613e721a75..be6d844279b1 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/host.S
> > +++ b/arch/arm64/kvm/hyp/nvhe/host.S
> > @@ -153,6 +153,18 @@ SYM_FUNC_END(__host_hvc)
> >
> > .macro invalid_host_el2_vect
> > .align 7
> > +
> > + /*
> > + * Test whether the SP has overflowed, without corrupting a GPR.
> > + * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
> > + * of SP should always be 1.
> > + */
> > + add sp, sp, x0 // sp' = sp + x0
> > + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
> > + tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
> > + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
> > + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
> > +
> > /* If a guest is loaded, panic out of it. */
> > stp x0, x1, [sp, #-16]!
> > get_loaded_vcpu x0, x1
> > @@ -165,6 +177,18 @@ SYM_FUNC_END(__host_hvc)
> > * been partially clobbered by __host_enter.
> > */
> > b hyp_panic
> > +
> > +.L__hyp_sp_overflow\@:
> > + /*
> > + * Reset SP to the top of the stack, to allow handling the hyp_panic.
> > + * This corrupts the stack but is ok, since we won't be attempting
> > + * any unwinding here.
> > + */
> > + ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
> > + mov sp, x0
> > +
> > + bl hyp_panic_bad_stack
>
> Why bl? You clearly don't expect to return here, given that you have
> an ASM_BUG() right below, and that you are calling a __noreturn
> function. I think we should be consistent with the rest of the code
> and just do a simple branch.
The idea was to use bl to give the hyp_panic_bad_stack() frame in the
stack trace, which makes it easy to identify overflows. I can add a
comment and drop the redundant ASM_BUG().
Thanks,
Kalesh
>
> It also gives us a chance to preserve an extra register from the
> context.
>
> > + ASM_BUG()
> > .endm
> >
> > .macro invalid_host_el1_vect
> > diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
> > index 6410d21d8695..703a5d3f611b 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/switch.c
> > +++ b/arch/arm64/kvm/hyp/nvhe/switch.c
> > @@ -347,7 +347,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> > return exit_code;
> > }
> >
> > -void __noreturn hyp_panic(void)
> > +asmlinkage void __noreturn hyp_panic(void)
> > {
> > u64 spsr = read_sysreg_el2(SYS_SPSR);
> > u64 elr = read_sysreg_el2(SYS_ELR);
> > @@ -369,6 +369,11 @@ void __noreturn hyp_panic(void)
> > unreachable();
> > }
> >
> > +asmlinkage void __noreturn hyp_panic_bad_stack(void)
> > +{
> > + hyp_panic();
> > +}
> > +
> > asmlinkage void kvm_unexpected_el2_exception(void)
> > {
> > return __kvm_unexpected_el2_exception();
> > --
> > 2.35.1.1178.g4f1659d476-goog
> >
> >
>
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.
On Mon, Apr 18, 2022 at 3:16 AM Marc Zyngier <[email protected]> wrote:
>
> On Fri, 08 Apr 2022 21:03:29 +0100,
> Kalesh Singh <[email protected]> wrote:
> >
> > Reintroduce the __kvm_nvhe_ symbols in kallsyms, ignoring the local
> > symbols in this namespace. The local symbols are not informative and
> > can cause aliasing issues when symbolizing the addresses.
> >
> > With the necessary symbols now in kallsyms we can symbolize nVHE
> > addresses using the %p print format specifier:
> >
> > [ 98.916444][ T426] kvm [426]: nVHE hyp panic at: [<ffffffc0096156fc>] __kvm_nvhe_overflow_stack+0x8/0x34!
> >
> > Signed-off-by: Kalesh Singh <[email protected]>
> > Tested-by: Fuad Tabba <[email protected]>
> > Reviewed-by: Fuad Tabba <[email protected]>
> > ---
> >
> > Changes in v6:
> > - Add Fuad's Reviewed-by and Tested-by tags.
> >
> > Changes in v2:
> > - Fix printk warnings - %p expects (void *)
> >
> >
> > arch/arm64/kvm/handle_exit.c | 13 +++++--------
> > scripts/kallsyms.c | 2 +-
> > 2 files changed, 6 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> > index 97fe14aab1a3..a377b871bf58 100644
> > --- a/arch/arm64/kvm/handle_exit.c
> > +++ b/arch/arm64/kvm/handle_exit.c
> > @@ -295,13 +295,8 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
> > u64 elr_in_kimg = __phys_to_kimg(elr_phys);
> > u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
> > u64 mode = spsr & PSR_MODE_MASK;
> > + u64 panic_addr = elr_virt + hyp_offset;
> >
> > - /*
> > - * The nVHE hyp symbols are not included by kallsyms to avoid issues
> > - * with aliasing. That means that the symbols cannot be printed with the
> > - * "%pS" format specifier, so fall back to the vmlinux address if
> > - * there's no better option.
> > - */
> > if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
> > kvm_err("Invalid host exception to nVHE hyp!\n");
> > } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
> > @@ -321,9 +316,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
> > if (file)
> > kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
> > else
> > - kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset);
> > + kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr,
> > + (void *)panic_addr);
> > } else {
> > - kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset);
> > + kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr,
> > + (void *)panic_addr);
> > }
> >
> > /*
> > diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
> > index 8caabddf817c..ad2c93640a92 100644
> > --- a/scripts/kallsyms.c
> > +++ b/scripts/kallsyms.c
> > @@ -111,7 +111,7 @@ static bool is_ignored_symbol(const char *name, char type)
> > ".L", /* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
> > "__crc_", /* modversions */
> > "__efistub_", /* arm64 EFI stub namespace */
> > - "__kvm_nvhe_", /* arm64 non-VHE KVM namespace */
> > + "__kvm_nvhe_$", /* arm64 local symbols in non-VHE KVM namespace */
> > "__AArch64ADRPThunk_", /* arm64 lld */
> > "__ARMV5PILongThunk_", /* arm lld */
> > "__ARMV7PILongThunk_",
>
> If you are sanitising this, shouldn't you also apply the same rules
> as the rest of the kernel for non-'__kvm_nvhe_' symbols? For example,
> I see a long list of .L* symbols:
>
> 0000000000000000 r __kvm_nvhe_.L144721
> 0000000000000090 r __kvm_nvhe_.L144721
> 00000000000000b4 r __kvm_nvhe_.L144721
> 00000000000004b0 r __kvm_nvhe_.L144721
> 000000000000051c r __kvm_nvhe_.L144721
> 0000000000000650 r __kvm_nvhe_.L144721
> 0000000000000694 r __kvm_nvhe_.L144721
> 0000000000000761 r __kvm_nvhe_.L144721
> 00000000000007a7 r __kvm_nvhe_.L144721
> 00000000000007c7 r __kvm_nvhe_.L144721
> 0000000000000814 r __kvm_nvhe_.L144721
> 0000000000000b08 r __kvm_nvhe_.L144721
> 0000000000003200 r __kvm_nvhe_.L144721
>
> (83 of them in total on a local build) that I think should also be
> trimmed.
Good catch. I'll fix it in the next version along with your other
comments. Thanks for the reviews, Marc.
-Kalesh
>
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.
On Sun, Apr 10, 2022 at 11:52 PM Marc Zyngier <[email protected]> wrote:
>
> On Fri, 08 Apr 2022 21:03:24 +0100,
> Kalesh Singh <[email protected]> wrote:
> >
> > hyp_alloc_private_va_range() can be used to reserve private VA ranges
> > in the nVHE hypervisor. Allocations are aligned based on the order of
> > the requested size.
> >
> > This will be used to implement stack guard pages for KVM nVHE hypervisor
> > (nVHE Hyp mode / not pKVM), in a subsequent patch in the series.
> >
> > Signed-off-by: Kalesh Singh <[email protected]>
> > Tested-by: Fuad Tabba <[email protected]>
> > Reviewed-by: Fuad Tabba <[email protected]>
> > ---
> >
> > Changes in v7:
> > - Add Fuad's Reviewed-by and Tested-by tags.
> >
> > Changes in v6:
> > - Update kernel-doc for hyp_alloc_private_va_range()
> > and add return description, per Stephen
> > - Update hyp_alloc_private_va_range() to return an int error code,
> > per Stephen
> > - Replace IS_ERR() checks with IS_ERR_VALUE() check, per Stephen
> > - Clean up goto, per Stephen
> >
> > Changes in v5:
> > - Align private allocations based on the order of their size, per Marc
> >
> > Changes in v4:
> > - Handle null ptr in hyp_alloc_private_va_range() and replace
> > IS_ERR_OR_NULL checks in callers with IS_ERR checks, per Fuad
> > - Fix kernel-doc comments format, per Fuad
> >
> > Changes in v3:
> > - Handle null ptr in IS_ERR_OR_NULL checks, per Mark
> >
> >
> > arch/arm64/include/asm/kvm_mmu.h | 1 +
> > arch/arm64/kvm/mmu.c | 66 +++++++++++++++++++++-----------
> > 2 files changed, 45 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> > index 74735a864eee..a50cbb5ba402 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -154,6 +154,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
> > int kvm_share_hyp(void *from, void *to);
> > void kvm_unshare_hyp(void *from, void *to);
> > int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
> > +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
> > int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
> > void __iomem **kaddr,
> > void __iomem **haddr);
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 0d19259454d8..3d3efea4e991 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -457,23 +457,22 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
> > return 0;
> > }
> >
> > -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > - unsigned long *haddr,
> > - enum kvm_pgtable_prot prot)
> > +
> > +/**
> > + * hyp_alloc_private_va_range - Allocates a private VA range.
> > + * @size: The size of the VA range to reserve.
> > + * @haddr: The hypervisor virtual start address of the allocation.
> > + *
> > + * The private virtual address (VA) range is allocated below io_map_base
> > + * and aligned based on the order of @size.
> > + *
> > + * Return: 0 on success or negative error code on failure.
> > + */
> > +int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
> > {
> > unsigned long base;
> > int ret = 0;
> >
> > - if (!kvm_host_owns_hyp_mappings()) {
> > - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> > - phys_addr, size, prot);
> > - if (IS_ERR_OR_NULL((void *)base))
> > - return PTR_ERR((void *)base);
> > - *haddr = base;
> > -
> > - return 0;
> > - }
> > -
> > mutex_lock(&kvm_hyp_pgd_mutex);
> >
> > /*
> > @@ -484,30 +483,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > *
> > * The allocated size is always a multiple of PAGE_SIZE.
> > */
> > - size = PAGE_ALIGN(size + offset_in_page(phys_addr));
> > - base = io_map_base - size;
> > + base = io_map_base - PAGE_ALIGN(size);
> > +
> > + /* Align the allocation based on the order of its size */
> > + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
> >
> > /*
> > * Verify that BIT(VA_BITS - 1) hasn't been flipped by
> > * allocating the new area, as it would indicate we've
> > * overflowed the idmap/IO address range.
> > */
> > - if ((base ^ io_map_base) & BIT(VA_BITS - 1))
> > + if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
>
> I don't get this '!base' check. Why isn't it encompassed by the
> 'VA_BITS - 1' flip check?
Hi Marc. You're right - the flip check handles this as well. I'll drop it
in the next version.
>
> > ret = -ENOMEM;
> > else
> > - io_map_base = base;
> > + *haddr = io_map_base = base;
> >
> > mutex_unlock(&kvm_hyp_pgd_mutex);
> >
> > + return ret;
> > +}
> > +
> > +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > + unsigned long *haddr,
> > + enum kvm_pgtable_prot prot)
> > +{
> > + unsigned long addr;
> > + int ret = 0;
> > +
> > + if (!kvm_host_owns_hyp_mappings()) {
> > + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> > + phys_addr, size, prot);
> > + if (IS_ERR_VALUE(addr))
> > + return addr;
> > + *haddr = addr;
> > +
> > + return 0;
> > + }
> > +
> > + size += offset_in_page(phys_addr);
>
> This hardly makes any sense on its own. I get it that it is still
> doing the right thing as hyp_alloc_private_va_range() will fix it up,
> but I'd rather you keep the PAGE_ALIGN() here, even if it ends up
> being duplicated.
Ack
Thanks,
Kalesh
>
> > + ret = hyp_alloc_private_va_range(size, &addr);
> > if (ret)
> > - goto out;
> > + return ret;
> >
> > - ret = __create_hyp_mappings(base, size, phys_addr, prot);
> > + ret = __create_hyp_mappings(addr, size, phys_addr, prot);
> > if (ret)
> > - goto out;
> > + return ret;
> >
> > - *haddr = base + offset_in_page(phys_addr);
> > -out:
> > + *haddr = addr + offset_in_page(phys_addr);
> > return ret;
> > }
> >
>
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.
On Fri, 08 Apr 2022 21:03:28 +0100,
Kalesh Singh <[email protected]> wrote:
>
> The hypervisor stacks (for both nVHE Hyp mode and nVHE protected mode)
> are aligned such that any valid stack address has PAGE_SHIFT bit as 1.
> This allows us to conveniently check for overflow in the exception entry
> without corrupting any GPRs. We won't recover from a stack overflow so
> panic the hypervisor.
>
> Signed-off-by: Kalesh Singh <[email protected]>
> Tested-by: Fuad Tabba <[email protected]>
> Reviewed-by: Fuad Tabba <[email protected]>
> ---
>
> Changes in v7:
> - Add Fuad's Reviewed-by and Tested-by tags.
>
> Changes in v5:
> - Valid stack addresses now have PAGE_SHIFT bit as 1 instead of 0
>
> Changes in v3:
> - Remove test_sp_overflow macro, per Mark
> - Add asmlinkage attribute for hyp_panic, hyp_panic_bad_stack, per Ard
>
>
> arch/arm64/kvm/hyp/nvhe/host.S | 24 ++++++++++++++++++++++++
> arch/arm64/kvm/hyp/nvhe/switch.c | 7 ++++++-
> 2 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
> index 3d613e721a75..be6d844279b1 100644
> --- a/arch/arm64/kvm/hyp/nvhe/host.S
> +++ b/arch/arm64/kvm/hyp/nvhe/host.S
> @@ -153,6 +153,18 @@ SYM_FUNC_END(__host_hvc)
>
> .macro invalid_host_el2_vect
> .align 7
> +
> + /*
> + * Test whether the SP has overflowed, without corrupting a GPR.
> + * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
> + * of SP should always be 1.
> + */
> + add sp, sp, x0 // sp' = sp + x0
> + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
> + tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
> + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
> + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
> +
> /* If a guest is loaded, panic out of it. */
> stp x0, x1, [sp, #-16]!
> get_loaded_vcpu x0, x1
> @@ -165,6 +177,18 @@ SYM_FUNC_END(__host_hvc)
> * been partially clobbered by __host_enter.
> */
> b hyp_panic
> +
> +.L__hyp_sp_overflow\@:
> + /*
> + * Reset SP to the top of the stack, to allow handling the hyp_panic.
> + * This corrupts the stack but is ok, since we won't be attempting
> + * any unwinding here.
> + */
> + ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
> + mov sp, x0
> +
> + bl hyp_panic_bad_stack
Why bl? You clearly don't expect to return here, given that you have
an ASM_BUG() right below, and that you are calling a __noreturn
function. I think we should be consistent with the rest of the code
and just do a simple branch.
It also gives us a chance to preserve an extra register from the
context.
> + ASM_BUG()
> .endm
>
> .macro invalid_host_el1_vect
> diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
> index 6410d21d8695..703a5d3f611b 100644
> --- a/arch/arm64/kvm/hyp/nvhe/switch.c
> +++ b/arch/arm64/kvm/hyp/nvhe/switch.c
> @@ -347,7 +347,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> return exit_code;
> }
>
> -void __noreturn hyp_panic(void)
> +asmlinkage void __noreturn hyp_panic(void)
> {
> u64 spsr = read_sysreg_el2(SYS_SPSR);
> u64 elr = read_sysreg_el2(SYS_ELR);
> @@ -369,6 +369,11 @@ void __noreturn hyp_panic(void)
> unreachable();
> }
>
> +asmlinkage void __noreturn hyp_panic_bad_stack(void)
> +{
> + hyp_panic();
> +}
> +
> asmlinkage void kvm_unexpected_el2_exception(void)
> {
> return __kvm_unexpected_el2_exception();
> --
> 2.35.1.1178.g4f1659d476-goog
>
>
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
On Mon, Apr 18, 2022 at 7:41 PM Kalesh Singh <[email protected]> wrote:
>
> On Mon, Apr 18, 2022 at 3:09 AM Marc Zyngier <[email protected]> wrote:
> >
> > On Fri, 08 Apr 2022 21:03:28 +0100,
> > Kalesh Singh <[email protected]> wrote:
> > >
> > > The hypervisor stacks (for both nVHE Hyp mode and nVHE protected mode)
> > > are aligned such that any valid stack address has PAGE_SHIFT bit as 1.
> > > This allows us to conveniently check for overflow in the exception entry
> > > without corrupting any GPRs. We won't recover from a stack overflow so
> > > panic the hypervisor.
> > >
> > > Signed-off-by: Kalesh Singh <[email protected]>
> > > Tested-by: Fuad Tabba <[email protected]>
> > > Reviewed-by: Fuad Tabba <[email protected]>
> > > ---
> > >
> > > Changes in v7:
> > > - Add Fuad's Reviewed-by and Tested-by tags.
> > >
> > > Changes in v5:
> > > - Valid stack addresses now have PAGE_SHIFT bit as 1 instead of 0
> > >
> > > Changes in v3:
> > > - Remove test_sp_overflow macro, per Mark
> > > - Add asmlinkage attribute for hyp_panic, hyp_panic_bad_stack, per Ard
> > >
> > >
> > > arch/arm64/kvm/hyp/nvhe/host.S | 24 ++++++++++++++++++++++++
> > > arch/arm64/kvm/hyp/nvhe/switch.c | 7 ++++++-
> > > 2 files changed, 30 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
> > > index 3d613e721a75..be6d844279b1 100644
> > > --- a/arch/arm64/kvm/hyp/nvhe/host.S
> > > +++ b/arch/arm64/kvm/hyp/nvhe/host.S
> > > @@ -153,6 +153,18 @@ SYM_FUNC_END(__host_hvc)
> > >
> > > .macro invalid_host_el2_vect
> > > .align 7
> > > +
> > > + /*
> > > + * Test whether the SP has overflowed, without corrupting a GPR.
> > > + * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit
> > > + * of SP should always be 1.
> > > + */
> > > + add sp, sp, x0 // sp' = sp + x0
> > > + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
> > > + tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@
> > > + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
> > > + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
> > > +
> > > /* If a guest is loaded, panic out of it. */
> > > stp x0, x1, [sp, #-16]!
> > > get_loaded_vcpu x0, x1
> > > @@ -165,6 +177,18 @@ SYM_FUNC_END(__host_hvc)
> > > * been partially clobbered by __host_enter.
> > > */
> > > b hyp_panic
> > > +
> > > +.L__hyp_sp_overflow\@:
> > > + /*
> > > + * Reset SP to the top of the stack, to allow handling the hyp_panic.
> > > + * This corrupts the stack but is ok, since we won't be attempting
> > > + * any unwinding here.
> > > + */
> > > + ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
> > > + mov sp, x0
> > > +
> > > + bl hyp_panic_bad_stack
> >
> > Why bl? You clearly don't expect to return here, given that you have
> > an ASM_BUG() right below, and that you are calling a __noreturn
> > function. I think we should be consistent with the rest of the code
> > and just do a simple branch.
>
> The idea was to use bl to give the hyp_panic_bad_stack() frame in the
> stack trace, which makes it easy to identify overflows. I can add a
> comment and drop the redundant ASM_BUG().
Sorry, my mistake here: bl will give us the current frame in the stack
trace (hyp_host_vector), so it doesn't affect hyp_panic_bad_stack (the
next frame) being in the stack trace. Addressed in v8:
https://lore.kernel.org/r/[email protected]/
Thanks,
Kalesh
>
> Thanks,
> Kalesh
>
> >
> > It also gives us a chance to preserve an extra register from the
> > context.
> >
> > > + ASM_BUG()
> > > .endm
> > >
> > > .macro invalid_host_el1_vect
> > > diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
> > > index 6410d21d8695..703a5d3f611b 100644
> > > --- a/arch/arm64/kvm/hyp/nvhe/switch.c
> > > +++ b/arch/arm64/kvm/hyp/nvhe/switch.c
> > > @@ -347,7 +347,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> > > return exit_code;
> > > }
> > >
> > > -void __noreturn hyp_panic(void)
> > > +asmlinkage void __noreturn hyp_panic(void)
> > > {
> > > u64 spsr = read_sysreg_el2(SYS_SPSR);
> > > u64 elr = read_sysreg_el2(SYS_ELR);
> > > @@ -369,6 +369,11 @@ void __noreturn hyp_panic(void)
> > > unreachable();
> > > }
> > >
> > > +asmlinkage void __noreturn hyp_panic_bad_stack(void)
> > > +{
> > > + hyp_panic();
> > > +}
> > > +
> > > asmlinkage void kvm_unexpected_el2_exception(void)
> > > {
> > > return __kvm_unexpected_el2_exception();
> > > --
> > > 2.35.1.1178.g4f1659d476-goog
> > >
> > >
> >
> > Thanks,
> >
> > M.
> >
> > --
> > Without deviation from the norm, progress is not possible.