2022-06-08 03:45:30

by Kalesh Singh

Subject: [PATCH v3 4/5] KVM: arm64: Allocate shared stacktrace pages

The nVHE hypervisor can use this shared area to dump its stacktrace
addresses on hyp_panic(). Symbolizing and printing the stacktrace can
then be handled by the host in EL1 (done in a later patch in this series).

Signed-off-by: Kalesh Singh <[email protected]>
---
arch/arm64/include/asm/kvm_asm.h | 1 +
arch/arm64/kvm/arm.c | 34 ++++++++++++++++++++++++++++++++
arch/arm64/kvm/hyp/nvhe/setup.c | 11 +++++++++++
3 files changed, 46 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 2e277f2ed671..ad31ac68264f 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -174,6 +174,7 @@ struct kvm_nvhe_init_params {
unsigned long hcr_el2;
unsigned long vttbr;
unsigned long vtcr;
+ unsigned long stacktrace_hyp_va;
};

/* Translate a kernel address @ptr into its equivalent linear mapping */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 400bb0fe2745..c0a936a7623d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -50,6 +50,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);

static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stacktrace_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);

@@ -1554,6 +1555,7 @@ static void cpu_prepare_hyp_mode(int cpu)
tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
params->tcr_el2 = tcr;

+ params->stacktrace_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
params->pgd_pa = kvm_mmu_get_httbr();
if (is_protected_kvm_enabled())
params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
@@ -1845,6 +1847,7 @@ static void teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ free_page(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
}
}
@@ -1936,6 +1939,23 @@ static int init_hyp_mode(void)
per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
}

+ /*
+ * Allocate stacktrace pages for Hypervisor-mode.
+ * This is used by the hypervisor to share its stacktrace
+ * with the host on a hyp_panic().
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned long stacktrace_page;
+
+ stacktrace_page = __get_free_page(GFP_KERNEL);
+ if (!stacktrace_page) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ per_cpu(kvm_arm_hyp_stacktrace_page, cpu) = stacktrace_page;
+ }
+
/*
* Allocate and initialize pages for Hypervisor-mode percpu regions.
*/
@@ -2043,6 +2063,20 @@ static int init_hyp_mode(void)
params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
}

+ /*
+ * Map the hyp stacktrace pages.
+ */
+ for_each_possible_cpu(cpu) {
+ char *stacktrace_page = (char *)per_cpu(kvm_arm_hyp_stacktrace_page, cpu);
+
+ err = create_hyp_mappings(stacktrace_page, stacktrace_page + PAGE_SIZE,
+ PAGE_HYP);
+ if (err) {
+ kvm_err("Cannot map hyp stacktrace page\n");
+ goto out_err;
+ }
+ }
+
for_each_possible_cpu(cpu) {
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
char *percpu_end = percpu_begin + nvhe_percpu_size();
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index e8d4ea2fcfa0..9b81bf2d40d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -135,6 +135,17 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,

/* Update stack_hyp_va to end of the stack's private VA range */
params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
+
+ /*
+ * Map the stacktrace pages as shared and transfer ownership to
+ * the hypervisor.
+ */
+ prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_OWNED);
+ start = (void *)params->stacktrace_hyp_va;
+ end = start + PAGE_SIZE;
+ ret = pkvm_create_mappings(start, end, prot);
+ if (ret)
+ return ret;
}

/*
--
2.36.1.255.ge46751e96f-goog
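
For illustration, here is a minimal sketch of the EL2-side dump that the
commit message defers to a later patch in the series. It is hypothetical:
the helper name and the bare frame-record walk are assumptions, and
frame-pointer validation is omitted for brevity.

/*
 * Hypothetical sketch, not the series' actual dumper: on hyp_panic(),
 * walk the AAPCS64 frame records from the failing frame and save one
 * return address per frame into the shared page, zero-terminating the
 * array for the host. Real code must validate fp before every load.
 */
static void hyp_dump_backtrace(unsigned long fp, unsigned long pc)
{
	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
	unsigned long *trace = (unsigned long *)params->stacktrace_hyp_va;
	unsigned long *end = trace + PAGE_SIZE / sizeof(*trace) - 1;

	while (trace < end && pc && fp) {
		*trace++ = pc;
		/* A frame record is {fp, lr}: lr is the caller's pc. */
		pc = ((unsigned long *)fp)[1];
		fp = ((unsigned long *)fp)[0];
	}
	*trace = 0;	/* terminator consumed by the host-side walker */
}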


2022-06-08 10:10:29

by Marc Zyngier

Subject: Re: [PATCH v3 4/5] KVM: arm64: Allocate shared stacktrace pages

On Tue, 07 Jun 2022 17:50:46 +0100,
Kalesh Singh <[email protected]> wrote:
>
> The nVHE hypervisor can use this shared area to dump its stacktrace
> addresses on hyp_panic(). Symbolizing and printing the stacktrace can
> then be handled by the host in EL1 (done in a later patch in this series).
>
> Signed-off-by: Kalesh Singh <[email protected]>
> ---
> arch/arm64/include/asm/kvm_asm.h | 1 +
> arch/arm64/kvm/arm.c | 34 ++++++++++++++++++++++++++++++++
> arch/arm64/kvm/hyp/nvhe/setup.c | 11 +++++++++++
> 3 files changed, 46 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 2e277f2ed671..ad31ac68264f 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -174,6 +174,7 @@ struct kvm_nvhe_init_params {
> unsigned long hcr_el2;
> unsigned long vttbr;
> unsigned long vtcr;
> + unsigned long stacktrace_hyp_va;
> };
>
> /* Translate a kernel address @ptr into its equivalent linear mapping */
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 400bb0fe2745..c0a936a7623d 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -50,6 +50,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
> DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
>
> static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
> +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stacktrace_page);

Why isn't this static, since the address is passed via the
kvm_nvhe_init_params block?

> unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
> DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
>
> @@ -1554,6 +1555,7 @@ static void cpu_prepare_hyp_mode(int cpu)
> tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
> params->tcr_el2 = tcr;
>
> + params->stacktrace_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
> params->pgd_pa = kvm_mmu_get_httbr();
> if (is_protected_kvm_enabled())
> params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
> @@ -1845,6 +1847,7 @@ static void teardown_hyp_mode(void)
> free_hyp_pgds();
> for_each_possible_cpu(cpu) {
> free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
> + free_page(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
> free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
> }
> }
> @@ -1936,6 +1939,23 @@ static int init_hyp_mode(void)
> per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
> }
>
> + /*
> + * Allocate stacktrace pages for Hypervisor-mode.
> + * This is used by the hypervisor to share its stacktrace
> + * with the host on a hyp_panic().
> + */
> + for_each_possible_cpu(cpu) {
> + unsigned long stacktrace_page;
> +
> + stacktrace_page = __get_free_page(GFP_KERNEL);
> + if (!stacktrace_page) {
> + err = -ENOMEM;
> + goto out_err;
> + }
> +
> + per_cpu(kvm_arm_hyp_stacktrace_page, cpu) = stacktrace_page;

I have the same feeling as with the overflow stack. This is
potentially a huge amount of memory: on my test box, with 64k pages,
this is a whole 10MB that I give away for something that is only a
debug facility.

Can this somehow be limited? I don't see it being less than a page as
a problem, as the memory is always shared back with EL1 in the case of
pKVM (and normal KVM doesn't have that problem anyway).

Alternatively, this should be restricted to pKVM. With normal nVHE,
the host should be able to parse the EL2 stack directly with some
offsetting. Actually, this is probably the best option.
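
For reference, the offsetting in question could be as simple as the
sketch below. This is hypothetical: it assumes the host keeps both the
kernel VA of the stack page and the hyp VA recorded in the init params,
and that params->stack_hyp_va still points at the end of the stack's
private VA range.

/*
 * Hypothetical sketch of the offsetting: the host allocated the EL2
 * stack page itself, so it can translate a hyp stack address back to
 * the kernel linear-map alias and walk the frame records directly,
 * with no shared page needed in non-protected mode.
 */
static unsigned long hyp_stack_kern_va(unsigned long hyp_va, int cpu)
{
	struct kvm_nvhe_init_params *params;
	unsigned long kern_base, hyp_base;

	params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
	kern_base = per_cpu(kvm_arm_hyp_stack_page, cpu);
	/* stack_hyp_va is the end of the stack's private VA range */
	hyp_base = params->stack_hyp_va - PAGE_SIZE;

	return kern_base + (hyp_va - hyp_base);
}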

Thanks,

M.

--
Without deviation from the norm, progress is not possible.

2022-06-08 18:46:14

by Kalesh Singh

Subject: Re: [PATCH v3 4/5] KVM: arm64: Allocate shared stacktrace pages

On Wed, Jun 8, 2022 at 2:02 AM Marc Zyngier <[email protected]> wrote:
>
> On Tue, 07 Jun 2022 17:50:46 +0100,
> Kalesh Singh <[email protected]> wrote:
> >
> > The nVHE hypervisor can use this shared area to dump its stacktrace
> > addresses on hyp_panic(). Symbolizing and printing the stacktrace can
> > then be handled by the host in EL1 (done in a later patch in this series).
> >
> > Signed-off-by: Kalesh Singh <[email protected]>
> > ---
> > arch/arm64/include/asm/kvm_asm.h | 1 +
> > arch/arm64/kvm/arm.c | 34 ++++++++++++++++++++++++++++++++
> > arch/arm64/kvm/hyp/nvhe/setup.c | 11 +++++++++++
> > 3 files changed, 46 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> > index 2e277f2ed671..ad31ac68264f 100644
> > --- a/arch/arm64/include/asm/kvm_asm.h
> > +++ b/arch/arm64/include/asm/kvm_asm.h
> > @@ -174,6 +174,7 @@ struct kvm_nvhe_init_params {
> > unsigned long hcr_el2;
> > unsigned long vttbr;
> > unsigned long vtcr;
> > + unsigned long stacktrace_hyp_va;
> > };
> >
> > /* Translate a kernel address @ptr into its equivalent linear mapping */
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 400bb0fe2745..c0a936a7623d 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -50,6 +50,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
> > DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
> >
> > static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
> > +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stacktrace_page);
>
> Why isn't this static, since the address is passed via the
> kvm_nvhe_init_params block?

In the subsequent patch, the host will need to read and symbolize the
addresses that the hypervisor dumped; that is done in stacktrace.c.
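
A minimal sketch of what that host-side pass could look like
(hypothetical: the function name is made up, and it assumes the saved
entries have already been translated to kernel VAs so that %pB can
symbolize them):

/*
 * Hypothetical host-side dump: walk the zero-terminated array of
 * return addresses the hypervisor left in the shared page and let
 * printk's %pB format symbolize each one.
 */
static void kvm_nvhe_dump_stacktrace(unsigned long stacktrace_va)
{
	unsigned long *entry = (unsigned long *)stacktrace_va;
	unsigned long *end = entry + PAGE_SIZE / sizeof(*entry);

	kvm_err("nVHE HYP call trace:\n");
	for (; entry < end && *entry; entry++)
		kvm_err(" [<%016lx>] %pB\n", *entry, (void *)*entry);
}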

>
> > unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
> > DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
> >
> > @@ -1554,6 +1555,7 @@ static void cpu_prepare_hyp_mode(int cpu)
> > tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
> > params->tcr_el2 = tcr;
> >
> > + params->stacktrace_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
> > params->pgd_pa = kvm_mmu_get_httbr();
> > if (is_protected_kvm_enabled())
> > params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
> > @@ -1845,6 +1847,7 @@ static void teardown_hyp_mode(void)
> > free_hyp_pgds();
> > for_each_possible_cpu(cpu) {
> > free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
> > + free_page(per_cpu(kvm_arm_hyp_stacktrace_page, cpu));
> > free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
> > }
> > }
> > @@ -1936,6 +1939,23 @@ static int init_hyp_mode(void)
> > per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
> > }
> >
> > + /*
> > + * Allocate stacktrace pages for Hypervisor-mode.
> > + * This is used by the hypervisor to share its stacktrace
> > + * with the host on a hyp_panic().
> > + */
> > + for_each_possible_cpu(cpu) {
> > + unsigned long stacktrace_page;
> > +
> > + stacktrace_page = __get_free_page(GFP_KERNEL);
> > + if (!stacktrace_page) {
> > + err = -ENOMEM;
> > + goto out_err;
> > + }
> > +
> > + per_cpu(kvm_arm_hyp_stacktrace_page, cpu) = stacktrace_page;
>
> I have the same feeling as with the overflow stack. This is
> potentially a huge amount of memory: on my test box, with 64k pages,
> this is a whole 10MB that I give away for something that is only a
> debug facility.
>
> Can this somehow be limited? I don't see it being less than a page as
> a problem, as the memory is always shared back with EL1 in the case of
> pKVM (and normal KVM doesn't have that problem anyway).
>
> Alternatively, this should be restricted to pKVM. With normal nVHE,
> the host should be able to parse the EL2 stack directly with some
> offsetting. Actually, this is probably the best option.

In a previous approach we had something similar to what you propose
(unwinding in EL1) [1]. But that doesn't work for our production use
case with protected mode, since it disables the stage 2 protection. We
could do the split and take a different approach for the protected and
unprotected modes, but I'm not sure it is worth the extra complexity.
If it is the memory usage you are concerned about, it could be reduced
to half of the current stack size (page size) [2] if we want to retain
all frames. But we could also impose some max depth, say 16? One
downside is that it could make stack overflows more challenging to
root cause.


[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://lore.kernel.org/r/CAC_TJvdCuGNEJC4M+bV6o48CSJRs_4GEUb3iiP_4ro79q=KesA@mail.gmail.com/
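
For concreteness, a sizing sketch of those two options (the macro names
are hypothetical):

/*
 * Hypothetical sizing sketch. A stack of PAGE_SIZE bytes holds at
 * most PAGE_SIZE / 16 frame records, since each AAPCS64 record
 * ({fp, lr}) is 16 bytes; saving one 8-byte PC per frame therefore
 * needs at most PAGE_SIZE / 2 bytes [2]. Alternatively, cap the
 * trace at a fixed depth plus a zero terminator.
 */
#define NVHE_STACKTRACE_SIZE		(PAGE_SIZE / 2)

#define NVHE_STACKTRACE_MAX_ENTRIES	16
#define NVHE_STACKTRACE_BUF_SIZE \
	((NVHE_STACKTRACE_MAX_ENTRIES + 1) * sizeof(unsigned long))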

Thanks
Kalesh

>
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.