hyp_alloc_private_va_range() can be used to reserve private VA ranges
in the nVHE hypervisor. Allocations are aligned based on the order of
the requested size.
This will be used to implement stack guard pages for the KVM nVHE
hypervisor (nVHE Hyp mode, not pKVM) in a subsequent patch in the series.
Signed-off-by: Kalesh Singh <[email protected]>
---
Changes in v5:
- Align private allocations based on the order of their size, per Marc
Changes in v4:
- Handle null ptr in hyp_alloc_private_va_range() and replace
IS_ERR_OR_NULL checks in callers with IS_ERR checks, per Fuad
- Fix kernel-doc comments format, per Fuad
Changes in v3:
- Handle null ptr in IS_ERR_OR_NULL checks, per Mark
arch/arm64/include/asm/kvm_mmu.h | 1 +
arch/arm64/kvm/mmu.c | 63 +++++++++++++++++++++-----------
2 files changed, 42 insertions(+), 22 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 81839e9a8a24..514cfee76597 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -153,6 +153,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
int kvm_share_hyp(void *from, void *to);
void kvm_unshare_hyp(void *from, void *to);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+unsigned long hyp_alloc_private_va_range(size_t size);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
void __iomem **haddr);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index bc2aba953299..ccb2847ee2f4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -457,22 +457,17 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr,
- enum kvm_pgtable_prot prot)
+
+/**
+ * hyp_alloc_private_va_range - Allocates a private VA range.
+ * @size: The size of the VA range to reserve.
+ *
+ * The private VA range is allocated below io_map_base and
+ * aligned based on the order of @size.
+ */
+unsigned long hyp_alloc_private_va_range(size_t size)
{
unsigned long base;
- int ret = 0;
-
- if (!kvm_host_owns_hyp_mappings()) {
- base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
- phys_addr, size, prot);
- if (IS_ERR_OR_NULL((void *)base))
- return PTR_ERR((void *)base);
- *haddr = base;
-
- return 0;
- }
mutex_lock(&kvm_hyp_pgd_mutex);
@@ -484,29 +479,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
*
* The allocated size is always a multiple of PAGE_SIZE.
*/
- size = PAGE_ALIGN(size + offset_in_page(phys_addr));
- base = io_map_base - size;
+ base = io_map_base - PAGE_ALIGN(size);
+
+ /* Align the allocation based on the order of its size */
+ base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
/*
* Verify that BIT(VA_BITS - 1) hasn't been flipped by
* allocating the new area, as it would indicate we've
* overflowed the idmap/IO address range.
*/
- if ((base ^ io_map_base) & BIT(VA_BITS - 1))
- ret = -ENOMEM;
+ if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
+ base = (unsigned long)ERR_PTR(-ENOMEM);
else
io_map_base = base;
mutex_unlock(&kvm_hyp_pgd_mutex);
- if (ret)
- goto out;
+ return base;
+}
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
+{
+ unsigned long addr;
+ int ret = 0;
+
+ if (!kvm_host_owns_hyp_mappings()) {
+ addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+ phys_addr, size, prot);
+ if (IS_ERR((void *)addr))
+ return PTR_ERR((void *)addr);
+ *haddr = addr;
+
+ return 0;
+ }
+
+ size += offset_in_page(phys_addr);
+ addr = hyp_alloc_private_va_range(size);
+ if (IS_ERR((void *)addr))
+ return PTR_ERR((void *)addr);
- ret = __create_hyp_mappings(base, size, phys_addr, prot);
+ ret = __create_hyp_mappings(addr, size, phys_addr, prot);
if (ret)
goto out;
- *haddr = base + offset_in_page(phys_addr);
+ *haddr = addr + offset_in_page(phys_addr);
out:
return ret;
}
--
2.35.1.616.g0bdcbb4464-goog
Quoting Kalesh Singh (2022-03-07 10:48:59)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index bc2aba953299..ccb2847ee2f4 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -457,22 +457,17 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
> return 0;
> }
>
> -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> - unsigned long *haddr,
> - enum kvm_pgtable_prot prot)
> +
> +/**
> + * hyp_alloc_private_va_range - Allocates a private VA range.
> + * @size: The size of the VA range to reserve.
> + *
> + * The private VA range is allocated below io_map_base and
> + * aligned based on the order of @size.
Add what it returns?
Return: Start address of allocated VA range or some error value... (I don't
understand this part).
It may also be a good idea to write out what VA is in the description:
The private virtual address (VA) range is allocated below io_map_base
> + */
> +unsigned long hyp_alloc_private_va_range(size_t size)
> {
> unsigned long base;
> - int ret = 0;
> -
> - if (!kvm_host_owns_hyp_mappings()) {
> - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> - phys_addr, size, prot);
> - if (IS_ERR_OR_NULL((void *)base))
> - return PTR_ERR((void *)base);
> - *haddr = base;
> -
> - return 0;
> - }
>
> mutex_lock(&kvm_hyp_pgd_mutex);
>
> @@ -484,29 +479,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> *
> * The allocated size is always a multiple of PAGE_SIZE.
> */
> - size = PAGE_ALIGN(size + offset_in_page(phys_addr));
> - base = io_map_base - size;
> + base = io_map_base - PAGE_ALIGN(size);
> +
> + /* Align the allocation based on the order of its size */
> + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
>
> /*
> * Verify that BIT(VA_BITS - 1) hasn't been flipped by
> * allocating the new area, as it would indicate we've
> * overflowed the idmap/IO address range.
> */
> - if ((base ^ io_map_base) & BIT(VA_BITS - 1))
> - ret = -ENOMEM;
> + if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
> + base = (unsigned long)ERR_PTR(-ENOMEM);
It looks odd to use an error pointer cast to unsigned long to return
from an address allocation function. Why not pass a pointer for base
like the function was written before and return an int from this
function with 0 for success and negative error value? Otherwise some
sort of define should be made like DMA_MAPPING_ERROR and that can be used
to indicate to the caller that the allocation failed, or a simple zero
may work?
> else
> io_map_base = base;
>
> mutex_unlock(&kvm_hyp_pgd_mutex);
>
> - if (ret)
> - goto out;
> + return base;
> +}
> +
> +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> + unsigned long *haddr,
> + enum kvm_pgtable_prot prot)
> +{
> + unsigned long addr;
> + int ret = 0;
> +
> + if (!kvm_host_owns_hyp_mappings()) {
> + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> + phys_addr, size, prot);
> + if (IS_ERR((void *)addr))
IS_ERR_VALUE()?
> + return PTR_ERR((void *)addr);
> + *haddr = addr;
> +
> + return 0;
> + }
> +
> + size += offset_in_page(phys_addr);
> + addr = hyp_alloc_private_va_range(size);
> + if (IS_ERR((void *)addr))
IS_ERR_VALUE()?
> + return PTR_ERR((void *)addr);
>
> - ret = __create_hyp_mappings(base, size, phys_addr, prot);
> + ret = __create_hyp_mappings(addr, size, phys_addr, prot);
> if (ret)
> goto out;
>
> - *haddr = base + offset_in_page(phys_addr);
> + *haddr = addr + offset_in_page(phys_addr);
> out:
> return ret;
Would be simpler to remove the goto, or return early.
if (!ret)
*haddr = addr + offset_in_page(phys_addr);
return ret;
> }
On Tue, Mar 8, 2022 at 12:21 PM Stephen Boyd <[email protected]> wrote:
>
Hi Stephen. Thanks for the review.
> Quoting Kalesh Singh (2022-03-07 10:48:59)
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index bc2aba953299..ccb2847ee2f4 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -457,22 +457,17 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
> > return 0;
> > }
> >
> > -static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > - unsigned long *haddr,
> > - enum kvm_pgtable_prot prot)
> > +
> > +/**
> > + * hyp_alloc_private_va_range - Allocates a private VA range.
> > + * @size: The size of the VA range to reserve.
> > + *
> > + * The private VA range is allocated below io_map_base and
> > + * aligned based on the order of @size.
>
> Add what it returns?
>
> Return: Start address of allocated VA range or some error value... (I don't
> understand this part).
>
> It may also be a good idea to write out what VA is in the description:
>
> The private virtual address (VA) range is allocated below io_map_base
>
Ack
> > + */
> > +unsigned long hyp_alloc_private_va_range(size_t size)
> > {
> > unsigned long base;
> > - int ret = 0;
> > -
> > - if (!kvm_host_owns_hyp_mappings()) {
> > - base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> > - phys_addr, size, prot);
> > - if (IS_ERR_OR_NULL((void *)base))
> > - return PTR_ERR((void *)base);
> > - *haddr = base;
> > -
> > - return 0;
> > - }
> >
> > mutex_lock(&kvm_hyp_pgd_mutex);
> >
> > @@ -484,29 +479,53 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > *
> > * The allocated size is always a multiple of PAGE_SIZE.
> > */
> > - size = PAGE_ALIGN(size + offset_in_page(phys_addr));
> > - base = io_map_base - size;
> > + base = io_map_base - PAGE_ALIGN(size);
> > +
> > + /* Align the allocation based on the order of its size */
> > + base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
> >
> > /*
> > * Verify that BIT(VA_BITS - 1) hasn't been flipped by
> > * allocating the new area, as it would indicate we've
> > * overflowed the idmap/IO address range.
> > */
> > - if ((base ^ io_map_base) & BIT(VA_BITS - 1))
> > - ret = -ENOMEM;
> > + if (!base || (base ^ io_map_base) & BIT(VA_BITS - 1))
> > + base = (unsigned long)ERR_PTR(-ENOMEM);
>
> It looks odd to use an error pointer casted to unsigned long to return
> from an address allocation function. Why not pass a pointer for base
> like the function was written before and return an int from this
> > function with 0 for success and negative error value? Otherwise some
> sort of define should made like DMA_MAPPING_ERROR and that can be used
> to indicate to the caller that the allocation failed, or a simple zero
> may work?
I wanted to keep consistent between the pkvm and traditional nvhe
code. I will refactor both *alloc_private_va_range() functions to take
a pointer and return an int error if that's preferred. There would
still be a case of this kind of cast in
__pkvm_create_private_mapping() which does return an unsigned long
address or ERR_PTR(...). It looks like it was made to return the
address to facilitate use as a hypercall (@Quentin CMIW).
>
> > else
> > io_map_base = base;
> >
> > mutex_unlock(&kvm_hyp_pgd_mutex);
> >
> > - if (ret)
> > - goto out;
> > + return base;
> > +}
> > +
> > +static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
> > + unsigned long *haddr,
> > + enum kvm_pgtable_prot prot)
> > +{
> > + unsigned long addr;
> > + int ret = 0;
> > +
> > + if (!kvm_host_owns_hyp_mappings()) {
> > + addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
> > + phys_addr, size, prot);
> > + if (IS_ERR((void *)addr))
>
> IS_ERR_VALUE()?
Good idea, will remove the extra cast.
>
> > + return PTR_ERR((void *)addr);
> > + *haddr = addr;
> > +
> > + return 0;
> > + }
> > +
> > + size += offset_in_page(phys_addr);
> > + addr = hyp_alloc_private_va_range(size);
> > + if (IS_ERR((void *)addr))
>
> IS_ERR_VALUE()?
Ack
>
> > + return PTR_ERR((void *)addr);
> >
> > - ret = __create_hyp_mappings(base, size, phys_addr, prot);
> > + ret = __create_hyp_mappings(addr, size, phys_addr, prot);
> > if (ret)
> > goto out;
> >
> > - *haddr = base + offset_in_page(phys_addr);
> > + *haddr = addr + offset_in_page(phys_addr);
> > out:
> > return ret;
>
> Would be simpler to remove the goto, or return early.
>
> if (!ret)
> *haddr = addr + offset_in_page(phys_addr);
>
> return ret;
Agreed, I'll remove the goto in the next version.
Thanks,
Kalesh
>
> > }
On Wed, Mar 9, 2022 at 8:50 AM Quentin Perret <[email protected]> wrote:
>
> On Tuesday 08 Mar 2022 at 15:09:18 (-0800), Kalesh Singh wrote:
> > On Tue, Mar 8, 2022 at 12:21 PM Stephen Boyd <[email protected]> wrote:
> > > It looks odd to use an error pointer casted to unsigned long to return
> > > from an address allocation function. Why not pass a pointer for base
> > > like the function was written before and return an int from this
> > > function with 0 for success and negative error value? Otherwise some
> > > sort of define should made like DMA_MAPPING_ERROR and that can be used
> > > to indicate to the caller that the allocation failed, or a simple zero
> > > may work?
> >
> > I wanted to keep consistent between the pkvm and traditional nvhe
> > code. I will refactor both *alloc_private_va_range() functions to take
> > a pointer and return an int error if that's preferred. There would
> > still be a case of this kind of cast in
> > __pkvm_create_private_mapping() which does return an unsigned long
> > address or ERR_PTR(...). It looks like it was made to return the
> > address to facilitate use as a hypercall (@Quentin CMIW).
>
> Yep, passing everything by value was much easier to cross the EL1/EL2
> boundary as that avoids having the hypervisor map kernel memory and all
> that fun. But Stephen's point is fair, so no objection from me to keep this
> little dance confined to the hypercall wrapper and make the function
> signature nicer and easier to use for the rest of the code.
Thanks for clarifying Quentin. That sounds good to me.
- Kalesh
>
> Cheers,
> Quentin
On Tuesday 08 Mar 2022 at 15:09:18 (-0800), Kalesh Singh wrote:
> On Tue, Mar 8, 2022 at 12:21 PM Stephen Boyd <[email protected]> wrote:
> > It looks odd to use an error pointer casted to unsigned long to return
> > from an address allocation function. Why not pass a pointer for base
> > like the function was written before and return an int from this
> > function with 0 for success and negative error value? Otherwise some
> > sort of define should made like DMA_MAPPING_ERROR and that can be used
> > to indicate to the caller that the allocation failed, or a simple zero
> > may work?
>
> I wanted to keep consistent between the pkvm and traditional nvhe
> code. I will refactor both *alloc_private_va_range() functions to take
> a pointer and return an int error if that's preferred. There would
> still be a case of this kind of cast in
> __pkvm_create_private_mapping() which does return an unsigned long
> address or ERR_PTR(...). It looks like it was made to return the
> address to facilitate use as a hypercall (@Quentin CMIW).
Yep, passing everything by value was much easier to cross the EL1/EL2
boundary as that avoids having the hypervisor map kernel memory and all
that fun. But Stephen's point is fair, so no objection from me to keep this
little dance confined to the hypercall wrapper and make the function
signature nicer and easier to use for the rest of the code.
Cheers,
Quentin