There is no real urgency to free a stage-2 subtree that was pruned.
Nonetheless, KVM does the teardown in the stage-2 fault path while
holding the MMU lock.
Free removed stage-2 subtrees after an RCU grace period. To guarantee
all stage-2 table pages are freed before killing a VM, add an
rcu_barrier() to the flush path.
Signed-off-by: Oliver Upton <[email protected]>
---
arch/arm64/kvm/mmu.c | 35 ++++++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 91521f4aab97..265951c05879 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -97,6 +97,38 @@ static void *stage2_memcache_zalloc_page(void *arg)
return kvm_mmu_memory_cache_alloc(mc);
}
+#define STAGE2_PAGE_PRIVATE_LEVEL_MASK GENMASK_ULL(2, 0)
+
+static inline unsigned long stage2_page_private(u32 level, void *arg)
+{
+ unsigned long pvt = (unsigned long)arg;
+
+ BUILD_BUG_ON(KVM_PGTABLE_MAX_LEVELS > STAGE2_PAGE_PRIVATE_LEVEL_MASK);
+ WARN_ON_ONCE(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
+
+ return pvt | level;
+}
+
+static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+ unsigned long pvt = page_private(page);
+ void *arg = (void *)(pvt & ~STAGE2_PAGE_PRIVATE_LEVEL_MASK);
+ u32 level = (u32)(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
+ void *pgtable = page_to_virt(page);
+
+ kvm_pgtable_stage2_free_removed(pgtable, level, arg);
+}
+
+static void stage2_free_removed_table(void *pgtable, u32 level, void *arg)
+{
+ unsigned long pvt = stage2_page_private(level, arg);
+ struct page *page = virt_to_page(pgtable);
+
+ set_page_private(page, (unsigned long)pvt);
+ call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+}
+
static void *kvm_host_zalloc_pages_exact(size_t size)
{
return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
@@ -627,7 +659,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_host_zalloc_pages_exact,
.free_pages_exact = free_pages_exact,
- .free_removed_table = kvm_pgtable_stage2_free_removed,
+ .free_removed_table = stage2_free_removed_table,
.get_page = kvm_host_get_page,
.put_page = kvm_host_put_page,
.page_count = kvm_host_page_count,
@@ -770,6 +802,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
if (pgt) {
kvm_pgtable_stage2_destroy(pgt);
kfree(pgt);
+ rcu_barrier();
}
}
--
2.37.2.672.g94769d06f0-goog
On Tue, Aug 30, 2022 at 07:41:27PM +0000, Oliver Upton wrote:
> There is no real urgency to free a stage-2 subtree that was pruned.
> Nonetheless, KVM does the teardown in the stage-2 fault path while
> holding the MMU lock.
>
> Free removed stage-2 subtrees after an RCU grace period. To guarantee
> all stage-2 table pages are freed before killing a VM, add an
> rcu_barrier() to the flush path.
>
> Signed-off-by: Oliver Upton <[email protected]>
> ---
> arch/arm64/kvm/mmu.c | 35 ++++++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 91521f4aab97..265951c05879 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -97,6 +97,38 @@ static void *stage2_memcache_zalloc_page(void *arg)
> return kvm_mmu_memory_cache_alloc(mc);
> }
>
> +#define STAGE2_PAGE_PRIVATE_LEVEL_MASK GENMASK_ULL(2, 0)
> +
> +static inline unsigned long stage2_page_private(u32 level, void *arg)
> +{
> + unsigned long pvt = (unsigned long)arg;
> +
> + BUILD_BUG_ON(KVM_PGTABLE_MAX_LEVELS > STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + WARN_ON_ONCE(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> +
> + return pvt | level;
> +}
> +
> +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
> +{
> + struct page *page = container_of(head, struct page, rcu_head);
> + unsigned long pvt = page_private(page);
> + void *arg = (void *)(pvt & ~STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + u32 level = (u32)(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + void *pgtable = page_to_virt(page);
> +
> + kvm_pgtable_stage2_free_removed(pgtable, level, arg);
> +}
> +
> +static void stage2_free_removed_table(void *pgtable, u32 level, void *arg)
> +{
> + unsigned long pvt = stage2_page_private(level, arg);
> + struct page *page = virt_to_page(pgtable);
> +
> + set_page_private(page, (unsigned long)pvt);
> + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
> +}
> +
> static void *kvm_host_zalloc_pages_exact(size_t size)
> {
> return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> @@ -627,7 +659,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
> .zalloc_page = stage2_memcache_zalloc_page,
> .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
> .free_pages_exact = free_pages_exact,
> - .free_removed_table = kvm_pgtable_stage2_free_removed,
> + .free_removed_table = stage2_free_removed_table,
> .get_page = kvm_host_get_page,
> .put_page = kvm_host_put_page,
> .page_count = kvm_host_page_count,
> @@ -770,6 +802,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> if (pgt) {
> kvm_pgtable_stage2_destroy(pgt);
> kfree(pgt);
> + rcu_barrier();
A comment here would be useful to document the behavior, e.g.:
/*
* Wait for all stage-2 page tables that are being freed
* asynchronously via RCU callback because ...
*/
Speaking of, what's the reason for this rcu_barrier()? Is there any
reason why KVM can't let in-flight stage-2 freeing RCU callbacks run at
the end of the next grace period?
> }
> }
>
> --
> 2.37.2.672.g94769d06f0-goog
>
On Wed, Sep 07, 2022 at 03:00:18PM -0700, David Matlack wrote:
> On Tue, Aug 30, 2022 at 07:41:27PM +0000, Oliver Upton wrote:
> > There is no real urgency to free a stage-2 subtree that was pruned.
> > Nonetheless, KVM does the teardown in the stage-2 fault path while
> > holding the MMU lock.
> >
> > Free removed stage-2 subtrees after an RCU grace period. To guarantee
> > all stage-2 table pages are freed before killing a VM, add an
> > rcu_barrier() to the flush path.
> >
> > Signed-off-by: Oliver Upton <[email protected]>
> > ---
> > arch/arm64/kvm/mmu.c | 35 ++++++++++++++++++++++++++++++++++-
> > 1 file changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 91521f4aab97..265951c05879 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -97,6 +97,38 @@ static void *stage2_memcache_zalloc_page(void *arg)
> > return kvm_mmu_memory_cache_alloc(mc);
> > }
> >
> > +#define STAGE2_PAGE_PRIVATE_LEVEL_MASK GENMASK_ULL(2, 0)
> > +
> > +static inline unsigned long stage2_page_private(u32 level, void *arg)
> > +{
> > + unsigned long pvt = (unsigned long)arg;
> > +
> > + BUILD_BUG_ON(KVM_PGTABLE_MAX_LEVELS > STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> > + WARN_ON_ONCE(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> > +
> > + return pvt | level;
> > +}
> > +
> > +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
> > +{
> > + struct page *page = container_of(head, struct page, rcu_head);
> > + unsigned long pvt = page_private(page);
> > + void *arg = (void *)(pvt & ~STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> > + u32 level = (u32)(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> > + void *pgtable = page_to_virt(page);
> > +
> > + kvm_pgtable_stage2_free_removed(pgtable, level, arg);
> > +}
> > +
> > +static void stage2_free_removed_table(void *pgtable, u32 level, void *arg)
> > +{
> > + unsigned long pvt = stage2_page_private(level, arg);
> > + struct page *page = virt_to_page(pgtable);
> > +
> > + set_page_private(page, (unsigned long)pvt);
> > + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
> > +}
> > +
> > static void *kvm_host_zalloc_pages_exact(size_t size)
> > {
> > return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > @@ -627,7 +659,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
> > .zalloc_page = stage2_memcache_zalloc_page,
> > .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
> > .free_pages_exact = free_pages_exact,
> > - .free_removed_table = kvm_pgtable_stage2_free_removed,
> > + .free_removed_table = stage2_free_removed_table,
> > .get_page = kvm_host_get_page,
> > .put_page = kvm_host_put_page,
> > .page_count = kvm_host_page_count,
> > @@ -770,6 +802,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> > if (pgt) {
> > kvm_pgtable_stage2_destroy(pgt);
> > kfree(pgt);
> > + rcu_barrier();
>
> A comment here would be useful to document the behavior. e.g.
>
> /*
> * Wait for all stage-2 page tables that are being freed
> * asynchronously via RCU callback because ...
> */
>
> Speaking of, what's the reason for this rcu_barrier()? Is there any
> reason why KVM can't let in-flight stage-2 freeing RCU callbacks run at
> the end of the next grace period?
After thinking about this more, I have two follow-up questions:

1. Should the rcu_barrier() come before kvm_pgtable_stage2_destroy() and
   kfree(pgt)? Otherwise an RCU callback running
   kvm_pgtable_stage2_free_removed() could access pgt after it has been
   freed (see the sketch after this list).

2. In general, is it safe for kvm_pgtable_stage2_free_removed() to run
   outside of the MMU lock? Yes, the page tables have already been
   disconnected from the tree, but kvm_pgtable_stage2_free_removed()
   also accesses shared data structures like struct kvm_pgtable. I
   *think* it might be safe after you fix (1), but it would be more
   robust to avoid accessing shared data structures at all outside of
   the MMU lock and just do the page table freeing in the RCU callback.
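
To illustrate (1), here is an untested sketch of the ordering I have in
mind for kvm_free_stage2_pgd(); everything other than moving the
rcu_barrier() is unchanged from your patch:

	if (pgt) {
		/*
		 * Drain any in-flight stage2_free_removed_table_rcu_cb()
		 * callbacks before tearing down the structures they
		 * dereference.
		 */
		rcu_barrier();
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
	}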
>
> > }
> > }
> >
> > --
> > 2.37.2.672.g94769d06f0-goog
> >
Hi Oliver,
On Tue, Aug 30, 2022 at 07:41:27PM +0000, Oliver Upton wrote:
> There is no real urgency to free a stage-2 subtree that was pruned.
> Nonetheless, KVM does the teardown in the stage-2 fault path while
> holding the MMU lock.
>
> Free removed stage-2 subtrees after an RCU grace period. To guarantee
> all stage-2 table pages are freed before killing a VM, add an
> rcu_barrier() to the flush path.
>
> Signed-off-by: Oliver Upton <[email protected]>
> ---
> arch/arm64/kvm/mmu.c | 35 ++++++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 91521f4aab97..265951c05879 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -97,6 +97,38 @@ static void *stage2_memcache_zalloc_page(void *arg)
> return kvm_mmu_memory_cache_alloc(mc);
> }
>
> +#define STAGE2_PAGE_PRIVATE_LEVEL_MASK GENMASK_ULL(2, 0)
> +
> +static inline unsigned long stage2_page_private(u32 level, void *arg)
> +{
> + unsigned long pvt = (unsigned long)arg;
> +
> + BUILD_BUG_ON(KVM_PGTABLE_MAX_LEVELS > STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + WARN_ON_ONCE(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
If the pgt pointer (arg) is not aligned for some reason, I think it
might be better to BUG_ON(). Alternatively, why not try passing a new
struct (with level and arg) that's freed by the RCU callback?
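
For concreteness, an untested sketch of that alternative (the struct
name and GFP choice are just placeholders; the fault path likely needs
an atomic allocation, hence GFP_ATOMIC here):

	/* Hypothetical container, instead of packing bits into page_private(). */
	struct stage2_removed_table {
		struct rcu_head rcu;
		void *pgtable;
		u32 level;
		void *arg;
	};

	static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
	{
		struct stage2_removed_table *rt =
			container_of(head, struct stage2_removed_table, rcu);

		kvm_pgtable_stage2_free_removed(rt->pgtable, rt->level, rt->arg);
		kfree(rt);
	}

	static void stage2_free_removed_table(void *pgtable, u32 level, void *arg)
	{
		struct stage2_removed_table *rt = kzalloc(sizeof(*rt), GFP_ATOMIC);

		if (!rt) {
			/* Fall back to freeing synchronously on allocation failure. */
			kvm_pgtable_stage2_free_removed(pgtable, level, arg);
			return;
		}

		rt->pgtable = pgtable;
		rt->level = level;
		rt->arg = arg;
		call_rcu(&rt->rcu, stage2_free_removed_table_rcu_cb);
	}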
> +
> + return pvt | level;
> +}
> +
> +static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
> +{
> + struct page *page = container_of(head, struct page, rcu_head);
> + unsigned long pvt = page_private(page);
> + void *arg = (void *)(pvt & ~STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + u32 level = (u32)(pvt & STAGE2_PAGE_PRIVATE_LEVEL_MASK);
> + void *pgtable = page_to_virt(page);
> +
> + kvm_pgtable_stage2_free_removed(pgtable, level, arg);
> +}
> +
> +static void stage2_free_removed_table(void *pgtable, u32 level, void *arg)
> +{
> + unsigned long pvt = stage2_page_private(level, arg);
> + struct page *page = virt_to_page(pgtable);
> +
> + set_page_private(page, (unsigned long)pvt);
> + call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
> +}
> +
> static void *kvm_host_zalloc_pages_exact(size_t size)
> {
> return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> @@ -627,7 +659,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
> .zalloc_page = stage2_memcache_zalloc_page,
> .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
> .free_pages_exact = free_pages_exact,
> - .free_removed_table = kvm_pgtable_stage2_free_removed,
> + .free_removed_table = stage2_free_removed_table,
> .get_page = kvm_host_get_page,
> .put_page = kvm_host_put_page,
> .page_count = kvm_host_page_count,
> @@ -770,6 +802,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> if (pgt) {
> kvm_pgtable_stage2_destroy(pgt);
> kfree(pgt);
> + rcu_barrier();
> }
> }
>
> --
> 2.37.2.672.g94769d06f0-goog
>