2022-04-26 07:54:34

by Yosry Ahmed

[permalink] [raw]
Subject: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

Count the pages used by KVM in arm64 for page tables in pagetable stats.

Account pages allocated for PTEs in pgtable init functions and
kvm_set_table_pte().

Since most page table pages are freed using put_page(), add a helper
function put_pte_page() that checks if this is the last ref for a pte
page before putting it, and unaccounts stats accordingly.

Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/arm64/kernel/image-vars.h | 3 ++
arch/arm64/kvm/hyp/pgtable.c | 50 +++++++++++++++++++++-------------
2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 241c86b67d01..25bf058714f6 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -143,6 +143,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
/* pKVM static key */
KVM_NVHE_ALIAS(kvm_protected_mode_initialized);

+/* Called by kvm_account_pgtable_pages() to update pagetable stats */
+KVM_NVHE_ALIAS(__mod_lruvec_page_state);
+
#endif /* CONFIG_KVM */

#endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 2cb3867eb7c2..53e13c3313e9 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -152,6 +152,7 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,

WARN_ON(kvm_pte_valid(old));
smp_store_release(ptep, pte);
+ kvm_account_pgtable_pages((void *)childp, +1);
}

static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
@@ -326,6 +327,14 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
return ret;
}

+static void put_pte_page(kvm_pte_t *ptep, struct kvm_pgtable_mm_ops *mm_ops)
+{
+ /* If this is the last page ref, decrement pagetable stats first. */
+ if (!mm_ops->page_count || mm_ops->page_count(ptep) == 1)
+ kvm_account_pgtable_pages((void *)ptep, -1);
+ mm_ops->put_page(ptep);
+}
+
struct hyp_map_data {
u64 phys;
kvm_pte_t attr;
@@ -488,10 +497,10 @@ static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,

dsb(ish);
isb();
- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);

if (childp)
- mm_ops->put_page(childp);
+ put_pte_page(childp, mm_ops);

return 0;
}
@@ -522,6 +531,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
if (!pgt->pgd)
return -ENOMEM;
+ kvm_account_pgtable_pages((void *)pgt->pgd, +1);

pgt->ia_bits = va_bits;
pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
@@ -541,10 +551,10 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (!kvm_pte_valid(pte))
return 0;

- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);

if (kvm_pte_table(pte, level))
- mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
+ put_pte_page(kvm_pte_follow(pte, mm_ops), mm_ops);

return 0;
}
@@ -558,7 +568,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
};

WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
- pgt->mm_ops->put_page(pgt->pgd);
+ put_pte_page(pgt->pgd, pgt->mm_ops);
pgt->pgd = NULL;
}

@@ -694,7 +704,7 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
}

- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);
}

static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
@@ -795,7 +805,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,

if (data->anchor) {
if (stage2_pte_is_counted(pte))
- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);

return 0;
}
@@ -848,8 +858,8 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
childp = kvm_pte_follow(*ptep, mm_ops);
}

- mm_ops->put_page(childp);
- mm_ops->put_page(ptep);
+ put_pte_page(childp, mm_ops);
+ put_pte_page(ptep, mm_ops);

return ret;
}
@@ -962,7 +972,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (!kvm_pte_valid(pte)) {
if (stage2_pte_is_counted(pte)) {
kvm_clear_pte(ptep);
- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);
}
return 0;
}
@@ -988,7 +998,7 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
kvm_granule_size(level));

if (childp)
- mm_ops->put_page(childp);
+ put_pte_page(childp, mm_ops);

return 0;
}
@@ -1177,16 +1187,17 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
enum kvm_pgtable_stage2_flags flags,
kvm_pgtable_force_pte_cb_t force_pte_cb)
{
- size_t pgd_sz;
+ u32 pgd_num;
u64 vtcr = mmu->arch->vtcr;
u32 ia_bits = VTCR_EL2_IPA(vtcr);
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

- pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
- pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
+ pgd_num = kvm_pgd_pages(ia_bits, start_level);
+ pgt->pgd = mm_ops->zalloc_pages_exact(pgd_num * PAGE_SIZE);
if (!pgt->pgd)
return -ENOMEM;
+ kvm_account_pgtable_pages((void *)pgt->pgd, +pgd_num);

pgt->ia_bits = ia_bits;
pgt->start_level = start_level;
@@ -1210,17 +1221,17 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (!stage2_pte_is_counted(pte))
return 0;

- mm_ops->put_page(ptep);
+ put_pte_page(ptep, mm_ops);

if (kvm_pte_table(pte, level))
- mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
+ put_pte_page(kvm_pte_follow(pte, mm_ops), mm_ops);

return 0;
}

void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
- size_t pgd_sz;
+ u32 pgd_num;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
@@ -1229,7 +1240,8 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
};

WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
- pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
+ pgd_num = kvm_pgd_pages(pgt->ia_bits, pgt->start_level);
+ kvm_account_pgtable_pages((void *)pgt->pgd, -pgd_num);
+ pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_num * PAGE_SIZE);
pgt->pgd = NULL;
}
--
2.36.0.rc2.479.g8af0fa9b8e-goog


2022-04-26 12:12:50

by Oliver Upton

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

Hi Yosry,

On Tue, Apr 26, 2022 at 05:39:02AM +0000, Yosry Ahmed wrote:
> Count the pages used by KVM in arm64 for page tables in pagetable stats.
>
> Account pages allocated for PTEs in pgtable init functions and
> kvm_set_table_pte().
>
> Since most page table pages are freed using put_page(), add a helper
> function put_pte_page() that checks if this is the last ref for a pte
> page before putting it, and unaccounts stats accordingly.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
> arch/arm64/kernel/image-vars.h | 3 ++
> arch/arm64/kvm/hyp/pgtable.c | 50 +++++++++++++++++++++-------------
> 2 files changed, 34 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> index 241c86b67d01..25bf058714f6 100644
> --- a/arch/arm64/kernel/image-vars.h
> +++ b/arch/arm64/kernel/image-vars.h
> @@ -143,6 +143,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
> /* pKVM static key */
> KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
>
> +/* Called by kvm_account_pgtable_pages() to update pagetable stats */
> +KVM_NVHE_ALIAS(__mod_lruvec_page_state);
> +
> #endif /* CONFIG_KVM */
>
> #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 2cb3867eb7c2..53e13c3313e9 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -152,6 +152,7 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
>
> WARN_ON(kvm_pte_valid(old));
> smp_store_release(ptep, pte);
> + kvm_account_pgtable_pages((void *)childp, +1);

What page tables do we want to account? KVM on ARM manages several page
tables.

For regular KVM, the host kernel manages allocations for the hyp stage 1
tables in addition to the stage 2 tables used for a particular VM. The
former is system overhead whereas the latter could be attributed to a
guest VM.

I imagine protected KVM is out of scope, since it actually manages its
own allocations outside of the host kernel.

Given this, I would recommend adding the accounting hooks to mmu.c as
that is where we alloc/free table pages and it is in the host address
space. kvm_s2_mm_ops and kvm_hyp_mm_ops point to all the relevant
functions, though the latter is only relevant if we want to count system
page tables too.

--
Thanks,
Oliver

2022-04-27 09:21:27

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

On Tue, 26 Apr 2022 06:39:02 +0100,
Yosry Ahmed <[email protected]> wrote:
>
> Count the pages used by KVM in arm64 for page tables in pagetable stats.
>
> Account pages allocated for PTEs in pgtable init functions and
> kvm_set_table_pte().
>
> Since most page table pages are freed using put_page(), add a helper
> function put_pte_page() that checks if this is the last ref for a pte
> page before putting it, and unaccounts stats accordingly.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
> arch/arm64/kernel/image-vars.h | 3 ++
> arch/arm64/kvm/hyp/pgtable.c | 50 +++++++++++++++++++++-------------
> 2 files changed, 34 insertions(+), 19 deletions(-)
>
> diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> index 241c86b67d01..25bf058714f6 100644
> --- a/arch/arm64/kernel/image-vars.h
> +++ b/arch/arm64/kernel/image-vars.h
> @@ -143,6 +143,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
> /* pKVM static key */
> KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
>
> +/* Called by kvm_account_pgtable_pages() to update pagetable stats */
> +KVM_NVHE_ALIAS(__mod_lruvec_page_state);

This cannot be right. It means that this function will be called
directly from the EL2 code when in protected mode, and will result in
extreme fireworks. There is no way you can call core kernel stuff
like this from this context.

Please do not add random symbols to this list just for the sake of
being able to link the kernel.

> +
> #endif /* CONFIG_KVM */
>
> #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 2cb3867eb7c2..53e13c3313e9 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -152,6 +152,7 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
>
> WARN_ON(kvm_pte_valid(old));
> smp_store_release(ptep, pte);
> + kvm_account_pgtable_pages((void *)childp, +1);

Why the + sign?

> }
>
> static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
> @@ -326,6 +327,14 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
> return ret;
> }
>
> +static void put_pte_page(kvm_pte_t *ptep, struct kvm_pgtable_mm_ops *mm_ops)
> +{
> + /* If this is the last page ref, decrement pagetable stats first. */
> + if (!mm_ops->page_count || mm_ops->page_count(ptep) == 1)
> + kvm_account_pgtable_pages((void *)ptep, -1);
> + mm_ops->put_page(ptep);
> +}
> +
> struct hyp_map_data {
> u64 phys;
> kvm_pte_t attr;
> @@ -488,10 +497,10 @@ static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
>
> dsb(ish);
> isb();
> - mm_ops->put_page(ptep);
> + put_pte_page(ptep, mm_ops);
>
> if (childp)
> - mm_ops->put_page(childp);
> + put_pte_page(childp, mm_ops);
>
> return 0;
> }
> @@ -522,6 +531,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
> pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
> if (!pgt->pgd)
> return -ENOMEM;
> + kvm_account_pgtable_pages((void *)pgt->pgd, +1);
>
> pgt->ia_bits = va_bits;
> pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
> @@ -541,10 +551,10 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> if (!kvm_pte_valid(pte))
> return 0;
>
> - mm_ops->put_page(ptep);
> + put_pte_page(ptep, mm_ops);
>
> if (kvm_pte_table(pte, level))
> - mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
> + put_pte_page(kvm_pte_follow(pte, mm_ops), mm_ops);

OK, I see the pattern. I don't think this is workable as such. I'd rather
the callbacks themselves (put_page, zalloc_page*) call into the
accounting code when it makes sense, rather than spreading the
complexity and having to special-case the protected case.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.

2022-04-27 09:33:31

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

Hi Oliver,
Thanks so much for taking the time to take a look at this!

On Tue, Apr 26, 2022 at 12:35 AM Oliver Upton <[email protected]> wrote:
>
> Hi Yosry,
>
> On Tue, Apr 26, 2022 at 05:39:02AM +0000, Yosry Ahmed wrote:
> > Count the pages used by KVM in arm64 for page tables in pagetable stats.
> >
> > Account pages allocated for PTEs in pgtable init functions and
> > kvm_set_table_pte().
> >
> > Since most page table pages are freed using put_page(), add a helper
> > function put_pte_page() that checks if this is the last ref for a pte
> > page before putting it, and unaccounts stats accordingly.
> >
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> > arch/arm64/kernel/image-vars.h | 3 ++
> > arch/arm64/kvm/hyp/pgtable.c | 50 +++++++++++++++++++++-------------
> > 2 files changed, 34 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> > index 241c86b67d01..25bf058714f6 100644
> > --- a/arch/arm64/kernel/image-vars.h
> > +++ b/arch/arm64/kernel/image-vars.h
> > @@ -143,6 +143,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
> > /* pKVM static key */
> > KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
> >
> > +/* Called by kvm_account_pgtable_pages() to update pagetable stats */
> > +KVM_NVHE_ALIAS(__mod_lruvec_page_state);
> > +
> > #endif /* CONFIG_KVM */
> >
> > #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 2cb3867eb7c2..53e13c3313e9 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -152,6 +152,7 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
> >
> > WARN_ON(kvm_pte_valid(old));
> > smp_store_release(ptep, pte);
> > + kvm_account_pgtable_pages((void *)childp, +1);
>
> What page tables do we want to account? KVM on ARM manages several page
> tables.
>
> For regular KVM, the host kernel manages allocations for the hyp stage 1
> tables in addition to the stage 2 tables used for a particular VM. The
> former is system overhead whereas the latter could be attributed to a
> guest VM.

Honestly I would love to get your input on this. The main motivation
here is to give users insights on the kernel memory usage on their
system (or in a cgroup). We currently have NR_PAGETABLE stats for
normal kernel page tables (allocated using
__pte_alloc_one()/pte_free()); these show up in /proc/meminfo,
/path/to/cgroup/memory.stat, and node stats. The idea is to add
NR_SECONDARY_PAGETABLE, which should include the memory used for kvm
pagetables as a separate category (no overlap). What
gets included or not depends on the semantics of KVM and what exactly
falls under the category of secondary pagetables from the user's
point of view.

Currently it looks like s2 page table allocations get accounted to
kmem of memory control groups (GFP_KERNEL_ACCOUNT), while hyp page
table allocations do not (GFP_KERNEL). So we could either follow this
and only account s2 page table allocations in the stats, or make hyp
allocations use GFP_KERNEL_ACCOUNT as well and add them to the stats.
Let me know what you think.

>
> I imagine protected KVM is out of scope, since it actually manages its
> own allocations outside of the host kernel.
>
> Given this, I would recommend adding the accounting hooks to mmu.c as
> that is where we alloc/free table pages and it is in the host address
> space. kvm_s2_mm_ops and kvm_hyp_mm_ops point to all the relevant
> functions, though the latter is only relevant if we want to count system
> page tables too.

Yeah, moving the accounting hooks to mmu.c is much cleaner; I will do
this in the next version. The only reason I did not do this originally
is that I found other kvm_pgtable_mm_ops structs (such as
pkvm_pgtable_mm_ops), but it looks like these may be irrelevant here.

>
> --
> Thanks,
> Oliver

2022-04-27 10:12:34

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

Thanks a lot for taking the time to look at this!

On Tue, Apr 26, 2022 at 8:58 AM Marc Zyngier <[email protected]> wrote:
>
> On Tue, 26 Apr 2022 06:39:02 +0100,
> Yosry Ahmed <[email protected]> wrote:
> >
> > Count the pages used by KVM in arm64 for page tables in pagetable stats.
> >
> > Account pages allocated for PTEs in pgtable init functions and
> > kvm_set_table_pte().
> >
> > Since most page table pages are freed using put_page(), add a helper
> > function put_pte_page() that checks if this is the last ref for a pte
> > page before putting it, and unaccounts stats accordingly.
> >
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> > arch/arm64/kernel/image-vars.h | 3 ++
> > arch/arm64/kvm/hyp/pgtable.c | 50 +++++++++++++++++++++-------------
> > 2 files changed, 34 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> > index 241c86b67d01..25bf058714f6 100644
> > --- a/arch/arm64/kernel/image-vars.h
> > +++ b/arch/arm64/kernel/image-vars.h
> > @@ -143,6 +143,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
> > /* pKVM static key */
> > KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
> >
> > +/* Called by kvm_account_pgtable_pages() to update pagetable stats */
> > +KVM_NVHE_ALIAS(__mod_lruvec_page_state);
>
> This cannot be right. It means that this function will be called
> directly from the EL2 code when in protected mode, and will result in
> extreme fireworks. There is no way you can call core kernel stuff
> like this from this context.
>
> Please do not add random symbols to this list just for the sake of
> being able to link the kernel.

Excuse my ignorance, this is my first time touching kvm code. Thanks a
lot for pointing this out.

>
> > +
> > #endif /* CONFIG_KVM */
> >
> > #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 2cb3867eb7c2..53e13c3313e9 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -152,6 +152,7 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
> >
> > WARN_ON(kvm_pte_valid(old));
> > smp_store_release(ptep, pte);
> > + kvm_account_pgtable_pages((void *)childp, +1);
>
> Why the + sign?

I am following conventions in other existing stat accounting hooks
(e.g. the kvm_mod_used_mmu_pages(vcpu->kvm, +1) call in
arch/x86/kvm/mmu/mmu.c), but I can certainly remove it if you think
that is better.

>
> > }
> >
> > static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
> > @@ -326,6 +327,14 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
> > return ret;
> > }
> >
> > +static void put_pte_page(kvm_pte_t *ptep, struct kvm_pgtable_mm_ops *mm_ops)
> > +{
> > + /* If this is the last page ref, decrement pagetable stats first. */
> > + if (!mm_ops->page_count || mm_ops->page_count(ptep) == 1)
> > + kvm_account_pgtable_pages((void *)ptep, -1);
> > + mm_ops->put_page(ptep);
> > +}
> > +
> > struct hyp_map_data {
> > u64 phys;
> > kvm_pte_t attr;
> > @@ -488,10 +497,10 @@ static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> >
> > dsb(ish);
> > isb();
> > - mm_ops->put_page(ptep);
> > + put_pte_page(ptep, mm_ops);
> >
> > if (childp)
> > - mm_ops->put_page(childp);
> > + put_pte_page(childp, mm_ops);
> >
> > return 0;
> > }
> > @@ -522,6 +531,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
> > pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
> > if (!pgt->pgd)
> > return -ENOMEM;
> > + kvm_account_pgtable_pages((void *)pgt->pgd, +1);
> >
> > pgt->ia_bits = va_bits;
> > pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
> > @@ -541,10 +551,10 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > if (!kvm_pte_valid(pte))
> > return 0;
> >
> > - mm_ops->put_page(ptep);
> > + put_pte_page(ptep, mm_ops);
> >
> > if (kvm_pte_table(pte, level))
> > - mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
> > + put_pte_page(kvm_pte_follow(pte, mm_ops), mm_ops);
>
> OK, I see the pattern. I don't think this workable as such. I'd rather
> the callbacks themselves (put_page, zalloc_page*) call into the
> accounting code when it makes sense, rather than spreading the
> complexity and having to special case the protected case.
>

This makes sense. I am working on moving calls to
kvm_account_pgtable_pages to callbacks in mmu.c in the next version
(stage2_memcache_zalloc_page, kvm_host_put_page, etc).


> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.

2022-04-29 04:40:38

by Oliver Upton

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

On Tue, Apr 26, 2022 at 12:27:57PM -0700, Yosry Ahmed wrote:
> > What page tables do we want to account? KVM on ARM manages several page
> > tables.
> >
> > For regular KVM, the host kernel manages allocations for the hyp stage 1
> > tables in addition to the stage 2 tables used for a particular VM. The
> > former is system overhead whereas the latter could be attributed to a
> > guest VM.
>
> Honestly I would love to get your input on this. The main motivation
> here is to give users insights on the kernel memory usage on their
> system (or in a cgroup). We currently have NR_PAGETABLE stats for
> normal kernel page tables (allocated using
> __pte_alloc_one()/pte_free()), this shows up in /proc/meminfo,
> /path/to/cgroup/memory.stat, and node stats. The idea is to add
> NR_SECONDARY_PAGETABLE that should include the memory used for kvm
> pagetables, which should be a separate category (no overlap). What
> gets included or not depends on the semantics of KVM and what exactly
> falls under the category of secondary pagetables from the user's pov.
>
> Currently it looks like s2 page table allocations get accounted to
> kmem of memory control groups (GFP_KERNEL_ACCOUNT), while hyp page
> table allocations do not (GFP_KERNEL). So we could either follow this
> and only account s2 page table allocations in the stats, or make hyp
> allocations use GFP_KERNEL_ACCOUNT as well and add them to the stats.
> Let me know what you think.

I think it is reasonable to just focus on stage 2 table allocations and
ignore all else. As Marc pointed out, accounting isn't workable in other
contexts anyway (pKVM), and limiting the scope keeps the patch tidy too.

GFP_KERNEL_ACCOUNT for hyp allocations wouldn't make sense, as they are
done at init to build out the system page tables for EL2.

--
Thanks,
Oliver

2022-04-29 22:01:23

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3 4/6] KVM: arm64/mmu: count KVM page table pages in pagetable stats

On Thu, Apr 28, 2022 at 10:45 AM Oliver Upton <[email protected]> wrote:
>
> On Tue, Apr 26, 2022 at 12:27:57PM -0700, Yosry Ahmed wrote:
> > > What page tables do we want to account? KVM on ARM manages several page
> > > tables.
> > >
> > > For regular KVM, the host kernel manages allocations for the hyp stage 1
> > > tables in addition to the stage 2 tables used for a particular VM. The
> > > former is system overhead whereas the latter could be attributed to a
> > > guest VM.
> >
> > Honestly I would love to get your input on this. The main motivation
> > here is to give users insights on the kernel memory usage on their
> > system (or in a cgroup). We currently have NR_PAGETABLE stats for
> > normal kernel page tables (allocated using
> > __pte_alloc_one()/pte_free()), this shows up in /proc/meminfo,
> > /path/to/cgroup/memory.stat, and node stats. The idea is to add
> > NR_SECONDARY_PAGETABLE that should include the memory used for kvm
> > pagetables, which should be a separate category (no overlap). What
> > gets included or not depends on the semantics of KVM and what exactly
> > falls under the category of secondary pagetables from the user's pov.
> >
> > Currently it looks like s2 page table allocations get accounted to
> > kmem of memory control groups (GFP_KERNEL_ACCOUNT), while hyp page
> > table allocations do not (GFP_KERNEL). So we could either follow this
> > and only account s2 page table allocations in the stats, or make hyp
> > allocations use GFP_KERNEL_ACCOUNT as well and add them to the stats.
> > Let me know what you think.
>
> I think it is reasonable to just focus on stage 2 table allocations and
> ignore all else. As Marc pointed out it isn't workable in other
> contexts anyway (pKVM), and keeps the patch tidy too.
>
> GFP_KERNEL_ACCOUNT for hyp allocations wouldn't make sense, as it is
> done at init to build out the system page tables for EL2.

Thanks so much for the insights, will send out v4 according to our discussion.

>
> --
> Thanks,
> Oliver