From: Lai Jiangshan <[email protected]>

Local shadow pages are shadow pages that hold PDPTEs for a 32-bit guest,
or higher-level shadow pages whose children are local shadow pages when
shadowing nested NPT for a 32-bit L1 in a 64-bit L0.

The current code uses mmu->pae_root, mmu->pml4_root, and mmu->pml5_root
to set up the local root pages.  The initialization code is complex, and
the root pages are not associated with a struct kvm_mmu_page, which makes
the code even harder to follow.

Add kvm_mmu_alloc_local_shadow_page() and mmu_free_local_root_page() to
allocate and free local shadow pages, in preparation for using local
shadow pages to replace the current logic and sharing most of the logic
with non-local shadow pages.

The code is not yet activated, since using_local_root_page() is false at
the point where it is inserted.

Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 109 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 108 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 240ebe589caf..c941a5931bc3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1764,6 +1764,76 @@ static bool using_local_root_page(struct kvm_mmu *mmu)
return mmu->cpu_role.base.level <= PT32E_ROOT_LEVEL;
}
+/*
+ * Local shadow pages are shadow pages that hold PDPTEs for a 32-bit guest, or
+ * higher-level shadow pages whose children are local shadow pages when
+ * shadowing nested NPT for a 32-bit L1 in a 64-bit L0.
+ *
+ * Local shadow pages are usually local shadow root pages (or local root pages
+ * for short), except when shadowing nested NPT for a 32-bit L1 in a 64-bit L0,
+ * which uses 2 or 3 levels of local shadow pages on top of non-local ones.
+ *
+ * Local shadow pages are locally allocated.  If the local shadow page's level
+ * is PT32E_ROOT_LEVEL, it uses the preallocated mmu->pae_root for its sp->spt,
+ * because sp->spt may need to fit in a 32-bit CR3 (even on x86_64) or to be
+ * decrypted.  Using the preallocated page to handle these requirements makes
+ * the allocation simpler.
+ *
+ * Local shadow pages are only visible to the local VCPU, except via the
+ * sp->parent_ptes rmap from their children, so they are neither on
+ * kvm->arch.active_mmu_pages nor in the hash.
+ *
+ * And they are neither accounted nor write-protected since they don't shadow a
+ * guest page table.
+ *
+ * Because of the above, local shadow pages can not be freed or zapped like
+ * non-local shadow pages.  They are freed directly when the local root page
+ * is freed, see mmu_free_local_root_page().
+ *
+ * A local root page can not be put on mmu->prev_roots because the comparison
+ * must use PDPTEs instead of CR3, and mmu->pae_root can not be shared for
+ * multiple local root pages.
+ *
+ * Apart from the above limitations, all the other abilities are the same as
+ * for other shadow pages: link, parent rmap, sync, unsync, etc.
+ *
+ * Local shadow pages can be obsoleted in a slightly different way than the
+ * non-local shadow pages.  When the obsoleting process is done, all the
+ * obsoleted non-local shadow pages are unlinked from the local shadow pages
+ * with the help of the sp->parent_ptes rmap, and the local shadow pages
+ * become theoretically valid again, except that sp->mmu_valid_gen may still
+ * be outdated.  If no other event causes the VCPU to free the local root
+ * page, and the VCPU is preempted by the host across two obsoleting
+ * processes, sp->mmu_valid_gen might become valid again and the VCPU can
+ * reuse the page when it is scheduled back in.  This differs from non-local
+ * shadow pages, which are always freed after being obsoleted.
+ */
+static struct kvm_mmu_page *
+kvm_mmu_alloc_local_shadow_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->gfn = 0;
+ sp->role = role;
+ /*
+ * Use the preallocated mmu->pae_root when the shadow page's level is
+ * PT32E_ROOT_LEVEL: the page may need to be put in a 32-bit CR3 (even
+ * on x86_64) or to be decrypted, and the preallocated one is prepared
+ * for those requirements.
+ */
+ if (role.level == PT32E_ROOT_LEVEL &&
+ !WARN_ON_ONCE(!vcpu->arch.mmu->pae_root))
+ sp->spt = vcpu->arch.mmu->pae_root;
+ else
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+ /* sp->gfns is not used for local shadow page */
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+
+ return sp;
+}
+
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
struct kvm_mmu_page *sp;
@@ -2121,6 +2191,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level <= vcpu->arch.mmu->cpu_role.base.level)
role.passthrough = 0;
+ if (unlikely(level >= PT32E_ROOT_LEVEL && using_local_root_page(vcpu->arch.mmu)))
+ return kvm_mmu_alloc_local_shadow_page(vcpu, role);
+
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
@@ -3351,6 +3424,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
*root_hpa = INVALID_PAGE;
}
+static void mmu_free_local_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ u64 spte = mmu->root.hpa;
+ struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+ int i;
+
+ /* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
+ while (sp->role.level > PT32E_ROOT_LEVEL)
+ {
+ spte = sp->spt[0];
+ mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+ if (!is_shadow_present_pte(spte))
+ return;
+ sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+ }
+
+ if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
+ return;
+
+ /* Disconnect PAE root from the 4 PAE page directories */
+ for (i = 0; i < 4; i++)
+ mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
+
+ if (sp->spt != mmu->pae_root)
+ free_page((unsigned long)sp->spt);
+
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
@@ -3384,7 +3488,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
if (free_active_root) {
if (to_shadow_page(mmu->root.hpa)) {
- mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+ if (using_local_root_page(mmu))
+ mmu_free_local_root_page(kvm, mmu);
+ else
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
--
2.19.1.6.gb485710b
On Sat, May 21, 2022 at 09:16:52PM +0800, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> Local shadow pages are shadow pages to hold PDPTEs for 32bit guest or
> higher level shadow pages having children local shadow pages when
> shadowing nested NPT for 32bit L1 in 64 bit L0.
>
> Current code use mmu->pae_root, mmu->pml4_root, and mmu->pml5_root to
> setup local root page. The initialization code is complex and the root
> pages are not associated with struct kvm_mmu_page which causes the code
> more complex.
>
> Add kvm_mmu_alloc_local_shadow_page() and mmu_free_local_root_page() to
> allocate and free local shadow pages and prepare for using local
> shadow pages to replace current logic and share the most logic with
> non-local shadow pages.
>
> The code is not activated since using_local_root_page() is false in
> the place where it is inserted.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/kvm/mmu/mmu.c | 109 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 240ebe589caf..c941a5931bc3 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1764,6 +1764,76 @@ static bool using_local_root_page(struct kvm_mmu *mmu)
> return mmu->cpu_role.base.level <= PT32E_ROOT_LEVEL;
> }
>
> +/*
> + * Local shadow pages are shadow pages to hold PDPTEs for 32bit guest or higher
> + * level shadow pages having children local shadow pages when shadowing nested
> + * NPT for 32bit L1 in 64 bit L0.
> + *
> + * Local shadow pages are often local shadow root pages (or local root pages for
> + * short) except when shadowing nested NPT for 32bit L1 in 64 bit L0 which has
> + * 2 or 3 levels of local shadow pages on top of non-local shadow pages.
> + *
> + * Local shadow pages are locally allocated. If the local shadow page's level
> + * is PT32E_ROOT_LEVEL, it will use the preallocated mmu->pae_root for its
> + * sp->spt. Because sp->spt may need to be put in the 32 bits CR3 (even in
> + * x86_64) or decrypted. Using the preallocated one to handle these
> + * requirements makes the allocation simpler.
> + *
> + * Local shadow pages are only visible to local VCPU except through
> + * sp->parent_ptes rmap from their children, so they are not in the
> + * kvm->arch.active_mmu_pages nor in the hash.
> + *
> + * And they are neither accounted nor write-protected since they don't shadow a
> + * guest page table.
> + *
> + * Because of above, local shadow pages can not be freed nor zapped like
> + * non-local shadow pages. They are freed directly when the local root page
> + * is freed, see mmu_free_local_root_page().
> + *
> + * Local root page can not be put on mmu->prev_roots because the comparison
> + * must use PDPTEs instead of CR3 and mmu->pae_root can not be shared for multi
> + * local root pages.
> + *
> + * Except above limitations, all the other abilities are the same as other
> + * shadow page, like link, parent rmap, sync, unsync etc.
> + *
> + * Local shadow pages can be obsoleted in a little different way other than
> + * the non-local shadow pages. When the obsoleting process is done, all the
> + * obsoleted non-local shadow pages are unlinked from the local shadow pages
> + * by the help of the sp->parent_ptes rmap and the local shadow pages become
> + * theoretically valid again except sp->mmu_valid_gen may be still outdated.
> + * If there is no other event to cause a VCPU to free the local root page and
> + * the VCPU is being preempted by the host during two obsoleting processes,
> + * sp->mmu_valid_gen might become valid again and the VCPU can reuse it when
> + * the VCPU is back. It is different from the non-local shadow pages which
> + * are always freed after obsoleted.
> + */
> +static struct kvm_mmu_page *
> +kvm_mmu_alloc_local_shadow_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
> +{
> + struct kvm_mmu_page *sp;
> +
> + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> + sp->gfn = 0;
> + sp->role = role;
> + /*
> + * Use the preallocated mmu->pae_root when the shadow page's
> + * level is PT32E_ROOT_LEVEL which may need to be put in the 32 bits
> + * CR3 (even in x86_64) or decrypted. The preallocated one is prepared
> + * for the requirements.
> + */
> + if (role.level == PT32E_ROOT_LEVEL &&
> + !WARN_ON_ONCE(!vcpu->arch.mmu->pae_root))
> + sp->spt = vcpu->arch.mmu->pae_root;
FYI this (and a couple other parts of this series) conflict with Nested
MMU Eager Page Splitting, since it uses struct kvm_vcpu in kvm_mmu_get_page().
Hopefully Paolo can queue Nested MMU Eager Page Splitting for 5.20 so
you can apply this series on top. I think that'd be simpler than trying
to do it the other way around.
> + else
> + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> + /* sp->gfns is not used for local shadow page */
> + set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
> +
> + return sp;
> +}
> +
> static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
> {
> struct kvm_mmu_page *sp;
> @@ -2121,6 +2191,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> if (level <= vcpu->arch.mmu->cpu_role.base.level)
> role.passthrough = 0;
>
> + if (unlikely(level >= PT32E_ROOT_LEVEL && using_local_root_page(vcpu->arch.mmu)))
> + return kvm_mmu_alloc_local_shadow_page(vcpu, role);
> +
> sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> if (sp->gfn != gfn) {
> @@ -3351,6 +3424,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> *root_hpa = INVALID_PAGE;
> }
>
> +static void mmu_free_local_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
> +{
> + u64 spte = mmu->root.hpa;
> + struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + int i;
> +
> + /* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
> + while (sp->role.level > PT32E_ROOT_LEVEL)
> + {
> + spte = sp->spt[0];
> + mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
> + free_page((unsigned long)sp->spt);
> + kmem_cache_free(mmu_page_header_cache, sp);
> + if (!is_shadow_present_pte(spte))
> + return;
> + sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + }
> +
> + if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
> + return;
> +
> + /* Disconnect PAE root from the 4 PAE page directories */
> + for (i = 0; i < 4; i++)
> + mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
> +
> + if (sp->spt != mmu->pae_root)
> + free_page((unsigned long)sp->spt);
> +
> + kmem_cache_free(mmu_page_header_cache, sp);
> +}
> +
> /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
> ulong roots_to_free)
> @@ -3384,7 +3488,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
>
> if (free_active_root) {
> if (to_shadow_page(mmu->root.hpa)) {
> - mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
> + if (using_local_root_page(mmu))
> + mmu_free_local_root_page(kvm, mmu);
> + else
> + mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
> } else if (mmu->pae_root) {
> for (i = 0; i < 4; ++i) {
> if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
> --
> 2.19.1.6.gb485710b
>
On Sat, May 21, 2022 at 09:16:52PM +0800, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> Local shadow pages are shadow pages to hold PDPTEs for 32bit guest or
> higher level shadow pages having children local shadow pages when
> shadowing nested NPT for 32bit L1 in 64 bit L0.
>
> Current code use mmu->pae_root, mmu->pml4_root, and mmu->pml5_root to
> setup local root page. The initialization code is complex and the root
> pages are not associated with struct kvm_mmu_page which causes the code
> more complex.
>
> Add kvm_mmu_alloc_local_shadow_page() and mmu_free_local_root_page() to
> allocate and free local shadow pages and prepare for using local
> shadow pages to replace current logic and share the most logic with
> non-local shadow pages.
>
> The code is not activated since using_local_root_page() is false in
> the place where it is inserted.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/kvm/mmu/mmu.c | 109 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 240ebe589caf..c941a5931bc3 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1764,6 +1764,76 @@ static bool using_local_root_page(struct kvm_mmu *mmu)
> return mmu->cpu_role.base.level <= PT32E_ROOT_LEVEL;
> }
>
> +/*
> + * Local shadow pages are shadow pages to hold PDPTEs for 32bit guest or higher
> + * level shadow pages having children local shadow pages when shadowing nested
> + * NPT for 32bit L1 in 64 bit L0.
> + *
> + * Local shadow pages are often local shadow root pages (or local root pages for
> + * short) except when shadowing nested NPT for 32bit L1 in 64 bit L0 which has
> + * 2 or 3 levels of local shadow pages on top of non-local shadow pages.
> + *
> + * Local shadow pages are locally allocated. If the local shadow page's level
Can you clarify what you mean by "locally allocated"?
> + * is PT32E_ROOT_LEVEL, it will use the preallocated mmu->pae_root for its
> + * sp->spt. Because sp->spt may need to be put in the 32 bits CR3 (even in
> + * x86_64) or decrypted. Using the preallocated one to handle these
> + * requirements makes the allocation simpler.
> + *
> + * Local shadow pages are only visible to local VCPU except through
> + * sp->parent_ptes rmap from their children, so they are not in the
> + * kvm->arch.active_mmu_pages nor in the hash.
> + *
> + * And they are neither accounted nor write-protected since they don't shadow a
> + * guest page table.
> + *
> + * Because of above, local shadow pages can not be freed nor zapped like
> + * non-local shadow pages. They are freed directly when the local root page
> + * is freed, see mmu_free_local_root_page().
> + *
> + * Local root page can not be put on mmu->prev_roots because the comparison
> + * must use PDPTEs instead of CR3 and mmu->pae_root can not be shared for multi
> + * local root pages.
> + *
> + * Except above limitations, all the other abilities are the same as other
> + * shadow page, like link, parent rmap, sync, unsync etc.
> + *
> + * Local shadow pages can be obsoleted in a little different way other than
> + * the non-local shadow pages. When the obsoleting process is done, all the
> + * obsoleted non-local shadow pages are unlinked from the local shadow pages
> + * by the help of the sp->parent_ptes rmap and the local shadow pages become
> + * theoretically valid again except sp->mmu_valid_gen may be still outdated.
> + * If there is no other event to cause a VCPU to free the local root page and
> + * the VCPU is being preempted by the host during two obsoleting processes,
> + * sp->mmu_valid_gen might become valid again and the VCPU can reuse it when
> + * the VCPU is back. It is different from the non-local shadow pages which
> + * are always freed after obsoleted.
> + */
> +static struct kvm_mmu_page *
> +kvm_mmu_alloc_local_shadow_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
> +{
> + struct kvm_mmu_page *sp;
> +
> + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> + sp->gfn = 0;
> + sp->role = role;
> + /*
> + * Use the preallocated mmu->pae_root when the shadow page's
> + * level is PT32E_ROOT_LEVEL which may need to be put in the 32 bits
> + * CR3 (even in x86_64) or decrypted. The preallocated one is prepared
> + * for the requirements.
Thanks for adding this comment. It helps a lot.
> + */
> + if (role.level == PT32E_ROOT_LEVEL &&
> + !WARN_ON_ONCE(!vcpu->arch.mmu->pae_root))
> + sp->spt = vcpu->arch.mmu->pae_root;
> + else
> + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> + /* sp->gfns is not used for local shadow page */
> + set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
> +
> + return sp;
> +}
> +
> static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
> {
> struct kvm_mmu_page *sp;
> @@ -2121,6 +2191,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> if (level <= vcpu->arch.mmu->cpu_role.base.level)
> role.passthrough = 0;
>
> + if (unlikely(level >= PT32E_ROOT_LEVEL && using_local_root_page(vcpu->arch.mmu)))
> + return kvm_mmu_alloc_local_shadow_page(vcpu, role);
> +
> sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> if (sp->gfn != gfn) {
> @@ -3351,6 +3424,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> *root_hpa = INVALID_PAGE;
> }
>
> +static void mmu_free_local_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
> +{
> + u64 spte = mmu->root.hpa;
> + struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + int i;
> +
> + /* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
> + while (sp->role.level > PT32E_ROOT_LEVEL)
> + {
> + spte = sp->spt[0];
> + mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
> + free_page((unsigned long)sp->spt);
> + kmem_cache_free(mmu_page_header_cache, sp);
> + if (!is_shadow_present_pte(spte))
> + return;
> + sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + }
> +
> + if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
> + return;
> +
> + /* Disconnect PAE root from the 4 PAE page directories */
> + for (i = 0; i < 4; i++)
> + mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
> +
> + if (sp->spt != mmu->pae_root)
> + free_page((unsigned long)sp->spt);
> +
> + kmem_cache_free(mmu_page_header_cache, sp);
> +}
> +
> /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
> ulong roots_to_free)
> @@ -3384,7 +3488,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
>
> if (free_active_root) {
> if (to_shadow_page(mmu->root.hpa)) {
> - mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
> + if (using_local_root_page(mmu))
> + mmu_free_local_root_page(kvm, mmu);
> + else
> + mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
What do you think about adding a separate patch to rename
mmu_free_root_page() to mmu_put_root_page()? I think that would make the
code much more clear when combined with my suggestion to use "private".
i.e. We'd end up with:
if (using_private_root_page(mmu))
mmu_free_private_root_page(mmu);
else
mmu_put_root_page(kvm, &mmu->root.hpa, &invalid_list);
This makes it clear that the vCPU owns private root pages, so it can free
them directly. But for shared root pages (i.e. the else clause), we are just
putting a reference and only freeing the page if the reference count
(root_count) goes to 0.
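
For illustration, a rough sketch of what the renamed helper could look like
(the body below just paraphrases the existing mmu_free_root_page() logic,
minus the TDP MMU special case, so treat it as a sketch rather than the
exact diff):

	static void mmu_put_root_page(struct kvm *kvm, hpa_t *root_hpa,
				      struct list_head *invalid_list)
	{
		struct kvm_mmu_page *sp;

		if (!VALID_PAGE(*root_hpa))
			return;

		sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);

		/*
		 * Shared roots are refcounted: drop a reference, and only
		 * queue the page for zapping if this was the last reference
		 * to an invalid root.  Nothing is freed directly here,
		 * unlike the private/local case.
		 */
		if (sp && !--sp->root_count && sp->role.invalid)
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

		*root_hpa = INVALID_PAGE;
	}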
> } else if (mmu->pae_root) {
> for (i = 0; i < 4; ++i) {
> if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
> --
> 2.19.1.6.gb485710b
>
On Sat, May 21, 2022, Lai Jiangshan wrote:
> +static struct kvm_mmu_page *
> +kvm_mmu_alloc_local_shadow_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role)
Don't split the function name to a new line, even if it means running (well) over
the 80 char soft limit.
static struct kvm_mmu_page *kvm_mmu_alloc_per_vcpu_shadow_page(struct kvm_vcpu *vcpu,
union kvm_mmu_page_role role)
> +{
> + struct kvm_mmu_page *sp;
> +
> + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> + sp->gfn = 0;
Why explicitly zero gfn but not gfns? Either rely on __GFP_ZERO or don't, mixing
behavior is confusing. If there's an assumption that "gfn" be zero, e.g. due to
masking, then that would be a good WARN candidate.
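
E.g., a tiny sketch of the WARN route (the header cache sets gfp_zero to
__GFP_ZERO, so both fields should already be zero coming out of the cache):

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);

	/* The header cache is __GFP_ZERO, so gfn/gfns must start out zero. */
	WARN_ON_ONCE(sp->gfn || sp->gfns);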
> + sp->role = role;
> + /*
> + * Use the preallocated mmu->pae_root when the shadow page's
> + * level is PT32E_ROOT_LEVEL which may need to be put in the 32 bits
> + * CR3 (even in x86_64) or decrypted. The preallocated one is prepared
> + * for the requirements.
> + */
> + if (role.level == PT32E_ROOT_LEVEL &&
> + !WARN_ON_ONCE(!vcpu->arch.mmu->pae_root))
> + sp->spt = vcpu->arch.mmu->pae_root;
> + else
> + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> + /* sp->gfns is not used for local shadow page */
This comment isn't helpful as it doesn't provide any information as to _why_ gfns
isn't used. For simple enforcement, a KVM_BUG_ON() is much more effective as it
documents the underlying assumption, e.g.
KVM_BUG_ON(sp_has_gptes(sp), vcpu->kvm);
but I'm fairly confident that won't actually work, because sp_has_gptes() will
return true for pages that are backed by pae_root, i.e. are not passthrough.
In other words, this all subtly relies on the PDPTEs not being write-protected
and not being reachable through things like mmu_page_hash. I don't know that we
need to add a dedicated flag for these pages, but we need _something_ to document
what's going on.
Hmm, but if we do add kvm_mmu_page_role.per_vcpu, it would allow for code
consolidation, and I think it will yield more intuitive code. And sp_has_gptes()
is easy to fix.
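
E.g., a minimal sketch of the sp_has_gptes() side, assuming a (for now
hypothetical) kvm_mmu_page_role.per_vcpu bit and today's direct/passthrough
checks:

	static bool sp_has_gptes(struct kvm_mmu_page *sp)
	{
		if (sp->role.direct)
			return false;

		if (sp->role.passthrough)
			return false;

		/*
		 * Per-vCPU pages don't shadow a guest page table (they're
		 * never write-protected or reachable via mmu_page_hash),
		 * so they have no gptes.
		 */
		if (sp->role.per_vcpu)
			return false;

		return true;
	}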
> + set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
I would prefer that kvm_mmu_alloc_per_vcpu_shadow_page() and kvm_mmu_alloc_page()
share common bits, and then add comments for the differences. For example, this
path fails to invoke kvm_mod_used_mmu_pages(), which arguably it should do when
not using pae_root, i.e. when it actually "allocates" a page.
I've always found it annoying/odd that kvm_mmu_alloc_page() adds the page to
active_mmu_pages, but the caller adds the page to mmu_page_hash. This is a good
excuse to fix that.
If role.per_vcpu is a thing, and is tracked in vcpu->arch.mmu->root_role, then
we can do:
if (level < PT32E_ROOT_LEVEL)
role.per_vcpu = 0;
/* Per-vCPU roots are (obviously) not tracked in the per-VM lists. */
if (unlikely(role.per_vcpu))
return kvm_mmu_alloc_page(vcpu, role, gfn);
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
...
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, role, gfn);
and kvm_mmu_alloc_page() becomes something like (completely untested, and I'm not
at all confident about the gfn logic).
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
union kvm_mmu_page_role role,
gfn_t gfn)
{
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
/*
* Use the preallocated mmu->pae_root when the shadow page's level is
* PT32E_ROOT_LEVEL. When using PAE paging, the backing page may need
* to have a 32-bit physical address (to go into a 32-bit CR3), and/or
* may need to be decrypted (!TDP + SME). The preallocated pae_root
* is prepared for said requirements.
*/
if (role.per_vcpu && role.level == PT32E_ROOT_LEVEL) {
sp->spt = vcpu->arch.mmu->pae_root;
memset(sp->spt, 0, sizeof(u64) * 4);
} else {
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
}
WARN_ON_ONCE(role.per_vcpu && gfn);
sp->gfn = gfn;
sp->role = role;

/* Set sp->role before sp_has_gptes(), which reads it. */
if (sp_has_gptes(sp))
sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
/*
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
* depends on valid pages being added to the head of the list. See
* comments in kvm_zap_obsolete_pages().
*/
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
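
And, to follow through on the "good excuse to fix that" bit above, an equally
untested sketch of folding the per-VM list and hash insertion into
kvm_mmu_alloc_page() as well, assuming per-vCPU pages stay off both:

	/* At the end of kvm_mmu_alloc_page(), before returning sp: */
	if (!role.per_vcpu) {
		/* Keep the FIFO ordering noted in the comment above. */
		list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
		hlist_add_head(&sp->hash_link,
			       &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
	}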
> +
> + return sp;
> +}
> +
> static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
> {
> struct kvm_mmu_page *sp;
> @@ -2121,6 +2191,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> if (level <= vcpu->arch.mmu->cpu_role.base.level)
> role.passthrough = 0;
>
> + if (unlikely(level >= PT32E_ROOT_LEVEL && using_local_root_page(vcpu->arch.mmu)))
> + return kvm_mmu_alloc_local_shadow_page(vcpu, role);
> +
> sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> if (sp->gfn != gfn) {
> @@ -3351,6 +3424,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> *root_hpa = INVALID_PAGE;
> }
>
> +static void mmu_free_local_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
> +{
> + u64 spte = mmu->root.hpa;
> + struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + int i;
> +
> + /* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
> + while (sp->role.level > PT32E_ROOT_LEVEL)
Maybe a for-loop?
/* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
for (sp = to_shadow_page(mmu->root.hpa & PT64_BASE_ADDR_MASK);
sp->role.level > PT32E_ROOT_LEVEL;
sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK)) {
> + {
> + spte = sp->spt[0];
> + mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
> + free_page((unsigned long)sp->spt);
> + kmem_cache_free(mmu_page_header_cache, sp);
Probably worth a helper for free_page()+kmem_cache_free(), especially if the
!pae_root case is accounted. And then we can combine with tdp_mmu_free_sp() if
we ever decide to fully account TDP MMU pages (to play nice with reclaim).
E.g.
static void __mmu_free_per_vcpu_root_page(struct kvm *kvm, struct kvm_mmu *mmu,
struct kvm_mmu_page *sp)
{
if (sp->spt != mmu->pae_root) {
free_page((unsigned long)sp->spt);
kvm_mod_used_mmu_pages(kvm, -1);
}
kmem_cache_free(mmu_page_header_cache, sp);
}
static void mmu_free_per_vcpu_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
u64 spte;
int i;
/* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
for (sp = to_shadow_page(mmu->root.hpa & PT64_BASE_ADDR_MASK);
sp->role.level > PT32E_ROOT_LEVEL;
sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK)) {
spte = sp->spt[0];
mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
__mmu_free_per_vcpu_root_page(kvm, mmu, sp);
if (!is_shadow_present_pte(spte))
return;
}
if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
return;
/* Disconnect PAE root from the 4 PAE page directories */
for (i = 0; i < 4; i++)
mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
__mmu_free_per_vcpu_root_page(kvm, mmu, sp);
}
> + if (!is_shadow_present_pte(spte))
> + return;
> + sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
> + }