From: Lai Jiangshan <[email protected]>
mmu->pae_root for non-PAE paging is allocated on-demand, but
mmu->pae_root for PAE paging is allocated early when struct kvm_mmu is
being created.
Simplify the code to allocate mmu->pae_root for PAE paging and make
it on-demand.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/mmu/mmu.c | 101 +++++++++++++-------------------
arch/x86/kvm/x86.c | 4 +-
3 files changed, 44 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9cdc5bbd721f..fb9751dfc1a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1615,7 +1615,7 @@ int kvm_mmu_vendor_module_init(void);
void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
+void kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 90b715eefe6a..63c2b2c6122c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -668,6 +668,41 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
}
}
+static int mmu_alloc_pae_root(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+
+ if (vcpu->arch.mmu->root_role.level != PT32E_ROOT_LEVEL)
+ return 0;
+ if (vcpu->arch.mmu->pae_root)
+ return 0;
+
+ /*
+ * Allocate a page to hold the four PDPTEs for PAE paging when emulating
+ * 32-bit mode. CR3 is only 32 bits even on x86_64 in this case.
+ * Therefore we need to allocate the PDP table in the first 4GB of
+ * memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+ vcpu->arch.mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)vcpu->arch.mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_value);
+ return 0;
+}
+
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
@@ -5127,6 +5162,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
if (r)
goto out;
+ r = mmu_alloc_pae_root(vcpu);
+ if (r)
+ return r;
r = mmu_alloc_special_roots(vcpu);
if (r)
goto out;
@@ -5591,63 +5629,18 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
free_page((unsigned long)mmu->pml5_root);
}
-static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void __kvm_mmu_create(struct kvm_mmu *mmu)
{
- struct page *page;
int i;
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
- /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
- if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
- return 0;
-
- /*
- * When using PAE paging, the four PDPTEs are treated as 'root' pages,
- * while the PDP table is a per-vCPU construct that's allocated at MMU
- * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
- * x86_64. Therefore we need to allocate the PDP table in the first
- * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
- * generally doesn't use PAE paging and can skip allocating the PDP
- * table. The main exception, handled here, is SVM's 32-bit NPT. The
- * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
- * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
- */
- if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
- return 0;
-
- page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
- if (!page)
- return -ENOMEM;
-
- mmu->pae_root = page_address(page);
-
- /*
- * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
- * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
- * that KVM's writes and the CPU's reads get along. Note, this is
- * only necessary when using shadow paging, as 64-bit NPT can get at
- * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
- * by 32-bit kernels (when KVM itself uses 32-bit NPT).
- */
- if (!tdp_enabled)
- set_memory_decrypted((unsigned long)mmu->pae_root, 1);
- else
- WARN_ON_ONCE(shadow_me_value);
-
- for (i = 0; i < 4; ++i)
- mmu->pae_root[i] = INVALID_PAE_ROOT;
-
- return 0;
}
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
+void kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- int ret;
-
vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
@@ -5659,18 +5652,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
- if (ret)
- return ret;
-
- ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
- if (ret)
- goto fail_allocate_root;
-
- return ret;
- fail_allocate_root:
- free_mmu_pages(&vcpu->arch.guest_mmu);
- return ret;
+ __kvm_mmu_create(&vcpu->arch.guest_mmu);
+ __kvm_mmu_create(&vcpu->arch.root_mmu);
}
#define BATCH_ZAP_PAGES 10
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04812eaaf61b..064aecb188dc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11285,9 +11285,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- return r;
+ kvm_mmu_create(vcpu);
if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
--
2.19.1.6.gb485710b
On Sat, May 21, 2022, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> mmu->pae_root for non-PAE paging is allocated on-demand, but
> mmu->pae_root for PAE paging is allocated early when struct kvm_mmu is
> being created.
>
> Simplify the code to allocate mmu->pae_root for PAE paging and make
> it on-demand.
Hmm, I'm not convinced this simplifies things enough to justify the risk. There's
a non-zero chance that the __GFP_DMA32 allocation was intentionally done during VM
creation in order to avoid OOM on low memory.
Maybe move this patch to the tail end of the series so that it has a higher chance
of reverting cleanly if on-demand allocation breaks someone's setup?
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/mmu/mmu.c | 101 +++++++++++++-------------------
> arch/x86/kvm/x86.c | 4 +-
> 3 files changed, 44 insertions(+), 63 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 9cdc5bbd721f..fb9751dfc1a7 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1615,7 +1615,7 @@ int kvm_mmu_vendor_module_init(void);
> void kvm_mmu_vendor_module_exit(void);
>
> void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
> -int kvm_mmu_create(struct kvm_vcpu *vcpu);
> +void kvm_mmu_create(struct kvm_vcpu *vcpu);
> int kvm_mmu_init_vm(struct kvm *kvm);
> void kvm_mmu_uninit_vm(struct kvm *kvm);
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 90b715eefe6a..63c2b2c6122c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -668,6 +668,41 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> }
> }
>
> +static int mmu_alloc_pae_root(struct kvm_vcpu *vcpu)
Now that pae_root isn't the "full" root, just the page table, I think we should
rename pae_root to something else, and then name this accordingly.
pae_root_backing_page and mmu_alloc_pae_root_backing_page()? Definitely don't
love the name if someone has a better idea.
> +{
> + struct page *page;
> +
> + if (vcpu->arch.mmu->root_role.level != PT32E_ROOT_LEVEL)
> + return 0;
I think I'd prefer to move this check to the caller, it's confusing to see an
unconditional call to a PAE-specific helper.
> + if (vcpu->arch.mmu->pae_root)
> + return 0;
> +
> + /*
> + * Allocate a page to hold the four PDPTEs for PAE paging when emulating
> + * 32-bit mode. CR3 is only 32 bits even on x86_64 in this case.
> + * Therefore we need to allocate the PDP table in the first 4GB of
> + * memory, which happens to fit the DMA32 zone.
> + */
> + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_DMA32);
Leave off __GFP_ZERO, it's unnecessary in both cases, and actively misleading
when TDP is disabled. KVM _must_ write the page after making it decrypted. And
since I can't find any code that actually does initialize "pae_root", I suspect
this series is buggy.
But if there is a bug, it was introduced earlier in this series, either by
KVM: X86/MMU: Add local shadow pages
or by
KVM: X86/MMU: Activate local shadow pages and remove old logic
depending on whether you want to blame the function that is buggy, or the patch
that uses the buggy function.
The right place to initialize the root is kvm_mmu_alloc_local_shadow_page().
KVM sets __GFP_ZERO for mmu_shadow_page_cache, i.e. relies on new sp->spt pages
to be zeroed prior to "allocating" from the cache.
The PAE root backing page on the other hand is allocated once and then reused
over and over.
if (role.level == PT32E_ROOT_LEVEL &&
!WARN_ON_ONCE(!vcpu->arch.mmu->pae_root)) {
sp->spt = vcpu->arch.mmu->pae_root;
kvm_mmu_initialize_pae_root(sp->spt); <==== something like this
} else {
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
}
> - for (i = 0; i < 4; ++i)
> - mmu->pae_root[i] = INVALID_PAE_ROOT;
Please remove this code in a separate patch. I don't care if it is removed before
or after (I'm pretty sure the existing behavior is paranoia), but I don't want
multiple potentially-functional changes in this patch.
On Tue, Jul 19, 2022, Sean Christopherson wrote:
> On Sat, May 21, 2022, Lai Jiangshan wrote:
> > + /*
> > + * Allocate a page to hold the four PDPTEs for PAE paging when emulating
> > + * 32-bit mode. CR3 is only 32 bits even on x86_64 in this case.
> > + * Therefore we need to allocate the PDP table in the first 4GB of
> > + * memory, which happens to fit the DMA32 zone.
> > + */
> > + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_DMA32);
>
> Leave off __GFP_ZERO, it's unnecesary in both cases, and actively misleading in
> when TDP is disabled. KVM _must_ write the page after making it decrypted. And
> since I can't find any code that actually does initialize "pae_root", I suspect
> this series is buggy.
>
> But if there is a bug, it was introduced earlier in this series, either by
>
> KVM: X86/MMU: Add local shadow pages
>
> or by
>
> KVM: X86/MMU: Activate local shadow pages and remove old logic
>
> depending on whether you want to blame the function that is buggy, or the patch
> that uses the buggy function..
>
> The right place to initialize the root is kvm_mmu_alloc_local_shadow_page().
> KVM sets __GFP_ZERO for mmu_shadow_page_cache, i.e. relies on new sp->spt pages
> to be zeroed prior to "allocating" from the cache.
>
> The PAE root backing page on the other hand is allocated once and then reused
> over and over.
>
> if (role.level == PT32E_ROOT_LEVEL &&
> !WARN_ON_ONCE(!vcpu->arch.mmu->pae_root)) {
> sp->spt = vcpu->arch.mmu->pae_root;
> kvm_mmu_initialize_pae_root(sp->spt): <==== something like this
> } else {
> sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> }
Ah, I believe this is handled for the non-SME case in mmu_free_local_root_page().
But that won't play nice with the decryption path. And either way, the PDPTEs
should be explicitly initialized/zeroed when the shadow page is "allocated".
> > - for (i = 0; i < 4; ++i)
> > - mmu->pae_root[i] = INVALID_PAE_ROOT;
>
> Please remove this code in a separate patch. I don't care if it is removed before
> or after (I'm pretty sure the existing behavior is paranoia), but I don't want
> multiple potentially-functional changes in this patch.