2021-08-18 16:57:59

by Wei Huang

[permalink] [raw]
Subject: [PATCH v3 0/3] SVM 5-level page table support

This patch set adds 5-level page table support for AMD SVM. When the
5-level page table is enabled on host OS, the nested page table for guest
VMs will use the same format as host OS (i.e. 5-level NPT). These patches
were tested with various combinations of different settings and test cases
(nested/regular VMs, AMD64/i686 kernels, kvm-unit-tests, etc.)

v2->v3:
* Change the way of building root_hpa by following the existing flow (Sean)

v1->v2:
* Remove v1's arch-specific get_tdp_level() and add a new parameter,
tdp_forced_root_level, to allow forced TDP level (Sean)
* Add additional comment on tdp_root table chaining trick and change the
PML root table allocation code (Sean)
* Revise Patch 1's commit msg (Sean and Jim)

Thanks,
-Wei

Wei Huang (3):
KVM: x86: Allow CPU to force vendor-specific TDP level
KVM: x86: Handle the case of 5-level shadow page table
KVM: SVM: Add 5-level page table support for SVM

arch/x86/include/asm/kvm_host.h | 6 ++--
arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-----------
arch/x86/kvm/svm/svm.c | 13 ++++----
arch/x86/kvm/vmx/vmx.c | 3 +-
4 files changed, 49 insertions(+), 29 deletions(-)

--
2.31.1


2021-08-18 16:59:04

by Wei Huang

[permalink] [raw]
Subject: [PATCH v3 1/3] KVM: x86: Allow CPU to force vendor-specific TDP level

AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set.
To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level
on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force
a fixed TDP level.

Signed-off-by: Wei Huang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 ++---
arch/x86/kvm/mmu/mmu.c | 10 ++++++++--
arch/x86/kvm/svm/svm.c | 4 +++-
arch/x86/kvm/vmx/vmx.c | 3 ++-
4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af6ce8d4c86a..9daa86aa5649 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,7 +723,6 @@ struct kvm_vcpu_arch {

u64 reserved_gpa_bits;
int maxphyaddr;
- int max_tdp_level;

/* emulate context */

@@ -1754,8 +1753,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);

-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);

static inline u16 kvm_read_ldt(void)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47b765270239..14eac57bdaaa 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
bool tdp_enabled = false;

static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

enum {
@@ -4588,6 +4589,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,

static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ /* tdp_root_level is architecture forced level, use it if nonzero */
+ if (tdp_root_level)
+ return tdp_root_level;
+
/* Use 5-level TDP if and only if it's useful/necessary. */
if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
return 4;
@@ -5279,10 +5284,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
*/
}

-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;

/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 69639f9624f5..b34840a2ffa7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1015,7 +1015,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;

- kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+ /* Force VM NPT level equal to the host's max NPT level */
+ kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+ get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");

/* Note, SEV setup consumes npt_enabled. */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..034e1397c7d5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7803,7 +7803,8 @@ static __init int hardware_setup(void)
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ ept_lpage_level);

/*
* Only enable PML when hardware supports PML feature, and both EPT
--
2.31.1

2021-08-18 16:59:04

by Wei Huang

[permalink] [raw]
Subject: [PATCH v3 3/3] KVM: SVM: Add 5-level page table support for SVM

When the 5-level page table is enabled on host OS, the nested page table
for guest VMs must use 5-level as well. Update get_npt_level() function
to reflect this requirement. In the meantime, remove the code that
prevents kvm-amd driver from being loaded when 5-level page table is
detected.

Signed-off-by: Wei Huang <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/svm/svm.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b34840a2ffa7..ecc4bb8e4ea0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -261,7 +261,9 @@ u32 svm_msrpm_offset(u32 msr)
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_4LEVEL;
+ bool la57 = (cr4_read_shadow() & X86_CR4_LA57) != 0;
+
+ return la57 ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
@@ -462,11 +464,6 @@ static int has_svm(void)
return 0;
}

- if (pgtable_l5_enabled()) {
- pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
- return 0;
- }
-
return 1;
}

--
2.31.1

2021-08-18 16:59:49

by Wei Huang

[permalink] [raw]
Subject: [PATCH v3 2/3] KVM: x86: Handle the case of 5-level shadow page table

When the 5-level page table CPU flag is exposed, KVM code needs to handle
this case by pointing mmu->root_hpa to a properly-constructed 5-level page
table.

Suggested-by: Paolo Bonzini <[email protected]>
Signed-off-by: Wei Huang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 46 +++++++++++++++++++++------------
2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9daa86aa5649..0024b72a2b32 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -447,6 +447,7 @@ struct kvm_mmu {

u64 *pae_root;
u64 *pml4_root;
+ u64 *pml5_root;

/*
* check zero bits on shadow page table entries, these
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 14eac57bdaaa..0fb0937c1ea7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3456,15 +3456,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK | shadow_me_mask;
- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;

if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
-
mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
}

for (i = 0; i < 4; ++i) {
@@ -3483,7 +3490,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | pm_mask;
}

- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+ mmu->root_hpa = __pa(mmu->pml5_root);
+ else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3499,7 +3508,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *pml4_root, *pae_root;
+ u64 *pml5_root, *pml4_root, *pae_root;

/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3511,21 +3520,15 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
mmu->shadow_root_level < PT64_ROOT_4LEVEL)
return 0;

- /*
- * This mess only works with 4-level paging and needs to be updated to
- * work with 5-level paging.
- */
- if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
- return -EIO;
-
- if (mmu->pae_root && mmu->pml4_root)
+ if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
return 0;

/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ mmu->pml5_root))
return -EIO;

/*
@@ -3537,15 +3540,25 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
return -ENOMEM;

pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!pml4_root) {
- free_page((unsigned long)pae_root);
- return -ENOMEM;
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
}

mmu->pae_root = pae_root;
mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;

return 0;
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
}

void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -5364,6 +5377,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
}

static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
--
2.31.1

2021-08-18 17:17:58

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v3 2/3] KVM: x86: Handle the case of 5-level shadow page table

The shortlog is very misleading. KVM already supports 5-level paging for
traditional shadow paging. This is specifically for shadowing nNPT, and it's
specifically for shadowing everything _except_ 5-level nNPT. Something like:

KVM: x86/mmu: Support shadowing nNPT when 5-level paging is enabled in host


On Wed, Aug 18, 2021, Wei Huang wrote:
> When the 5-level page table CPU flag is exposed, KVM code needs to handle
> this case by pointing mmu->root_hpa to a properly-constructed 5-level page
> table.

Similarly, this is wrong, or maybe just poorly worded. This has nothing to do
with LA57 being exposed to the guest, it's purely the host using 5-level paging
and NPT being enabled and exposed to L1.

2021-08-18 17:36:26

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] KVM: SVM: Add 5-level page table support for SVM

On Wed, Aug 18, 2021, Wei Huang wrote:
> When the 5-level page table is enabled on host OS, the nested page table
> for guest VMs must use 5-level as well. Update get_npt_level() function
> to reflect this requirement. In the meanwhile, remove the code that
> prevents kvm-amd driver from being loaded when 5-level page table is
> detected.
>
> Signed-off-by: Wei Huang <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>

If this patch is authored by Paolo, he needs to be attributed via From:. If Paolo
is a co-author, he needs a Co-developed-by:. If either of those is true, your
SOB needs to be last in the chain since you are the last handler of the patch.
If neither is true, Paolo's SOB should be removed.

> ---
> arch/x86/kvm/svm/svm.c | 9 +++------
> 1 file changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index b34840a2ffa7..ecc4bb8e4ea0 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -261,7 +261,9 @@ u32 svm_msrpm_offset(u32 msr)
> static int get_max_npt_level(void)
> {
> #ifdef CONFIG_X86_64
> - return PT64_ROOT_4LEVEL;
> + bool la57 = (cr4_read_shadow() & X86_CR4_LA57) != 0;
> +
> + return la57 ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

Why obfuscate this? KVM is completely hosed if pgtable_l5_enabled() doesn't
match host CR4. E.g.

return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

> #else
> return PT32E_ROOT_LEVEL;
> #endif
> @@ -462,11 +464,6 @@ static int has_svm(void)
> return 0;
> }
>
> - if (pgtable_l5_enabled()) {
> - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
> - return 0;
> - }
> -
> return 1;
> }
>
> --
> 2.31.1
>

2021-08-18 18:03:09

by Tom Lendacky

[permalink] [raw]
Subject: Re: [PATCH v3 2/3] KVM: x86: Handle the case of 5-level shadow page table

On 8/18/21 11:55 AM, Wei Huang wrote:
> When the 5-level page table CPU flag is exposed, KVM code needs to handle
> this case by pointing mmu->root_hpa to a properly-constructed 5-level page
> table.
>
> Suggested-by: Paolo Bonzini <[email protected]>
> Signed-off-by: Wei Huang <[email protected]>
> ---

...

> pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> - if (!pml4_root) {
> - free_page((unsigned long)pae_root);
> - return -ENOMEM;
> + if (!pml4_root)
> + goto err_pml4;
> +
> + if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) {
> + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> + if (!pml5_root)
> + goto err_pml5;
> }
>
> mmu->pae_root = pae_root;
> mmu->pml4_root = pml4_root;
> + mmu->pml5_root = pml5_root;

It looks like pml5_root could be used uninitialized here. You should
initialize it to NULL or set it to NULL as an else path of the new check
above.

Thanks,
Tom

>
> return 0;
> +err_pml5:
> + free_page((unsigned long)pml4_root);
> +err_pml4:
> + free_page((unsigned long)pae_root);
> + return -ENOMEM;
> }
>
> void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> @@ -5364,6 +5377,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
> set_memory_encrypted((unsigned long)mmu->pae_root, 1);
> free_page((unsigned long)mmu->pae_root);
> free_page((unsigned long)mmu->pml4_root);
> + free_page((unsigned long)mmu->pml5_root);
> }
>
> static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>

2021-08-19 16:38:39

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 2/3] KVM: x86: Handle the case of 5-level shadow page table

On 18/08/21 19:15, Sean Christopherson wrote:
>> When the 5-level page table CPU flag is exposed, KVM code needs to handle
>> this case by pointing mmu->root_hpa to a properly-constructed 5-level page
>> table.
> Similarly, this is wrong, or maybe just poorly worded. This has nothing to do
> with LA57 being exposed to the guest, it's purely the host using 5-level paging
> and NPT being enabled and exposed to L1.

Like this:

---
KVM: x86/mmu: Support shadowing NPT when 5-level paging is enabled in host

When the 5-level page table CPU flag is set in the host, but the guest
has CR4.LA57=0 (including the case of a 32-bit guest), the top level of
the shadow NPT page tables will be fixed, consisting of one pointer to
a lower-level table and 511 non-present entries. Extend the existing
code that creates the fixed PML4 or PDP table, to provide a fixed PML5
table if needed.

This is not needed on EPT because the number of layers in the tables
is specified in the EPTP instead of depending on the host CR4.
---

Paolo

2021-08-19 16:41:15

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 3/3] KVM: SVM: Add 5-level page table support for SVM

On 18/08/21 19:32, Sean Christopherson wrote:
> On Wed, Aug 18, 2021, Wei Huang wrote:
>> When the 5-level page table is enabled on host OS, the nested page table
>> for guest VMs must use 5-level as well. Update get_npt_level() function
>> to reflect this requirement. In the meanwhile, remove the code that
>> prevents kvm-amd driver from being loaded when 5-level page table is
>> detected.
>>
>> Signed-off-by: Wei Huang <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>
> If this patch authored by Paolo, he needs to be attributed via From:. If Paolo
> is a co-author, he needs a Co-developed-by:. If either of those is true, your
> SOB needs to be last in the chain since you are the last handler of the patch.
> If neither is true, Paolo's SOB should be removed.

I didn't even remember writing this, but it's possible I pseudocoded in
an email just like you did below.

>> - return PT64_ROOT_4LEVEL;
>> + bool la57 = (cr4_read_shadow() & X86_CR4_LA57) != 0;
>> +
>> + return la57 ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
>
> Why obfuscate this? KVM is completely hosed if pgtable_l5_enabled() doesn't
> match host CR4. E.g.
>
> return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

That also suggests the above pseudocoding scenario, where I'd be too
lazy to look up the correct spelling of pgtable_l5_enabled().

Paolo

>> #else
>> return PT32E_ROOT_LEVEL;
>> #endif
>> @@ -462,11 +464,6 @@ static int has_svm(void)
>> return 0;
>> }
>>
>> - if (pgtable_l5_enabled()) {
>> - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
>> - return 0;
>> - }
>> -
>> return 1;
>> }
>>
>> --
>> 2.31.1
>>
>

2021-08-19 16:45:47

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On 18/08/21 18:55, Wei Huang wrote:
> This patch set adds 5-level page table support for AMD SVM. When the
> 5-level page table is enabled on host OS, the nested page table for guest
> VMs will use the same format as host OS (i.e. 5-level NPT). These patches
> were tested with various combination of different settings and test cases
> (nested/regular VMs, AMD64/i686 kernels, kvm-unit-tests, etc.)
>
> v2->v3:
> * Change the way of building root_hpa by following the existing flow (Sean)
>
> v1->v2:
> * Remove v1's arch-specific get_tdp_level() and add a new parameter,
> tdp_forced_root_level, to allow forced TDP level (Sean)
> * Add additional comment on tdp_root table chaining trick and change the
> PML root table allocation code (Sean)
> * Revise Patch 1's commit msg (Sean and Jim)
>
> Thanks,
> -Wei
>
> Wei Huang (3):
> KVM: x86: Allow CPU to force vendor-specific TDP level
> KVM: x86: Handle the case of 5-level shadow page table
> KVM: SVM: Add 5-level page table support for SVM
>
> arch/x86/include/asm/kvm_host.h | 6 ++--
> arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-----------
> arch/x86/kvm/svm/svm.c | 13 ++++----
> arch/x86/kvm/vmx/vmx.c | 3 +-
> 4 files changed, 49 insertions(+), 29 deletions(-)
>

Queued, thanks, with NULL initializations according to Tom's review.

Paolo

2021-08-23 09:23:24

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On Thu, 2021-08-19 at 18:43 +0200, Paolo Bonzini wrote:
> On 18/08/21 18:55, Wei Huang wrote:
> > This patch set adds 5-level page table support for AMD SVM. When the
> > 5-level page table is enabled on host OS, the nested page table for guest
> > VMs will use the same format as host OS (i.e. 5-level NPT). These patches
> > were tested with various combination of different settings and test cases
> > (nested/regular VMs, AMD64/i686 kernels, kvm-unit-tests, etc.)
> >
> > v2->v3:
> > * Change the way of building root_hpa by following the existing flow (Sean)
> >
> > v1->v2:
> > * Remove v1's arch-specific get_tdp_level() and add a new parameter,
> > tdp_forced_root_level, to allow forced TDP level (Sean)
> > * Add additional comment on tdp_root table chaining trick and change the
> > PML root table allocation code (Sean)
> > * Revise Patch 1's commit msg (Sean and Jim)
> >
> > Thanks,
> > -Wei
> >
> > Wei Huang (3):
> > KVM: x86: Allow CPU to force vendor-specific TDP level
> > KVM: x86: Handle the case of 5-level shadow page table
> > KVM: SVM: Add 5-level page table support for SVM
> >
> > arch/x86/include/asm/kvm_host.h | 6 ++--
> > arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-----------
> > arch/x86/kvm/svm/svm.c | 13 ++++----
> > arch/x86/kvm/vmx/vmx.c | 3 +-
> > 4 files changed, 49 insertions(+), 29 deletions(-)
> >
>
> Queued, thanks, with NULL initializations according to Tom's review.
>
> Paolo
>

Hi,
Yesterday while testing my SMM patches, I noticed a minor issue:
It seems that this patchset breaks my 32 bit nested VM testcase with NPT=0.

This hack makes it work again for me (I don't yet use TDP mmu).

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index caa3f9aee7d1..c25e0d40a620 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3562,7 +3562,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
mmu->shadow_root_level < PT64_ROOT_4LEVEL)
return 0;

- if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
+ if (mmu->pae_root && mmu->pml4_root)
return 0;

/*



Best regards,
Maxim Levitsky

2021-08-23 15:19:58

by Wei Huang

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On 08/23 12:20, Maxim Levitsky wrote:
> On Thu, 2021-08-19 at 18:43 +0200, Paolo Bonzini wrote:
> > On 18/08/21 18:55, Wei Huang wrote:
> > > This patch set adds 5-level page table support for AMD SVM. When the
> > > 5-level page table is enabled on host OS, the nested page table for guest
> > > VMs will use the same format as host OS (i.e. 5-level NPT). These patches
> > > were tested with various combination of different settings and test cases
> > > (nested/regular VMs, AMD64/i686 kernels, kvm-unit-tests, etc.)
> > >
> > > v2->v3:
> > > * Change the way of building root_hpa by following the existing flow (Sean)
> > >
> > > v1->v2:
> > > * Remove v1's arch-specific get_tdp_level() and add a new parameter,
> > > tdp_forced_root_level, to allow forced TDP level (Sean)
> > > * Add additional comment on tdp_root table chaining trick and change the
> > > PML root table allocation code (Sean)
> > > * Revise Patch 1's commit msg (Sean and Jim)
> > >
> > > Thanks,
> > > -Wei
> > >
> > > Wei Huang (3):
> > > KVM: x86: Allow CPU to force vendor-specific TDP level
> > > KVM: x86: Handle the case of 5-level shadow page table
> > > KVM: SVM: Add 5-level page table support for SVM
> > >
> > > arch/x86/include/asm/kvm_host.h | 6 ++--
> > > arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-----------
> > > arch/x86/kvm/svm/svm.c | 13 ++++----
> > > arch/x86/kvm/vmx/vmx.c | 3 +-
> > > 4 files changed, 49 insertions(+), 29 deletions(-)
> > >
> >
> > Queued, thanks, with NULL initializations according to Tom's review.
> >
> > Paolo
> >
>
> Hi,
> Yesterday while testing my SMM patches, I noticed a minor issue:
> It seems that this patchset breaks my 32 bit nested VM testcase with NPT=0.
>

Could you elaborate on the setup? NPT=0 for KVM running on L1?
Which VM is 32bit - L1 or L2?

Thanks,
-Wei

> This hack makes it work again for me (I don't yet use TDP mmu).
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index caa3f9aee7d1..c25e0d40a620 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3562,7 +3562,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> mmu->shadow_root_level < PT64_ROOT_4LEVEL)
> return 0;
>
> - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
> + if (mmu->pae_root && mmu->pml4_root)
> return 0;
>
> /*
>
>
>
> Best regards,
> Maxim Levitsky
>

2021-08-23 16:11:42

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On Mon, Aug 23, 2021, Wei Huang wrote:
> On 08/23 12:20, Maxim Levitsky wrote:
> > This hack makes it work again for me (I don't yet use TDP mmu).
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index caa3f9aee7d1..c25e0d40a620 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3562,7 +3562,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> > mmu->shadow_root_level < PT64_ROOT_4LEVEL)
> > return 0;
> >
> > - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)

Maxim, I assume you hit this WARN and bail?

if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
mmu->pml5_root))
return -EIO;

Because as the comment states, KVM expects all the special roots to be allocated
together. The 5-level paging supported breaks that assumption because pml5_root
will be allocated iff the host is using 5-level paging.

if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) {
pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!pml5_root)
goto err_pml5;
}

I think this is the least awful fix, I'll test and send a proper patch later today.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4853c033e6ce..93b2ed422b48 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3548,6 +3548,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
u64 *pml5_root = NULL;
u64 *pml4_root = NULL;
u64 *pae_root;
@@ -3562,7 +3563,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
mmu->shadow_root_level < PT64_ROOT_4LEVEL)
return 0;

- if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
+ /*
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+ * all MMus are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
+ */
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
return 0;

/*
@@ -3570,7 +3578,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
* bail if KVM ends up in a state where only one of the roots is valid.
*/
if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
- mmu->pml5_root))
+ (need_pml5 && mmu->pml5_root)))
return -EIO;

/*

> > + if (mmu->pae_root && mmu->pml4_root)
> > return 0;
> >
> > /*
> >
> >
> >
> > Best regards,
> > Maxim Levitsky
> >

2021-08-23 18:10:10

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On Mon, 2021-08-23 at 10:15 -0500, Wei Huang wrote:
> On 08/23 12:20, Maxim Levitsky wrote:
> > On Thu, 2021-08-19 at 18:43 +0200, Paolo Bonzini wrote:
> > > On 18/08/21 18:55, Wei Huang wrote:
> > > > This patch set adds 5-level page table support for AMD SVM. When the
> > > > 5-level page table is enabled on host OS, the nested page table for guest
> > > > VMs will use the same format as host OS (i.e. 5-level NPT). These patches
> > > > were tested with various combination of different settings and test cases
> > > > (nested/regular VMs, AMD64/i686 kernels, kvm-unit-tests, etc.)
> > > >
> > > > v2->v3:
> > > > * Change the way of building root_hpa by following the existing flow (Sean)
> > > >
> > > > v1->v2:
> > > > * Remove v1's arch-specific get_tdp_level() and add a new parameter,
> > > > tdp_forced_root_level, to allow forced TDP level (Sean)
> > > > * Add additional comment on tdp_root table chaining trick and change the
> > > > PML root table allocation code (Sean)
> > > > * Revise Patch 1's commit msg (Sean and Jim)
> > > >
> > > > Thanks,
> > > > -Wei
> > > >
> > > > Wei Huang (3):
> > > > KVM: x86: Allow CPU to force vendor-specific TDP level
> > > > KVM: x86: Handle the case of 5-level shadow page table
> > > > KVM: SVM: Add 5-level page table support for SVM
> > > >
> > > > arch/x86/include/asm/kvm_host.h | 6 ++--
> > > > arch/x86/kvm/mmu/mmu.c | 56 ++++++++++++++++++++++-----------
> > > > arch/x86/kvm/svm/svm.c | 13 ++++----
> > > > arch/x86/kvm/vmx/vmx.c | 3 +-
> > > > 4 files changed, 49 insertions(+), 29 deletions(-)
> > > >
> > >
> > > Queued, thanks, with NULL initializations according to Tom's review.
> > >
> > > Paolo
> > >
> >
> > Hi,
> > Yesterday while testing my SMM patches, I noticed a minor issue:
> > It seems that this patchset breaks my 32 bit nested VM testcase with NPT=0.
> >
>
> Could you elaborate the detailed setup? NPT=0 for KVM running on L1?
> Which VM is 32bit - L1 or L2?

NPT=0, L1 and L2 were 32 bit PAE VMs. The test was done to see how well
this setup deals with SMM mode entry/exits with SMM generated by L1 guest,
and see if I have any PDPTR related shenanigans.

I disabled the TDP MMU for now, although in this setup it won't be used anyway.

BIOS was seabios, patched to use PAE itself during boot, as well as in SMM.
(from https://mail.coreboot.org/pipermail/seabios/2015-September/009788.html, patch applied by hand)

Failure was immediate without my hack - L1 died as soon as L2 was started due to an assert in
this code.


Best regards,
Maxim Levitsky
>
> Thanks,
> -Wei
>
> > This hack makes it work again for me (I don't yet use TDP mmu).
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index caa3f9aee7d1..c25e0d40a620 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3562,7 +3562,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> > mmu->shadow_root_level < PT64_ROOT_4LEVEL)
> > return 0;
> >
> > - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
> > + if (mmu->pae_root && mmu->pml4_root)
> > return 0;
> >
> > /*
> >
> >
> >
> > Best regards,
> > Maxim Levitsky
> >


2021-08-23 18:12:40

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 0/3] SVM 5-level page table support

On Mon, 2021-08-23 at 16:10 +0000, Sean Christopherson wrote:
> On Mon, Aug 23, 2021, Wei Huang wrote:
> > On 08/23 12:20, Maxim Levitsky wrote:
> > > This hack makes it work again for me (I don't yet use TDP mmu).
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index caa3f9aee7d1..c25e0d40a620 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -3562,7 +3562,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> > > mmu->shadow_root_level < PT64_ROOT_4LEVEL)
> > > return 0;
> > >
> > > - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
>
> Maxim, I assume you hit this WARN and bail?
Yep.
>
> if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
> mmu->pml5_root))
> return -EIO;
>
> Because as the comment states, KVM expects all the special roots to be allocated
> together. The 5-level paging supported breaks that assumption because pml5_root
> will be allocated iff the host is using 5-level paging.
>
> if (mmu->shadow_root_level > PT64_ROOT_4LEVEL) {
> pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> if (!pml5_root)
> goto err_pml5;
> }
>
> I think this is the least awful fix, I'll test and send a proper patch later today.
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4853c033e6ce..93b2ed422b48 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3548,6 +3548,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
> static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> {
> struct kvm_mmu *mmu = vcpu->arch.mmu;
> + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
> u64 *pml5_root = NULL;
> u64 *pml4_root = NULL;
> u64 *pae_root;
> @@ -3562,7 +3563,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> mmu->shadow_root_level < PT64_ROOT_4LEVEL)
> return 0;
>
> - if (mmu->pae_root && mmu->pml4_root && mmu->pml5_root)
> + /*
> + * NPT, the only paging mode that uses this horror, uses a fixed number
> + * of levels for the shadow page tables, e.g. all MMUs are 4-level or
> + * all MMus are 5-level. Thus, this can safely require that pml5_root
> + * is allocated if the other roots are valid and pml5 is needed, as any
> + * prior MMU would also have required pml5.
> + */
> + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
> return 0;
>
> /*
> @@ -3570,7 +3578,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
> * bail if KVM ends up in a state where only one of the roots is valid.
> */
> if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
> - mmu->pml5_root))
> + (need_pml5 && mmu->pml5_root)))
> return -EIO;
>
> /*
>
> > > + if (mmu->pae_root && mmu->pml4_root)
> > > return 0;
> > >
> > > /*

Makes sense, works, and without digging too much into this
I expected this to be fixed by something like that, so:

Reviewed-by: Maxim Levitsky <[email protected]>

Thanks,
Best regards,
Maxim Levitsky



> > >
> > >
> > >
> > > Best regards,
> > > Maxim Levitsky
> > >