Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932690AbdCIPQm (ORCPT ); Thu, 9 Mar 2017 10:16:42 -0500 Received: from mail-wm0-f65.google.com ([74.125.82.65]:35002 "EHLO mail-wm0-f65.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754427AbdCIPNM (ORCPT ); Thu, 9 Mar 2017 10:13:12 -0500 Subject: Re: [PATCH RFC 3/4] KVM: MMU: Add 5 level EPT & Shadow page table support. To: Liang Li , kvm@vger.kernel.org References: <1483003563-25847-1-git-send-email-liang.z.li@intel.com> <1483003563-25847-4-git-send-email-liang.z.li@intel.com> Cc: linux-kernel@vger.kernel.org, tglx@linutronix.de, mingo@redhat.com, kirill.shutemov@linux.intel.com, dave.hansen@linux.intel.com, guangrong.xiao@linux.intel.com, rkrcmar@redhat.com From: Paolo Bonzini Message-ID: <8b09d6dc-fe12-3af2-01f0-49eb0a8baeae@redhat.com> Date: Thu, 9 Mar 2017 16:12:34 +0100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.7.0 MIME-Version: 1.0 In-Reply-To: <1483003563-25847-4-git-send-email-liang.z.li@intel.com> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 22324 Lines: 618 On 29/12/2016 10:26, Liang Li wrote: > The future Intel CPU will extend the max physical address to 52 bits. > To support the new physical address width, EPT is extended to support > 5 level page table. > This patch add the 5 level EPT and extend shadow page to support > 5 level paging guest. As the RFC version, this patch enables 5 level > EPT once the hardware supports, and this is not a good choice because > 5 level EPT requires more memory access comparing to use 4 level EPT. > The right thing is to use 5 level EPT only when it's needed, will > change in the future version. > > Signed-off-by: Liang Li > Cc: Thomas Gleixner > Cc: Ingo Molnar > Cc: Kirill A. Shutemov > Cc: Dave Hansen > Cc: Xiao Guangrong > Cc: Paolo Bonzini > Cc: "Radim Krčmář" > --- > arch/x86/include/asm/kvm_host.h | 3 +- > arch/x86/include/asm/vmx.h | 1 + > arch/x86/kvm/cpuid.h | 8 ++ > arch/x86/kvm/mmu.c | 167 +++++++++++++++++++++++++++++++--------- > arch/x86/kvm/mmu_audit.c | 5 +- > arch/x86/kvm/paging_tmpl.h | 19 ++++- > arch/x86/kvm/vmx.c | 19 +++-- > arch/x86/kvm/x86.h | 10 +++ > 8 files changed, 184 insertions(+), 48 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index a7066dc..e505dac 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) > #define KVM_NR_VAR_MTRR 8 > > #define ASYNC_PF_PER_VCPU 64 > +#define PT64_ROOT_5LEVEL 5 > > enum kvm_reg { > VCPU_REGS_RAX = 0, > @@ -310,7 +311,7 @@ struct kvm_pio_request { > }; > > struct rsvd_bits_validate { > - u64 rsvd_bits_mask[2][4]; > + u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL]; > u64 bad_mt_xwr; > }; > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index 2b5b2d4..bf2f178 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -442,6 +442,7 @@ enum vmcs_field { > > #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) > #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) > +#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7) > #define VMX_EPTP_UC_BIT (1ull << 8) > #define VMX_EPTP_WB_BIT (1ull << 14) > #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h > index 35058c2..4bdf3dc 100644 > --- a/arch/x86/kvm/cpuid.h > +++ b/arch/x86/kvm/cpuid.h > @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) > return best && (best->ecx & bit(X86_FEATURE_PKU)); > } > > +static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu) > +{ > + struct kvm_cpuid_entry2 *best; > + > + best = kvm_find_cpuid_entry(vcpu, 7, 0); > + return best && (best->ecx & bit(X86_FEATURE_LA57)); > +} > + > static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) > { > struct kvm_cpuid_entry2 *best; > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 4c40273..0a56f27 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, > } > > struct mmu_page_path { > - struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL]; > - unsigned int idx[PT64_ROOT_4LEVEL]; > + struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL]; > + unsigned int idx[PT64_ROOT_5LEVEL]; > }; > > #define for_each_sp(pvec, sp, parents, i) \ > @@ -2198,6 +2198,11 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, > !vcpu->arch.mmu.direct_map) > --iterator->level; > > + if (iterator->level == PT64_ROOT_5LEVEL && > + vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL && > + !vcpu->arch.mmu.direct_map) > + iterator->level -= 2; This (and the "if" before it as well) might actually be dead code. Please remove it in a separate patch. > if (iterator->level == PT32E_ROOT_LEVEL) { > iterator->shadow_addr > = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; > @@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) > if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > return; > > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL && > - (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > - vcpu->arch.mmu.direct_map)) { > + if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL && > + (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.direct_map)) || > + (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL && > + (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL || > + vcpu->arch.mmu.direct_map))) { Same here: if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) should be enough. In general, checking >= PT64_ROOT_4LEVEL is better IMHO than checking for == PT64_ROOT_4LEVEL || == PT64_ROOT_5LEVEL. These "if"s basically need to single out PAE. A hypothetical 6-level page table extension would in all likelihood behave just like 64-bit LA48 and LA57 paging. > hpa_t root = vcpu->arch.mmu.root_hpa; > > spin_lock(&vcpu->kvm->mmu_lock); > @@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) > struct kvm_mmu_page *sp; > unsigned i; > > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) { Same here and everywhere else. > spin_lock(&vcpu->kvm->mmu_lock); > make_mmu_pages_available(vcpu); > - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL); > + sp = kvm_mmu_get_page(vcpu, 0, 0, > + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); > ++sp->root_count; > spin_unlock(&vcpu->kvm->mmu_lock); > vcpu->arch.mmu.root_hpa = __pa(sp->spt); > @@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * Do we shadow a long mode page table? If so we need to > * write-protect the guests page table root. > */ > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { > hpa_t root = vcpu->arch.mmu.root_hpa; > > MMU_WARN_ON(VALID_PAGE(root)); > > spin_lock(&vcpu->kvm->mmu_lock); > make_mmu_pages_available(vcpu); > - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL, > - 0, ACC_ALL); > + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, > + vcpu->arch.mmu.root_level, 0, ACC_ALL); > root = __pa(sp->spt); > ++sp->root_count; > spin_unlock(&vcpu->kvm->mmu_lock); > @@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * the shadow page table may be a PAE or a long mode page table. > */ > pm_mask = PT_PRESENT_MASK; > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) > pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; > > for (i = 0; i < 4; ++i) { > @@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > * If we shadow a 32 bit page table with a long mode page > * table we enter this path. > */ > - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) { > if (vcpu->arch.mmu.lm_root == NULL) { > /* > * The additional page necessary for this is only > @@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) > return; > > vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); > - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { > hpa_t root = vcpu->arch.mmu.root_hpa; > sp = page_header(root); > mmu_sync_children(vcpu, sp); > @@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) > walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) > { > struct kvm_shadow_walk_iterator iterator; > - u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull; > + u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull; > int root, leaf; > bool reserved = false; > > @@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, > } > > #define PTTYPE_EPT 18 /* arbitrary */ > +#define PTTYPE_LA57 57 > + > #define PTTYPE PTTYPE_EPT > #include "paging_tmpl.h" > #undef PTTYPE > > +#define PTTYPE PTTYPE_LA57 > +#include "paging_tmpl.h" > +#undef PTTYPE This is not needed. The format for LA57 page tables is the same as for LA48. > #define PTTYPE 64 > #include "paging_tmpl.h" > #undef PTTYPE > @@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, > rsvd_check->rsvd_bits_mask[1][0] = > rsvd_check->rsvd_bits_mask[0][0]; > break; > + case PT64_ROOT_5LEVEL: > + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | rsvd_bits(7, 7); > + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | rsvd_bits(7, 7); I think the code for this and PT64_ROOT_4LEVEL should be the same (setting rsvd_bits_mask[x][4] for PT64_ROOT_4LEVEL is okay). You are assuming that MAXPHYADDR=52, but the Intel whitepaper doesn't say this is going to be always the case. rsvd_bits in arch/x86/kvm/mmu.h is not a hot path, feel free to add an if (e < s) return 0; there. > + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | > + nonleaf_bit8_rsvd | gbpages_bit_rsvd; > + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd; > + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd; > + rsvd_check->rsvd_bits_mask[1][4] = > + rsvd_check->rsvd_bits_mask[0][4]; > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | > + gbpages_bit_rsvd | rsvd_bits(13, 29); > + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | > + rsvd_bits(13, 20); /* large page */ > + rsvd_check->rsvd_bits_mask[1][0] = > + rsvd_check->rsvd_bits_mask[0][0]; > + break; > } > } > > @@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, > > static void > __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, > - int maxphyaddr, bool execonly) > + int maxphyaddr, bool execonly, int ept_level) > { > u64 bad_mt_xwr; > > - rsvd_check->rsvd_bits_mask[0][3] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); > - rsvd_check->rsvd_bits_mask[0][2] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > - rsvd_check->rsvd_bits_mask[0][1] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > - rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); > - > - /* large page */ > - rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; > - rsvd_check->rsvd_bits_mask[1][2] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); > - rsvd_check->rsvd_bits_mask[1][1] = > - rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); > - rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; > + if (ept_level == 5) { > + rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7); Same here, this "if" is not needed at all and the new ept_level argument shouldn't be required either. > + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7); > + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][0] = 0; > + > + /* large page */ > + rsvd_check->rsvd_bits_mask[1][4] = > + rsvd_check->rsvd_bits_mask[0][4]; > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29); > + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20); > + rsvd_check->rsvd_bits_mask[1][0] = 0; > + } else { > + rsvd_check->rsvd_bits_mask[0][3] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); > + rsvd_check->rsvd_bits_mask[0][2] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][1] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); > + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); > + /* large page */ > + rsvd_check->rsvd_bits_mask[1][3] = > + rsvd_check->rsvd_bits_mask[0][3]; > + rsvd_check->rsvd_bits_mask[1][2] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); > + rsvd_check->rsvd_bits_mask[1][1] = > + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); > + rsvd_check->rsvd_bits_mask[1][0] = > + rsvd_check->rsvd_bits_mask[0][0]; > + } > > bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ > bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ > @@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, > } > > static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, > - struct kvm_mmu *context, bool execonly) > + struct kvm_mmu *context, bool execonly, int ept_level) > { > __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, > - cpuid_maxphyaddr(vcpu), execonly); > + cpuid_maxphyaddr(vcpu), execonly, ept_level); > } > > /* > @@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void) > true, true); > else > __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, > - boot_cpu_data.x86_phys_bits, > - false); > + boot_cpu_data.x86_phys_bits, false, > + context->shadow_root_level); > > } > > @@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void) > struct kvm_mmu *context, bool execonly) > { > __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, > - boot_cpu_data.x86_phys_bits, execonly); > + boot_cpu_data.x86_phys_bits, execonly, > + context->shadow_root_level); > } > > static void update_permission_bitmask(struct kvm_vcpu *vcpu, > @@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu, > paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL); > } > > +static void paging_la57_init_context(struct kvm_vcpu *vcpu, > + struct kvm_mmu *context) > +{ > + context->nx = is_nx(vcpu); > + context->root_level = PT64_ROOT_5LEVEL; > + > + reset_rsvds_bits_mask(vcpu, context); > + update_permission_bitmask(vcpu, context, false); > + update_pkru_bitmask(vcpu, context, false); > + update_last_nonleaf_level(vcpu, context); > + > + MMU_WARN_ON(!is_pae(vcpu)); > + context->page_fault = paging_la57_page_fault; > + context->gva_to_gpa = paging_la57_gva_to_gpa; > + context->sync_page = paging_la57_sync_page; > + context->invlpg = paging_la57_invlpg; > + context->update_pte = paging_la57_update_pte; > + context->shadow_root_level = PT64_ROOT_5LEVEL; > + context->root_hpa = INVALID_PAGE; > + context->direct_map = false; This should be using paging64_init_context_common. Even better, paging64_init_context could do int root_level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; paging64_init_context_common(vcpu, context, root_level); and then you can skip the change in kvm_init_shadow_mmu. > +} > + > static void paging32_init_context(struct kvm_vcpu *vcpu, > struct kvm_mmu *context) > { > @@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) > context->nx = false; > context->gva_to_gpa = nonpaging_gva_to_gpa; > context->root_level = 0; > + } else if (is_la57_mode(vcpu)) { > + context->nx = is_nx(vcpu); > + context->root_level = PT64_ROOT_5LEVEL; > + reset_rsvds_bits_mask(vcpu, context); > + context->gva_to_gpa = paging_la57_gva_to_gpa; Please put the if (is_la57_mode(vcpu)) inside the is_long_mode branch below, since the only difference is context->root_level. > } else if (is_long_mode(vcpu)) { > context->nx = is_nx(vcpu); > context->root_level = PT64_ROOT_4LEVEL; > @@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) > > if (!is_paging(vcpu)) > nonpaging_init_context(vcpu, context); > + else if (is_la57_mode(vcpu)) > + paging_la57_init_context(vcpu, context); > else if (is_long_mode(vcpu)) > paging64_init_context(vcpu, context); > else if (is_pae(vcpu)) > @@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) > > update_permission_bitmask(vcpu, context, true); > update_pkru_bitmask(vcpu, context, true); > - reset_rsvds_bits_mask_ept(vcpu, context, execonly); > + reset_rsvds_bits_mask_ept(vcpu, context, execonly, > + context->shadow_root_level); > reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); > } > EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); > @@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) > g_context->nx = false; > g_context->root_level = 0; > g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; > + } else if (is_la57_mode(vcpu)) { > + g_context->nx = is_nx(vcpu); > + g_context->root_level = PT64_ROOT_5LEVEL; > + reset_rsvds_bits_mask(vcpu, g_context); > + g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested; Same here. > } else if (is_long_mode(vcpu)) { > g_context->nx = is_nx(vcpu); > g_context->root_level = PT64_ROOT_4LEVEL; > diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c > index 2e6996d..bb40094 100644 > --- a/arch/x86/kvm/mmu_audit.c > +++ b/arch/x86/kvm/mmu_audit.c > @@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) > if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) > return; > > - if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) { > + if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL || > + vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) { As above, please use >= PT64_ROOT_4LEVEL here. > hpa_t root = vcpu->arch.mmu.root_hpa; > > sp = page_header(root); > - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL); > + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); > return; > } > > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > index a011054..c126cd3 100644 > --- a/arch/x86/kvm/paging_tmpl.h > +++ b/arch/x86/kvm/paging_tmpl.h This is not needed. > @@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void) > #define CMPXCHG cmpxchg64 > #define PT_MAX_FULL_LEVELS 2 > #endif > +#elif PTTYPE == PTTYPE_LA57 > + #define pt_element_t u64 > + #define guest_walker guest_walker_la57 > + #define FNAME(name) paging_la57_##name > + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > + #define PT_LEVEL_BITS PT64_LEVEL_BITS > + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK > + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK > + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT > + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT > + #define PT_MAX_FULL_LEVELS 5 > + #define CMPXCHG cmpxchg > #elif PTTYPE == 32 > #define pt_element_t u32 > #define guest_walker guest_walker32 > @@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, > static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) > { > unsigned pkeys = 0; > -#if PTTYPE == 64 > +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57 > pte_t pte = {.pte = gpte}; > > pkeys = pte_flags_pkey(pte_flags(pte)); > @@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, > walker->level = mmu->root_level; > pte = mmu->get_cr3(vcpu); > > -#if PTTYPE == 64 > +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57 > if (walker->level == PT32E_ROOT_LEVEL) { > pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); > trace_kvm_mmu_paging_element(pte, walker->level); > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 24db5fb..bfc9f0a 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) > return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; > } > > +static inline bool cpu_has_vmx_ept_5levels(void) > +{ > + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; > +} > + > static inline bool cpu_has_vmx_ept_ad_bits(void) > { > return vmx_capability.ept & VMX_EPT_AD_BIT; > @@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) > vmx->emulation_required = emulation_required(vcpu); > } > > +static int get_ept_level(void) > +{ > + if (cpu_has_vmx_ept_5levels()) > + return VMX_EPT_MAX_GAW + 1; > + return VMX_EPT_DEFAULT_GAW + 1; > +} > + > static u64 construct_eptp(unsigned long root_hpa) > { > u64 eptp; > > /* TODO write the value reading from MSR */ > eptp = VMX_EPT_DEFAULT_MT | > - VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; > + (get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT; > if (enable_ept_ad_bits) > eptp |= VMX_EPT_AD_ENABLE_BIT; > eptp |= (root_hpa & PAGE_MASK); For nested virt you need to set the shift to what L1 uses, so I think you need to add a root_level argument here and in kvm_init_shadow_ept_mmu. Paolo > @@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn) > } > } > > -static int get_ept_level(void) > -{ > - return VMX_EPT_DEFAULT_GAW + 1; > -} > - > static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) > { > u8 cache; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index e8ff3e4..26627df 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) > return cs_l; > } > > +static inline bool is_la57_mode(struct kvm_vcpu *vcpu) > +{ > +#ifdef CONFIG_X86_64 > + return (vcpu->arch.efer & EFER_LMA) && > + kvm_read_cr4_bits(vcpu, X86_CR4_LA57); > +#else > + return 0; > +#endif > +} > + > static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) > { > return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; >