Subject: Re: [PATCH RFC 3/4] KVM: MMU: Add 5 level EPT & Shadow page table
 support.
To: Liang Li <liang.z.li@intel.com>, kvm@vger.kernel.org
References: <1483003563-25847-1-git-send-email-liang.z.li@intel.com>
 <1483003563-25847-4-git-send-email-liang.z.li@intel.com>
Cc: linux-kernel@vger.kernel.org, tglx@linutronix.de, mingo@redhat.com,
        kirill.shutemov@linux.intel.com, dave.hansen@linux.intel.com,
        guangrong.xiao@linux.intel.com, rkrcmar@redhat.com
From: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <8b09d6dc-fe12-3af2-01f0-49eb0a8baeae@redhat.com>
Date: Thu, 9 Mar 2017 16:12:34 +0100
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101
 Thunderbird/45.7.0
MIME-Version: 1.0
In-Reply-To: <1483003563-25847-4-git-send-email-liang.z.li@intel.com>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 22324
Lines: 618


On 29/12/2016 10:26, Liang Li wrote:
> The future Intel CPU will extend the max physical address to 52 bits.
> To support the new physical address width, EPT is extended to support
> 5 level page table.
> This patch add the 5 level EPT and extend shadow page to support
> 5 level paging guest. As the RFC version, this patch enables 5 level
> EPT once the hardware supports, and this is not a good choice because
> 5 level EPT requires more memory access comparing to use 4 level EPT.
> The right thing is to use 5 level EPT only when it's needed, will
> change in the future version.
> 
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +-
>  arch/x86/include/asm/vmx.h      |   1 +
>  arch/x86/kvm/cpuid.h            |   8 ++
>  arch/x86/kvm/mmu.c              | 167 +++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/mmu_audit.c        |   5 +-
>  arch/x86/kvm/paging_tmpl.h      |  19 ++++-
>  arch/x86/kvm/vmx.c              |  19 +++--
>  arch/x86/kvm/x86.h              |  10 +++
>  8 files changed, 184 insertions(+), 48 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a7066dc..e505dac 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
>  #define KVM_NR_VAR_MTRR 8
>  
>  #define ASYNC_PF_PER_VCPU 64
> +#define PT64_ROOT_5LEVEL 5
>  
>  enum kvm_reg {
>  	VCPU_REGS_RAX = 0,
> @@ -310,7 +311,7 @@ struct kvm_pio_request {
>  };
>  
>  struct rsvd_bits_validate {
> -	u64 rsvd_bits_mask[2][4];
> +	u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL];
>  	u64 bad_mt_xwr;
>  };
>  
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 2b5b2d4..bf2f178 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -442,6 +442,7 @@ enum vmcs_field {
>  
>  #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
>  #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
> +#define VMX_EPT_PAGE_WALK_5_BIT			(1ull << 7)
>  #define VMX_EPTP_UC_BIT				(1ull << 8)
>  #define VMX_EPTP_WB_BIT				(1ull << 14)
>  #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
> index 35058c2..4bdf3dc 100644
> --- a/arch/x86/kvm/cpuid.h
> +++ b/arch/x86/kvm/cpuid.h
> @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
>  	return best && (best->ecx & bit(X86_FEATURE_PKU));
>  }
>  
> +static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_cpuid_entry2 *best;
> +
> +	best = kvm_find_cpuid_entry(vcpu, 7, 0);
> +	return best && (best->ecx & bit(X86_FEATURE_LA57));
> +}
> +
>  static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_cpuid_entry2 *best;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4c40273..0a56f27 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
>  }
>  
>  struct mmu_page_path {
> -	struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL];
> -	unsigned int idx[PT64_ROOT_4LEVEL];
> +	struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL];
> +	unsigned int idx[PT64_ROOT_5LEVEL];
>  };
>  
>  #define for_each_sp(pvec, sp, parents, i)			\
> @@ -2198,6 +2198,11 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
>  	    !vcpu->arch.mmu.direct_map)
>  		--iterator->level;
>  
> +	if (iterator->level == PT64_ROOT_5LEVEL &&
> +	    vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL &&
> +	    !vcpu->arch.mmu.direct_map)
> +		iterator->level -= 2;

This (and the "if" before it as well) might actually be dead code.
Please remove it in a separate patch.

>  	if (iterator->level == PT32E_ROOT_LEVEL) {
>  		iterator->shadow_addr
>  			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
> @@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
>  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>  		return;
>  
> -	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
> -	    (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> -	     vcpu->arch.mmu.direct_map)) {
> +	if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
> +	     (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> +	      vcpu->arch.mmu.direct_map)) ||
> +	    (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL &&
> +	     (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL ||
> +	      vcpu->arch.mmu.direct_map))) {

Same here:

	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL)

should be enough.  In general, checking >= PT64_ROOT_4LEVEL is better
IMHO than checking for == PT64_ROOT_4LEVEL || == PT64_ROOT_5LEVEL.
These "if"s basically need to single out PAE.  A hypothetical 6-level
page table extension would in all likelihood behave just like 64-bit
LA48 and LA57 paging.

>  		hpa_t root = vcpu->arch.mmu.root_hpa;
>  
>  		spin_lock(&vcpu->kvm->mmu_lock);
> @@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>  	struct kvm_mmu_page *sp;
>  	unsigned i;
>  
> -	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
> +	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {

Same here and everywhere else.

>  		spin_lock(&vcpu->kvm->mmu_lock);
>  		make_mmu_pages_available(vcpu);
> -		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
> +		sp = kvm_mmu_get_page(vcpu, 0, 0,
> +				vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
>  		++sp->root_count;
>  		spin_unlock(&vcpu->kvm->mmu_lock);
>  		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
> @@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  	 * Do we shadow a long mode page table? If so we need to
>  	 * write-protect the guests page table root.
>  	 */
> -	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> +	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
>  		hpa_t root = vcpu->arch.mmu.root_hpa;
>  
>  		MMU_WARN_ON(VALID_PAGE(root));
>  
>  		spin_lock(&vcpu->kvm->mmu_lock);
>  		make_mmu_pages_available(vcpu);
> -		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
> -				      0, ACC_ALL);
> +		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
> +				vcpu->arch.mmu.root_level, 0, ACC_ALL);
>  		root = __pa(sp->spt);
>  		++sp->root_count;
>  		spin_unlock(&vcpu->kvm->mmu_lock);
> @@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  	 * the shadow page table may be a PAE or a long mode page table.
>  	 */
>  	pm_mask = PT_PRESENT_MASK;
> -	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
> +	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL)
>  		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
>  
>  	for (i = 0; i < 4; ++i) {
> @@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>  	 * If we shadow a 32 bit page table with a long mode page
>  	 * table we enter this path.
>  	 */
> -	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
> +	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
>  		if (vcpu->arch.mmu.lm_root == NULL) {
>  			/*
>  			 * The additional page necessary for this is only
> @@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>  		return;
>  
>  	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> -	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> -	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> +	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
>  		hpa_t root = vcpu->arch.mmu.root_hpa;
>  		sp = page_header(root);
>  		mmu_sync_children(vcpu, sp);
> @@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>  walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
>  {
>  	struct kvm_shadow_walk_iterator iterator;
> -	u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull;
> +	u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull;
>  	int root, leaf;
>  	bool reserved = false;
>  
> @@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
>  }
>  
>  #define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE_LA57 57
> +
>  #define PTTYPE PTTYPE_EPT
>  #include "paging_tmpl.h"
>  #undef PTTYPE
>  
> +#define PTTYPE PTTYPE_LA57
> +#include "paging_tmpl.h"
> +#undef PTTYPE

This is not needed.  The format for LA57 page tables is the same as for
LA48.

>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> @@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
>  		rsvd_check->rsvd_bits_mask[1][0] =
>  			rsvd_check->rsvd_bits_mask[0][0];
>  		break;
> +	case PT64_ROOT_5LEVEL:
> +		rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
> +			nonleaf_bit8_rsvd | rsvd_bits(7, 7);
> +		rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
> +			nonleaf_bit8_rsvd | rsvd_bits(7, 7);

I think the code for this and PT64_ROOT_4LEVEL should be the same
(setting rsvd_bits_mask[x][4] for PT64_ROOT_4LEVEL is okay).

You are assuming that MAXPHYADDR=52, but the Intel whitepaper doesn't
say this is always going to be the case.  rsvd_bits in
arch/x86/kvm/mmu.h is not on a hot path, so feel free to add an

	if (e < s)
		return 0;

there; then rsvd_bits(maxphyaddr, 51) simply returns 0 when
MAXPHYADDR is 52.

> +		rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
> +			nonleaf_bit8_rsvd | gbpages_bit_rsvd;
> +		rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd;
> +		rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd;
> +		rsvd_check->rsvd_bits_mask[1][4] =
> +			rsvd_check->rsvd_bits_mask[0][4];
> +		rsvd_check->rsvd_bits_mask[1][3] =
> +			rsvd_check->rsvd_bits_mask[0][3];
> +		rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
> +			gbpages_bit_rsvd | rsvd_bits(13, 29);
> +		rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
> +			rsvd_bits(13, 20);		/* large page */
> +		rsvd_check->rsvd_bits_mask[1][0] =
> +			rsvd_check->rsvd_bits_mask[0][0];
> +		break;
>  	}
>  }
>  
> @@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
>  
>  static void
>  __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
> -			    int maxphyaddr, bool execonly)
> +			    int maxphyaddr, bool execonly, int ept_level)
>  {
>  	u64 bad_mt_xwr;
>  
> -	rsvd_check->rsvd_bits_mask[0][3] =
> -		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
> -	rsvd_check->rsvd_bits_mask[0][2] =
> -		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> -	rsvd_check->rsvd_bits_mask[0][1] =
> -		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> -	rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
> -
> -	/* large page */
> -	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
> -	rsvd_check->rsvd_bits_mask[1][2] =
> -		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
> -	rsvd_check->rsvd_bits_mask[1][1] =
> -		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
> -	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
> +	if (ept_level == 5) {
> +		rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7);

Same here: this "if" is not needed at all, and the new ept_level
argument shouldn't be required either.

> +		rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7);
> +		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6);
> +		rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6);
> +		rsvd_check->rsvd_bits_mask[0][0] = 0;
> +
> +		/* large page */
> +		rsvd_check->rsvd_bits_mask[1][4] =
> +			 rsvd_check->rsvd_bits_mask[0][4];
> +		rsvd_check->rsvd_bits_mask[1][3] =
> +			 rsvd_check->rsvd_bits_mask[0][3];
> +		rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29);
> +		rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20);
> +		rsvd_check->rsvd_bits_mask[1][0] = 0;
> +	} else {
> +		rsvd_check->rsvd_bits_mask[0][3] =
> +			rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
> +		rsvd_check->rsvd_bits_mask[0][2] =
> +			rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> +		rsvd_check->rsvd_bits_mask[0][1] =
> +			rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
> +		rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
> +		/* large page */
> +		rsvd_check->rsvd_bits_mask[1][3] =
> +			 rsvd_check->rsvd_bits_mask[0][3];
> +		rsvd_check->rsvd_bits_mask[1][2] =
> +			rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
> +		rsvd_check->rsvd_bits_mask[1][1] =
> +			rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
> +		rsvd_check->rsvd_bits_mask[1][0] =
> +			 rsvd_check->rsvd_bits_mask[0][0];
> +	}
>  
>  	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
>  	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
> @@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
>  }
>  
>  static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
> -		struct kvm_mmu *context, bool execonly)
> +		struct kvm_mmu *context, bool execonly, int ept_level)
>  {
>  	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
> -				    cpuid_maxphyaddr(vcpu), execonly);
> +			cpuid_maxphyaddr(vcpu), execonly, ept_level);
>  }
>  
>  /*
> @@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void)
>  					true, true);
>  	else
>  		__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
> -					    boot_cpu_data.x86_phys_bits,
> -					    false);
> +					    boot_cpu_data.x86_phys_bits, false,
> +					    context->shadow_root_level);
>  
>  }
>  
> @@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void)
>  				struct kvm_mmu *context, bool execonly)
>  {
>  	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
> -				    boot_cpu_data.x86_phys_bits, execonly);
> +				    boot_cpu_data.x86_phys_bits, execonly,
> +				    context->shadow_root_level);
>  }
>  
>  static void update_permission_bitmask(struct kvm_vcpu *vcpu,
> @@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu,
>  	paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
>  }
>  
> +static void paging_la57_init_context(struct kvm_vcpu *vcpu,
> +				  struct kvm_mmu *context)
> +{
> +	context->nx = is_nx(vcpu);
> +	context->root_level = PT64_ROOT_5LEVEL;
> +
> +	reset_rsvds_bits_mask(vcpu, context);
> +	update_permission_bitmask(vcpu, context, false);
> +	update_pkru_bitmask(vcpu, context, false);
> +	update_last_nonleaf_level(vcpu, context);
> +
> +	MMU_WARN_ON(!is_pae(vcpu));
> +	context->page_fault = paging_la57_page_fault;
> +	context->gva_to_gpa = paging_la57_gva_to_gpa;
> +	context->sync_page = paging_la57_sync_page;
> +	context->invlpg = paging_la57_invlpg;
> +	context->update_pte = paging_la57_update_pte;
> +	context->shadow_root_level = PT64_ROOT_5LEVEL;
> +	context->root_hpa = INVALID_PAGE;
> +	context->direct_map = false;

This should be using paging64_init_context_common.

Even better, paging64_init_context could do

	int root_level =
	    is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
	paging64_init_context_common(vcpu, context, root_level);

and then you can skip the change in kvm_init_shadow_mmu.

> +}
> +
>  static void paging32_init_context(struct kvm_vcpu *vcpu,
>  				  struct kvm_mmu *context)
>  {
> @@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>  		context->nx = false;
>  		context->gva_to_gpa = nonpaging_gva_to_gpa;
>  		context->root_level = 0;
> +	} else if (is_la57_mode(vcpu)) {
> +		context->nx = is_nx(vcpu);
> +		context->root_level = PT64_ROOT_5LEVEL;
> +		reset_rsvds_bits_mask(vcpu, context);
> +		context->gva_to_gpa = paging_la57_gva_to_gpa;

Please put the

	if (is_la57_mode(vcpu))

inside the is_long_mode branch below, since the only difference is
context->root_level.

>  	} else if (is_long_mode(vcpu)) {
>  		context->nx = is_nx(vcpu);
>  		context->root_level = PT64_ROOT_4LEVEL;
> @@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
>  
>  	if (!is_paging(vcpu))
>  		nonpaging_init_context(vcpu, context);
> +	else if (is_la57_mode(vcpu))
> +		paging_la57_init_context(vcpu, context);
>  	else if (is_long_mode(vcpu))
>  		paging64_init_context(vcpu, context);
>  	else if (is_pae(vcpu))
> @@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
>  
>  	update_permission_bitmask(vcpu, context, true);
>  	update_pkru_bitmask(vcpu, context, true);
> -	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
> +	reset_rsvds_bits_mask_ept(vcpu, context, execonly,
> +				  context->shadow_root_level);
>  	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
>  }
>  EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
> @@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
>  		g_context->nx = false;
>  		g_context->root_level = 0;
>  		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
> +	} else if (is_la57_mode(vcpu)) {
> +		g_context->nx = is_nx(vcpu);
> +		g_context->root_level = PT64_ROOT_5LEVEL;
> +		reset_rsvds_bits_mask(vcpu, g_context);
> +		g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested;

Same here.

>  	} else if (is_long_mode(vcpu)) {
>  		g_context->nx = is_nx(vcpu);
>  		g_context->root_level = PT64_ROOT_4LEVEL;
> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
> index 2e6996d..bb40094 100644
> --- a/arch/x86/kvm/mmu_audit.c
> +++ b/arch/x86/kvm/mmu_audit.c
> @@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
>  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>  		return;
>  
> -	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
> +	if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
> +	    vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {

As above, please use >= PT64_ROOT_4LEVEL here.

>  		hpa_t root = vcpu->arch.mmu.root_hpa;
>  
>  		sp = page_header(root);
> -		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
> +		__mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
>  		return;
>  	}
>  
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index a011054..c126cd3 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h

The changes to this file are not needed, since LA57 page tables have
the same format as LA48.

> @@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
>  	#define CMPXCHG cmpxchg64
>  	#define PT_MAX_FULL_LEVELS 2
>  	#endif
> +#elif PTTYPE == PTTYPE_LA57
> +	#define pt_element_t u64
> +	#define guest_walker guest_walker_la57
> +	#define FNAME(name) paging_la57_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
> +	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
> +	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> +	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> +	#define PT_MAX_FULL_LEVELS 5
> +	#define CMPXCHG cmpxchg
>  #elif PTTYPE == 32
>  	#define pt_element_t u32
>  	#define guest_walker guest_walker32
> @@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
>  static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
>  {
>  	unsigned pkeys = 0;
> -#if PTTYPE == 64
> +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
>  	pte_t pte = {.pte = gpte};
>  
>  	pkeys = pte_flags_pkey(pte_flags(pte));
> @@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  	walker->level = mmu->root_level;
>  	pte           = mmu->get_cr3(vcpu);
>  
> -#if PTTYPE == 64
> +#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
>  	if (walker->level == PT32E_ROOT_LEVEL) {
>  		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
>  		trace_kvm_mmu_paging_element(pte, walker->level);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 24db5fb..bfc9f0a 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
>  	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
>  }
>  
> +static inline bool cpu_has_vmx_ept_5levels(void)
> +{
> +	return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
> +}
> +
>  static inline bool cpu_has_vmx_ept_ad_bits(void)
>  {
>  	return vmx_capability.ept & VMX_EPT_AD_BIT;
> @@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
>  	vmx->emulation_required = emulation_required(vcpu);
>  }
>  
> +static int get_ept_level(void)
> +{
> +	if (cpu_has_vmx_ept_5levels())
> +		return VMX_EPT_MAX_GAW + 1;
> +	return VMX_EPT_DEFAULT_GAW + 1;
> +}
> +
>  static u64 construct_eptp(unsigned long root_hpa)
>  {
>  	u64 eptp;
>  
>  	/* TODO write the value reading from MSR */
>  	eptp = VMX_EPT_DEFAULT_MT |
> -		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
> +		(get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT;
>  	if (enable_ept_ad_bits)
>  		eptp |= VMX_EPT_AD_ENABLE_BIT;
>  	eptp |= (root_hpa & PAGE_MASK);

For nested virt you need to set the EPTP page-walk length to what L1
uses, so I think you need to add a root_level argument here and in
kvm_init_shadow_ept_mmu.

Paolo

> @@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn)
>  	}
>  }
>  
> -static int get_ept_level(void)
> -{
> -	return VMX_EPT_DEFAULT_GAW + 1;
> -}
> -
>  static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
>  {
>  	u8 cache;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index e8ff3e4..26627df 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
>  	return cs_l;
>  }
>  
> +static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
> +{
> +#ifdef CONFIG_X86_64
> +	return (vcpu->arch.efer & EFER_LMA) &&
> +		 kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
>  {
>  	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
>