Subject: Re: [PATCH 1/4] KVM: nVMX: support restore of VMX capability MSRs
From: Paolo Bonzini
To: David Matlack, kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, jmattson@google.com, rkrcmar@redhat.com
Date: Wed, 23 Nov 2016 12:44:31 +0100

On 23/11/2016 02:14, David Matlack wrote:
> The VMX capability MSRs advertise the set of features the KVM virtual
> CPU can support. This set of features varies across different host CPUs
> and KVM versions. This patch aims to address both sources of
> differences, allowing VMs to be migrated across CPUs and KVM versions
> without guest-visible changes to these MSRs. Note that cross-KVM-version
> migration is only supported from this point forward.
>
> When the VMX capability MSRs are restored, they are audited to check
> that the set of features advertised is a subset of what KVM and the
> CPU support.
>
> Since the VMX capability MSRs are read-only, they do not need to be on
> the default MSR save/restore lists. The userspace hypervisor can set
> the values of these MSRs or read them from KVM at VCPU creation time,
> and restore the same value after every save/restore.
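As an aside for anyone implementing the userspace side described in the
last paragraph: the flow is just KVM_GET_MSRS on the vcpu fd at VCPU
creation time and KVM_SET_MSRS with the saved values on the destination
before the guest runs. A minimal sketch follows; the helper names, the
exact MSR index range, and the lack of error handling are illustrative
assumptions of mine, not code from this series.

/*
 * Illustrative userspace sketch (not part of the patch): read the VMX
 * capability MSRs once at VCPU creation, then write the same values
 * back after every save/restore so the guest never sees them change.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define VMX_CAP_MSR_FIRST 0x480                /* MSR_IA32_VMX_BASIC */
#define VMX_CAP_MSR_LAST  0x491                /* MSR_IA32_VMX_VMFUNC */
#define VMX_CAP_MSR_COUNT (VMX_CAP_MSR_LAST - VMX_CAP_MSR_FIRST + 1)

static struct kvm_msrs *alloc_vmx_cap_msrs(void)
{
        struct kvm_msrs *msrs;
        int i;

        msrs = calloc(1, sizeof(*msrs) +
                         VMX_CAP_MSR_COUNT * sizeof(struct kvm_msr_entry));
        msrs->nmsrs = VMX_CAP_MSR_COUNT;
        for (i = 0; i < VMX_CAP_MSR_COUNT; i++)
                msrs->entries[i].index = VMX_CAP_MSR_FIRST + i;
        return msrs;
}

/* At VCPU creation on the source: snapshot what KVM advertises. */
static int save_vmx_caps(int vcpu_fd, struct kvm_msrs *msrs)
{
        return ioctl(vcpu_fd, KVM_GET_MSRS, msrs);    /* returns #read */
}

/* On the destination, before the guest runs: restore the snapshot. */
static int restore_vmx_caps(int vcpu_fd, struct kvm_msrs *msrs)
{
        return ioctl(vcpu_fd, KVM_SET_MSRS, msrs);    /* returns #written */
}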
>
> Signed-off-by: David Matlack
> ---
>  arch/x86/include/asm/vmx.h |  31 +++++
>  arch/x86/kvm/vmx.c         | 317 +++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 324 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index a002b07..a4ca897 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -25,6 +25,7 @@
>  #define VMX_H
>
>
> +#include 
>  #include 
>  #include 
>
> @@ -110,6 +111,36 @@
>  #define VMX_MISC_SAVE_EFER_LMA          0x00000020
>  #define VMX_MISC_ACTIVITY_HLT           0x00000040
>
> +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
> +{
> +        return vmx_basic & GENMASK_ULL(30, 0);
> +}
> +
> +static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
> +{
> +        return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
> +}
> +
> +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
> +{
> +        return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
> +}
> +
> +static inline int vmx_misc_cr3_count(u64 vmx_misc)
> +{
> +        return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
> +}
> +
> +static inline int vmx_misc_max_msr(u64 vmx_misc)
> +{
> +        return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
> +}
> +
> +static inline int vmx_misc_mseg_revid(u64 vmx_misc)
> +{
> +        return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
> +}
> +
>  /* VMCS Encodings */
>  enum vmcs_field {
>          VIRTUAL_PROCESSOR_ID            = 0x00000000,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 5382b82..6ec3832 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -463,6 +463,12 @@ struct nested_vmx {
>          u32 nested_vmx_misc_high;
>          u32 nested_vmx_ept_caps;
> +/*
> + * Called when userspace is restoring VMX MSRs.
> + *
> + * Returns 0 on success, non-0 otherwise.
> + */
> +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
>  {
>          struct vcpu_vmx *vmx = to_vmx(vcpu);
>
>          switch (msr_index) {
>          case MSR_IA32_VMX_BASIC:
> +                return vmx_restore_vmx_basic(vmx, data);
> +        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
> +        case MSR_IA32_VMX_PINBASED_CTLS:
> +        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
> +        case MSR_IA32_VMX_PROCBASED_CTLS:
> +        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
> +        case MSR_IA32_VMX_EXIT_CTLS:
> +        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
> +        case MSR_IA32_VMX_ENTRY_CTLS:

PINBASED_CTLS, PROCBASED_CTLS, EXIT_CTLS and ENTRY_CTLS can be derived
from their "true" counterparts, so I think it's better to remove the
"non-true" ones from struct nested_vmx (and/or add the "true" ones when
missing) and make them entirely computed.  But it can be done on top.

Paolo
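To make the suggestion above concrete: the non-true control MSRs differ
from the true ones only in that the default1 class of controls reads as
"must be 1" in the allowed-0 half, so they can be computed on the fly
instead of being stored. A rough sketch of that derivation, assuming
kernel context in arch/x86/kvm/vmx.c, the usual
*_ALWAYSON_WITHOUT_TRUE_MSR constants from vmx.h and the existing
vmx_control_msr() helper; the function itself is made up for
illustration and is not code from this series:

/*
 * Illustration only: derive a "non-true" control MSR from its "true"
 * counterpart by forcing the default1 controls to 1 in the allowed-0
 * settings (low 32 bits); the allowed-1 settings (high 32 bits) match.
 */
static u64 vmx_nontrue_ctls(u32 msr_index, u32 true_low, u32 true_high)
{
        u32 default1;

        switch (msr_index) {
        case MSR_IA32_VMX_PINBASED_CTLS:
                default1 = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS:
                default1 = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_EXIT_CTLS:
                default1 = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        case MSR_IA32_VMX_ENTRY_CTLS:
                default1 = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
                break;
        default:
                default1 = 0;
                break;
        }

        return vmx_control_msr(true_low | default1, true_high);
}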
> +        case MSR_IA32_VMX_PROCBASED_CTLS2:
> +                return vmx_restore_control_msr(vmx, msr_index, data);
> +        case MSR_IA32_VMX_MISC:
> +                return vmx_restore_vmx_misc(vmx, data);
> +        case MSR_IA32_VMX_CR0_FIXED0:
> +        case MSR_IA32_VMX_CR4_FIXED0:
> +                return vmx_restore_fixed0_msr(vmx, msr_index, data);
> +        case MSR_IA32_VMX_CR0_FIXED1:
> +        case MSR_IA32_VMX_CR4_FIXED1:
> +                return vmx_restore_fixed1_msr(vmx, msr_index, data);
> +        case MSR_IA32_VMX_EPT_VPID_CAP:
> +                return vmx_restore_vmx_ept_vpid_cap(vmx, data);
> +        case MSR_IA32_VMX_VMCS_ENUM:
> +                vmx->nested.nested_vmx_vmcs_enum = data;
> +                return 0;
> +        default:
>                  /*
> -                 * This MSR reports some information about VMX support. We
> -                 * should return information about the VMX we emulate for the
> -                 * guest, and the VMCS structure we give it - not about the
> -                 * VMX support of the underlying hardware.
> +                 * The rest of the VMX capability MSRs do not support restore.
>                   */
> -                *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
> -                           ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
> -                           (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
> -                if (cpu_has_vmx_basic_inout())
> -                        *pdata |= VMX_BASIC_INOUT;
> +                return -EINVAL;
> +        }
> +
> +        return 0;
> +}
> +
> +/* Returns 0 on success, non-0 otherwise. */
> +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
> +{
> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +        switch (msr_index) {
> +        case MSR_IA32_VMX_BASIC:
> +                *pdata = vmx->nested.nested_vmx_basic;
>                  break;
>          case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
>          case MSR_IA32_VMX_PINBASED_CTLS:
> @@ -2904,27 +3176,20 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>                          vmx->nested.nested_vmx_misc_low,
>                          vmx->nested.nested_vmx_misc_high);
>                  break;
> -        /*
> -         * These MSRs specify bits which the guest must keep fixed (on or off)
> -         * while L1 is in VMXON mode (in L1's root mode, or running an L2).
> -         * We picked the standard core2 setting.
> -         */
> -#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
> -#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
>          case MSR_IA32_VMX_CR0_FIXED0:
> -                *pdata = VMXON_CR0_ALWAYSON;
> +                *pdata = vmx->nested.nested_vmx_cr0_fixed0;
>                  break;
>          case MSR_IA32_VMX_CR0_FIXED1:
> -                *pdata = -1ULL;
> +                *pdata = vmx->nested.nested_vmx_cr0_fixed1;
>                  break;
>          case MSR_IA32_VMX_CR4_FIXED0:
> -                *pdata = VMXON_CR4_ALWAYSON;
> +                *pdata = vmx->nested.nested_vmx_cr4_fixed0;
>                  break;
>          case MSR_IA32_VMX_CR4_FIXED1:
> -                *pdata = -1ULL;
> +                *pdata = vmx->nested.nested_vmx_cr4_fixed1;
>                  break;
>          case MSR_IA32_VMX_VMCS_ENUM:
> -                *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
> +                *pdata = vmx->nested.nested_vmx_vmcs_enum;
>                  break;
>          case MSR_IA32_VMX_PROCBASED_CTLS2:
>                  *pdata = vmx_control_msr(
> @@ -3107,7 +3372,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                  vmx_leave_nested(vcpu);
>                  break;
>          case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
> -                return 1; /* they are read-only */
> +                if (!msr_info->host_initiated)
> +                        return 1; /* they are read-only */
> +                if (!nested_vmx_allowed(vcpu))
> +                        return 1;
> +                return vmx_set_vmx_msr(vcpu, msr_index, data);
>          case MSR_IA32_XSS:
>                  if (!vmx_xsaves_supported())
>                          return 1;
>
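One more note for readers of the quoted hunks: the vmx_restore_*()
helpers called from vmx_set_vmx_msr() are not shown above. The audit
they perform on a control MSR boils down to a bitwise-subset check on
each half of the value, roughly along the lines below. This is a rough
sketch in kernel context, not the series' actual code; the helper and
function names are illustrative.

/*
 * Rough illustration of the restore-time audit for a VMX control MSR.
 * The low 32 bits are the allowed-0 settings (a set bit means the
 * control must be 1); the high 32 bits are the allowed-1 settings (a
 * set bit means the control may be 1).  Userspace may only tighten
 * what KVM supports: it must keep every bit KVM requires to be 1, and
 * must not allow any control KVM does not allow.
 */
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
        superset &= mask;
        subset &= mask;

        return (superset | subset) == superset;
}

static int audit_restored_control_msr(u32 kvm_low, u32 kvm_high, u64 data)
{
        u64 kvm_supported = vmx_control_msr(kvm_low, kvm_high);

        /* Every must-be-1 bit KVM reports must still be 1. */
        if (!is_bitwise_subset(data, kvm_supported, GENMASK_ULL(31, 0)))
                return -EINVAL;

        /* No allowed-1 bit beyond what KVM reports as allowed. */
        if (!is_bitwise_subset(kvm_supported, data, GENMASK_ULL(63, 32)))
                return -EINVAL;

        return 0;
}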