MIME-Version: 1.0
In-Reply-To: <20161129204041.8839-4-khuey@kylehuey.com>
References: <20161129204041.8839-1-khuey@kylehuey.com> <20161129204041.8839-4-khuey@kylehuey.com>
From: David Matlack <dmatlack@google.com>
Date: Mon, 19 Dec 2016 12:54:35 -0800
Message-ID: <CALzav=cQbfQJAgXkcXxvHy1qMJiytzSyAa6+tXY+e8ZK92=ReA@mail.gmail.com>
Subject: Re: [PATCH v2 3/5] KVM: VMX: Move skip_emulated_instruction out of nested_vmx_check_vmcs12
To: Kyle Huey <me@kylehuey.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>,
        =?UTF-8?B?UmFkaW0gS3LEjW3DocWZ?= <rkrcmar@redhat.com>,
        Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@redhat.com>,
        "H. Peter Anvin" <hpa@zytor.com>, X86 ML <x86@kernel.org>,
        Joerg Roedel <joro@8bytes.org>, kvm list <kvm@vger.kernel.org>,
        "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Content-Type: text/plain; charset=UTF-8
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 12887
Lines: 295

On Tue, Nov 29, 2016 at 12:40 PM, Kyle Huey <me@kylehuey.com> wrote:
> We can't return both the pass/fail boolean for the vmcs and the upcoming
> continue/exit-to-userspace boolean for skip_emulated_instruction out of
> nested_vmx_check_vmcs, so move skip_emulated_instruction out of it instead.
>
> Additionally, VMENTER/VMRESUME only trigger singlestep exceptions when
> they advance the IP to the following instruction, not when they a) succeed,
> b) fail MSR validation or c) throw an exception. Add a separate call to
> skip_emulated_instruction that will later not be converted to the variant
> that checks the singlestep flag.
>
> Signed-off-by: Kyle Huey <khuey@kylehuey.com>
> ---
>  arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index f2f9cf5..f4f6304 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -7319,33 +7319,36 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
>   * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
>   * used before) all generate the same failure when it is missing.
>   */
>  static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
>  {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>         if (vmx->nested.current_vmptr == -1ull) {
>                 nested_vmx_failInvalid(vcpu);
> -               skip_emulated_instruction(vcpu);
>                 return 0;
>         }
>         return 1;
>  }
>
>  static int handle_vmread(struct kvm_vcpu *vcpu)
>  {
>         unsigned long field;
>         u64 field_value;
>         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>         u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
>         gva_t gva = 0;
>
> -       if (!nested_vmx_check_permission(vcpu) ||
> -           !nested_vmx_check_vmcs12(vcpu))
> +       if (!nested_vmx_check_permission(vcpu))
> +               return 1;
> +
> +       if (!nested_vmx_check_vmcs12(vcpu)) {
> +               skip_emulated_instruction(vcpu);
>                 return 1;
> +       }
>
>         /* Decode instruction info and find the field to read */
>         field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
>         /* Read the field, zero-extended to a u64 field_value */
>         if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
>                 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
>                 skip_emulated_instruction(vcpu);
>                 return 1;
> @@ -7383,20 +7386,24 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
>          * mode, and eventually we need to write that into a field of several
>          * possible lengths. The code below first zero-extends the value to 64
>          * bit (field_value), and then copies only the appropriate number of
>          * bits into the vmcs12 field.
>          */
>         u64 field_value = 0;
>         struct x86_exception e;
>
> -       if (!nested_vmx_check_permission(vcpu) ||
> -           !nested_vmx_check_vmcs12(vcpu))
> +       if (!nested_vmx_check_permission(vcpu))
>                 return 1;
>
> +       if (!nested_vmx_check_vmcs12(vcpu)) {
> +               skip_emulated_instruction(vcpu);
> +               return 1;
> +       }
> +
>         if (vmx_instruction_info & (1u << 10))
>                 field_value = kvm_register_readl(vcpu,
>                         (((vmx_instruction_info) >> 3) & 0xf));
>         else {
>                 if (get_vmx_mem_address(vcpu, exit_qualification,
>                                 vmx_instruction_info, false, &gva))
>                         return 1;
>                 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
> @@ -10041,21 +10048,22 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  {
>         struct vmcs12 *vmcs12;
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
>         int cpu;
>         struct loaded_vmcs *vmcs02;
>         bool ia32e;
>         u32 msr_entry_idx;
>
> -       if (!nested_vmx_check_permission(vcpu) ||
> -           !nested_vmx_check_vmcs12(vcpu))
> +       if (!nested_vmx_check_permission(vcpu))
>                 return 1;
>
> -       skip_emulated_instruction(vcpu);
> +       if (!nested_vmx_check_vmcs12(vcpu))
> +               goto out;
> +
>         vmcs12 = get_vmcs12(vcpu);
>
>         if (enable_shadow_vmcs)
>                 copy_shadow_to_vmcs12(vmx);
>
>         /*
>          * The nested entry process starts with enforcing various prerequisites
>          * on vmcs12 as required by the Intel SDM, and act appropriately when
> @@ -10065,43 +10073,43 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>          * To speed up the normal (success) code path, we should avoid checking
>          * for misconfigurations which will anyway be caught by the processor
>          * when using the merged vmcs02.
>          */
>         if (vmcs12->launch_state == launch) {
>                 nested_vmx_failValid(vcpu,
>                         launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
>                                : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
> -               return 1;
> +               goto out;
>         }
>
>         if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
>             vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
>                                 vmx->nested.nested_vmx_true_procbased_ctls_low,
>                                 vmx->nested.nested_vmx_procbased_ctls_high) ||
>             !vmx_control_verify(vmcs12->secondary_vm_exec_control,
>                                 vmx->nested.nested_vmx_secondary_ctls_low,
>                                 vmx->nested.nested_vmx_secondary_ctls_high) ||
> @@ -10111,36 +10119,36 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>             !vmx_control_verify(vmcs12->vm_exit_controls,
>                                 vmx->nested.nested_vmx_true_exit_ctls_low,
>                                 vmx->nested.nested_vmx_exit_ctls_high) ||
>             !vmx_control_verify(vmcs12->vm_entry_controls,
>                                 vmx->nested.nested_vmx_true_entry_ctls_low,
>                                 vmx->nested.nested_vmx_entry_ctls_high))
>         {
>                 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
>             ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
>                 nested_vmx_failValid(vcpu,
>                         VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
> -               return 1;
> +               goto out;
>         }
>
>         if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
>             ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
>                 nested_vmx_entry_failure(vcpu, vmcs12,
>                         EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
> -               return 1;
> +               goto out;

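It's a bug to call kvm_skip_emulated_instruction() after
nested_vmx_entry_failure(): the latter emulates a VM-exit to L1, so RIP
has already been loaded from vmcs12->host_rip, and skipping would resume
L1 some number of bytes past it. I sent out "KVM: nVMX: fix instruction
skipping during emulated vm-entry" with a fix.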

>         }
>         if (vmcs12->vmcs_link_pointer != -1ull) {
>                 nested_vmx_entry_failure(vcpu, vmcs12,
>                         EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
> -               return 1;
> +               goto out;
>         }
>
>         /*
>          * If the load IA32_EFER VM-entry control is 1, the following checks
>          * are performed on the field for the IA32_EFER MSR:
>          * - Bits reserved in the IA32_EFER MSR must be 0.
>          * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
>          *   the IA-32e mode guest VM-exit control. It must also be identical
> @@ -10150,17 +10158,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>         if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
>                 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
>                 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
>                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
>                     ((vmcs12->guest_cr0 & X86_CR0_PG) &&
>                      ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
>                         nested_vmx_entry_failure(vcpu, vmcs12,
>                                 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
> -                       return 1;
> +                       goto out;
>                 }
>         }
>
>         /*
>          * If the load IA32_EFER VM-exit control is 1, bits reserved in the
>          * IA32_EFER MSR must be 0 in the field for that register. In addition,
>          * the values of the LMA and LME bits in the field must each be that of
>          * the host address-space size VM-exit control.
> @@ -10168,29 +10176,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
>                 ia32e = (vmcs12->vm_exit_controls &
>                          VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
>                 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
>                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
>                     ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
>                         nested_vmx_entry_failure(vcpu, vmcs12,
>                                 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
> -                       return 1;
> +                       goto out;
>                 }
>         }
>
>         /*
>          * We're finally done with prerequisite checking, and can start with
>          * the nested entry.
>          */
>
>         vmcs02 = nested_get_current_vmcs02(vmx);
>         if (!vmcs02)
>                 return -ENOMEM;
>
> +       skip_emulated_instruction(vcpu);

I wonder if we can drop this call. Advancing RIP past the
VMLAUNCH/VMRESUME instruction has no lasting effect, because RIP is
overwritten when we enter L2 and L1 will later be resumed at
vmcs12->host_rip. But skip_emulated_instruction() has a second side
effect, which is to clear the interrupt shadow, and I'm not sure
whether that is important here.

>         enter_guest_mode(vcpu);
>
>         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
>                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
>
>         cpu = get_cpu();
>         vmx->loaded_vmcs = vmcs02;
>         vmx_vcpu_put(vcpu);
> @@ -10222,16 +10231,20 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>
>         /*
>          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
>          * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
>          * returned as far as L1 is concerned. It will only return (and set
>          * the success flag) when L2 exits (see nested_vmx_vmexit()).
>          */
>         return 1;
> +
> +out:
> +       skip_emulated_instruction(vcpu);
> +       return 1;
>  }
>
>  /*
>   * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
>   * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
>   * This function returns the new value we should put in vmcs12.guest_cr0.
>   * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
>   *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
> --
> 2.10.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html