From: Paolo Bonzini <pbonzini@redhat.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Cc: wanpeng.li@hotmail.com, david@redhat.com, rkrcmar@redhat.com,
	jmattson@google.com
Subject: [PATCH 4/4] KVM: x86: allow overwriting L2 reinjected exception with L1 vmexit
Date: Wed, 23 Aug 2017 22:43:58 +0200
Message-Id: <1503521038-21073-5-git-send-email-pbonzini@redhat.com>
X-Mailer: git-send-email 1.8.3.1
In-Reply-To: <1503521038-21073-1-git-send-email-pbonzini@redhat.com>
References: <1503521038-21073-1-git-send-email-pbonzini@redhat.com>

A reinjected exception is already recorded in either the IDT-vectored
event information fields or the EXITINTINFO fields; if the handling of
an exception in L0 causes a vmexit to L1, we don't really need to keep
the reinjected exception in vcpu->arch.exception.  Teach
kvm_multiple_exception to recognize this scenario through a new
kvm_x86_ops callback.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
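[Not part of the diff: as a quick illustration of the check that
kvm_multiple_exception gains below, here is a standalone userspace
sketch of the decision.  The struct, the l1_intercepts() helper and the
double-fault merging are simplified stand-ins for the real
kvm_queued_exception plumbing, not KVM code.

#include <stdbool.h>
#include <stdio.h>

struct queued_exception {
	unsigned int nr;
	bool pending;
	bool reinject;	/* already latched in IDT-vectoring/EXITINTINFO */
};

/* stand-in for kvm_x86_ops->nested_check_exception() */
static bool l1_intercepts(unsigned int nr, unsigned int exception_bitmap)
{
	return exception_bitmap & (1u << nr);
}

static void queue_exception(struct queued_exception *cur, unsigned int nr,
			    bool guest_mode, unsigned int exception_bitmap)
{
	if (cur->pending && cur->reinject && guest_mode &&
	    l1_intercepts(nr, exception_bitmap)) {
		/*
		 * New behavior: the reinjected exception is already
		 * recorded in the vmexit interruption-information
		 * fields, so the new exception, which causes an
		 * L0->L1 vmexit, can simply overwrite it.
		 */
		printf("overwrite reinjected #%u with #%u (L1 vmexit)\n",
		       cur->nr, nr);
	} else if (cur->pending) {
		/* old path: two pending exceptions merge into #DF */
		printf("merge #%u and #%u into #DF\n", cur->nr, nr);
		nr = 8;	/* DF_VECTOR */
	}
	cur->nr = nr;
	cur->pending = true;
	cur->reinject = false;
}

int main(void)
{
	/* a #DB (vector 1) was reinjected after an L2->L0 exit... */
	struct queued_exception ex = { .nr = 1, .pending = true,
				       .reinject = true };

	/* ...and handling it in L0 raises a #PF that L1 intercepts */
	queue_exception(&ex, 14, true, 1u << 14);
	return 0;
}

With the reinject flag clear, or with #PF absent from L1's exception
bitmap, the same call would instead take the double-fault path; that is
exactly what the new early "goto queue" below avoids.]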
 arch/x86/include/asm/kvm_host.h |  2 +
 arch/x86/kvm/svm.c              | 79 +++++++++++++++++++-------------------
 arch/x86/kvm/vmx.c              | 84 +++++++++++++++++++++--------------------
 arch/x86/kvm/x86.c              |  9 +++++
 4 files changed, 95 insertions(+), 79 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6db0ed9cf59e..643308143bea 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -962,6 +962,8 @@ struct kvm_x86_ops {
 				unsigned char *hypercall_addr);
 	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
+	int (*nested_check_exception)(struct kvm_vcpu *vcpu,
+				      struct kvm_queued_exception *ex);
 	void (*queue_exception)(struct kvm_vcpu *vcpu);
 	void (*cancel_injection)(struct kvm_vcpu *vcpu);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7e190b21a30b..32c8d8f62985 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -291,8 +291,6 @@ struct amd_svm_iommu_ir {
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
 static int nested_svm_intercept(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
-				      bool has_error_code, u32 error_code);
 
 enum {
 	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -650,6 +648,42 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	svm_set_interrupt_shadow(vcpu, 0);
 }
 
+static void nested_svm_queue_exception(struct kvm_vcpu *vcpu,
+				       struct kvm_queued_exception *ex)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->nr;
+	svm->vmcb->control.exit_code_hi = 0;
+	svm->vmcb->control.exit_info_1 = ex->error_code;
+
+	/*
+	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
+	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+	 * written only when inject_pending_event runs (DR6 would be written here
+	 * too).  This should be conditional on a new capability---if the
+	 * capability is disabled, kvm_multiple_exception would write the
+	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+	 */
+	if (ex->nested_apf)
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+	else
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+	svm->nested.exit_required = true;
+}
+
+static int nested_svm_check_exception(struct kvm_vcpu *vcpu,
+				      struct kvm_queued_exception *ex)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned int nr = ex->nr;
+
+	return ((svm->nested.intercept_exceptions & (1 << nr)) ||
+		(nr == PF_VECTOR && ex->nested_apf));
+}
+
 static void svm_queue_exception(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -662,9 +696,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	 * If we are within a nested VM we'd better #VMEXIT and let the guest
 	 * handle the exception
 	 */
-	if (!reinject &&
-	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
+	if (!reinject && is_guest_mode(vcpu) &&
+	    nested_svm_check_exception(vcpu, &vcpu->arch.exception)) {
+		nested_svm_queue_exception(vcpu, &vcpu->arch.exception);
 		return;
+	}
 
 	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
@@ -2428,40 +2464,6 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 	return 0;
 }
 
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
-				      bool has_error_code, u32 error_code)
-{
-	int vmexit;
-
-	if (!is_guest_mode(&svm->vcpu))
-		return 0;
-
-	vmexit = nested_svm_intercept(svm);
-	if (vmexit != NESTED_EXIT_DONE)
-		return 0;
-
-	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
-	svm->vmcb->control.exit_code_hi = 0;
-	svm->vmcb->control.exit_info_1 = error_code;
-
-	/*
-	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
-	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
-	 * written only when inject_pending_event runs (DR6 would written here
-	 * too).  This should be conditional on a new capability---if the
-	 * capability is disabled, kvm_multiple_exception would write the
-	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
-	 */
-	if (svm->vcpu.arch.exception.nested_apf)
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
-	else
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
-	svm->nested.exit_required = true;
-	return vmexit;
-}
-
 /* This function returns true if it is save to enable the irq window */
 static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
@@ -5448,6 +5450,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
 	.patch_hypercall = svm_patch_hypercall,
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
+	.nested_check_exception = nested_svm_check_exception,
 	.queue_exception = svm_queue_exception,
 	.cancel_injection = svm_cancel_injection,
 	.interrupt_allowed = svm_interrupt_allowed,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f8ef38094acc..ddabed8425b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2438,15 +2438,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
-static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
-					       unsigned long exit_qual)
+static void nested_vmx_queue_exception(struct kvm_vcpu *vcpu,
+				       struct kvm_queued_exception *ex)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	unsigned int nr = vcpu->arch.exception.nr;
-	u32 intr_info = nr | INTR_INFO_VALID_MASK;
+	unsigned int nr = ex->nr;
+	u32 intr_info = nr | INTR_INFO_VALID_MASK;
+	unsigned long exit_qual = 0;
 
-	if (vcpu->arch.exception.has_error_code) {
-		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+	/*
+	 * FIXME: we must not write CR2 or DR6 when L1 intercepts an L2 #PF
+	 * or #DB exception.
+	 *
+	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
+	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+	 * can be written only when inject_pending_event runs.  This should be
+	 * conditional on a new capability---if the capability is disabled,
+	 * kvm_multiple_exception would write the ancillary information to
+	 * CR2 or DR6, for backwards ABI-compatibility.
+	 */
+	switch (nr) {
+	case PF_VECTOR:
+		if (ex->nested_apf)
+			exit_qual = vcpu->arch.apf.nested_apf_token;
+		else
+			exit_qual = vcpu->arch.cr2;
+		break;
+	case DB_VECTOR:
+		exit_qual = vcpu->arch.dr6;
+		break;
+	default:
+		exit_qual = 0;
+	}
+
+	if (ex->has_error_code) {
+		vmcs12->vm_exit_intr_error_code = ex->error_code;
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 	}
 
@@ -2466,43 +2492,16 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
  */
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu,
+				      struct kvm_queued_exception *ex)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	unsigned int nr = vcpu->arch.exception.nr;
+	unsigned int nr = ex->nr;
 
-	if (nr == PF_VECTOR) {
-		if (vcpu->arch.exception.nested_apf) {
-			nested_vmx_inject_exception_vmexit(vcpu,
-							   vcpu->arch.apf.nested_apf_token);
-			return 1;
-		}
-		/*
-		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
-		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
-		 * can be written only when inject_pending_event runs.  This should be
-		 * conditional on a new capability---if the capability is disabled,
-		 * kvm_multiple_exception would write the ancillary information to
-		 * CR2 or DR6, for backwards ABI-compatibility.
-		 */
-		if (nested_vmx_is_page_fault_vmexit(vmcs12,
-						    vcpu->arch.exception.error_code)) {
-			nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
-			return 1;
-		}
-	} else {
-		unsigned long exit_qual = 0;
-		if (nr == DB_VECTOR)
-			exit_qual = vcpu->arch.dr6;
-
-		if (vmcs12->exception_bitmap & (1u << nr)) {
-			nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-			return 1;
-		}
-	}
-
-	return 0;
+	return (nr == PF_VECTOR
+		? (ex->nested_apf ||
+		   nested_vmx_is_page_fault_vmexit(vmcs12, ex->error_code))
+		: vmcs12->exception_bitmap & (1u << nr));
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2515,8 +2514,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
 	if (!reinject && is_guest_mode(vcpu) &&
-	    nested_vmx_check_exception(vcpu))
+	    nested_vmx_check_exception(vcpu, &vcpu->arch.exception)) {
+		nested_vmx_queue_exception(vcpu, &vcpu->arch.exception);
 		return;
+	}
 
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -11886,6 +11887,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 	.patch_hypercall = vmx_patch_hypercall,
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
+	.nested_check_exception = nested_vmx_check_exception,
 	.queue_exception = vmx_queue_exception,
 	.cancel_injection = vmx_cancel_injection,
 	.interrupt_allowed = vmx_interrupt_allowed,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88b91114c5a8..55c709531eb9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -390,6 +390,15 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	if (!vcpu->arch.exception.pending)
 		goto queue;
 
+	/*
+	 * If the previous exception was recorded during an L2->L0
+	 * exit, we can overwrite it with one that causes an L0->L1
+	 * nested vmexit.
+	 */
+	if (vcpu->arch.exception.reinject && is_guest_mode(vcpu) &&
+	    kvm_x86_ops->nested_check_exception(vcpu, &ex))
+		goto queue;
+
 	/* to check exception */
 	prev_nr = vcpu->arch.exception.nr;
 	if (prev_nr == DF_VECTOR) {
-- 
1.8.3.1