Re-inject INT3/INTO instead of retrying the instruction if the CPU
encountered an intercepted exception while vectoring the software
exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
shadow paging. Retrying the instruction is architecturally wrong, e.g.
will result in a spurious #DB if there's a code breakpoint on the INT3/O,
and lack of re-injection also breaks nested virtualization, e.g. if L1
injects a software exception and vectoring the injected exception
encounters an exception that is intercepted by L0 but not L1.
Due to, ahem, deficiencies in the SVM architecture, acquiring the next
RIP may require flowing through the emulator even if NRIPS is supported,
as the CPU clears next_rip if the VM-Exit is due to an exception other
than "exceptions caused by the INT3, INTO, and BOUND instructions". To
deal with this, "skip" the instruction to calculate next_rip (if it's
not already known), and then unwind the RIP write and any side effects
(RFLAGS updates).
Save the computed next_rip and use it to re-stuff next_rip if injection
doesn't complete. This allows KVM to do the right thing if next_rip was
known prior to injection, e.g. if L1 injects a soft event into L2, and
there is no backing INTn instruction, e.g. if L1 is injecting an
arbitrary event.
Note, it's impossible to guarantee architectural correctness given SVM's
architectural flaws. E.g. if the guest executes INTn (no KVM injection),
an exit occurs while vectoring the INTn, and the guest modifies the code
stream while the exit is being handled, KVM will compute the incorrect
next_rip due to "skipping" the wrong instruction. A future enhancement
to make this less awful would be for KVM to detect that the decoded
instruction is not the correct INTn and drop the to-be-injected soft
event (retrying is a lesser evil compared to shoving the wrong RIP on the
exception stack).
Reported-by: Maciej S. Szmigiero <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/kvm/svm/nested.c | 28 +++++++-
arch/x86/kvm/svm/svm.c | 140 +++++++++++++++++++++++++++-----------
arch/x86/kvm/svm/svm.h | 6 +-
3 files changed, 130 insertions(+), 44 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 461c5f247801..0163238aa198 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
}
}
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ /*
+ * Intentionally return false for SOFT events, SVM doesn't yet support
+ * re-injecting soft interrupts.
+ */
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
unsigned long vmcb12_rip)
{
@@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (svm->nrips_enabled)
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
if (svm->lbrv_enabled)
@@ -849,6 +874,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
out_exit_err:
svm->nested.nested_run_pending = 0;
+ svm->soft_int_injected = false;
svm->vmcb->control.exit_code = SVM_EXIT_ERR;
svm->vmcb->control.exit_code_hi = 0;
@@ -1618,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
- nested_vmcb02_prepare_control(svm, save->rip);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);
/*
* While the nested guest CR3 is already checked and set by
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 14bc4e87437b..8321f9ce5e35 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -342,9 +342,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
-static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
/*
* SEV-ES does not expose the next RIP. The RIP update is controlled by
@@ -359,18 +361,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
if (!svm->next_rip) {
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
} else {
kvm_rip_write(vcpu, svm->next_rip);
}
done:
- svm_set_interrupt_shadow(vcpu, 0);
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
return 1;
}
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -380,25 +439,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu);
- if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(vcpu);
-
- /*
- * For guest debugging where we have to reinject #BP if some
- * INT3 is guest-owned:
- * Emulate nRIP by moving RIP forward. Will fail if injection
- * raises a fault that is not intercepted. Still better than
- * failing in all cases.
- */
- (void)svm_skip_emulated_instruction(vcpu);
- rip = kvm_rip_read(vcpu);
-
- if (boot_cpu_has(X86_FEATURE_NRIPS))
- svm->vmcb->control.next_rip = rip;
-
- svm->int3_rip = rip + svm->vmcb->save.cs.base;
- svm->int3_injected = rip - old_rip;
- }
+ if (kvm_exception_is_soft(nr) &&
+ svm_update_soft_interrupt_rip(vcpu))
+ return;
svm->vmcb->control.event_inj = nr
| SVM_EVTINJ_VALID
@@ -3671,15 +3714,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ int type)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
+ * associated with the original soft exception/interrupt. next_rip is
+ * cleared on all exits that can occur while vectoring an event, so KVM
+ * needs to manually set next_rip for re-injection. Unlike the !nrips
+ * case below, this needs to be done if and only if KVM is re-injecting
+ * the same event, i.e. if the event is a soft exception/interrupt,
+ * otherwise next_rip is unused on VMRUN.
+ */
+ if (nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ kvm_exception_is_soft(vector) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
+ svm->vmcb->control.next_rip = svm->soft_int_next_rip;
+ /*
+ * If NRIPS isn't enabled, KVM must manually advance RIP prior to
+ * injecting the soft exception/interrupt. That advancement needs to
+ * be unwound if vectoring didn't complete. Note, the new event may
+ * not be the injected event, e.g. if KVM injected an INTn, the INTn
+ * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
+ * be the reported vectored event, but RIP still needs to be unwound.
+ */
+ else if (!nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
+ kvm_rip_write(vcpu, svm->soft_int_old_rip);
+}
+
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
- unsigned int3_injected = svm->int3_injected;
+ bool soft_int_injected = svm->soft_int_injected;
- svm->int3_injected = 0;
+ svm->soft_int_injected = false;
/*
* If we've made progress since setting HF_IRET_MASK, we've
@@ -3704,17 +3778,8 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
- /*
- * If NextRIP isn't enabled, KVM must manually advance RIP prior to
- * injecting the soft exception/interrupt. That advancement needs to
- * be unwound if vectoring didn't complete. Note, the new event may
- * not be the injected event, e.g. if KVM injected an INTn, the INTn
- * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
- * be the reported vectored event, but RIP still needs to be unwound.
- */
- if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
- kvm_is_linear_rip(vcpu, svm->int3_rip))
- kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
+ if (soft_int_injected)
+ svm_complete_soft_interrupt(vcpu, vector, type);
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
@@ -3727,13 +3792,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
if (vector == X86_TRAP_VC)
break;
- /*
- * In case of software exceptions, do not reinject the vector,
- * but re-execute the instruction instead.
- */
- if (kvm_exception_is_soft(vector))
- break;
-
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
kvm_requeue_exception_e(vcpu, vector, err);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7d97e4d18c8b..6acb494e3598 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -230,8 +230,10 @@ struct vcpu_svm {
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
- unsigned int3_injected;
- unsigned long int3_rip;
+ unsigned long soft_int_csbase;
+ unsigned long soft_int_old_rip;
+ unsigned long soft_int_next_rip;
+ bool soft_int_injected;
/* optional nested SVM features that are enabled for this guest */
bool nrips_enabled : 1;
--
2.36.0.rc2.479.g8af0fa9b8e-goog
On 23.04.2022 04:14, Sean Christopherson wrote:
> Re-inject INT3/INTO instead of retrying the instruction if the CPU
> encountered an intercepted exception while vectoring the software
> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
> shadow paging. Retrying the instruction is architecturally wrong, e.g.
> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
> and lack of re-injection also breaks nested virtualization, e.g. if L1
> injects a software exception and vectoring the injected exception
> encounters an exception that is intercepted by L0 but not L1.
>
> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
> RIP may require flowing through the emulator even if NRIPS is supported,
> as the CPU clears next_rip if the VM-Exit is due to an exception other
> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
> deal with this, "skip" the instruction to calculate next_rip (if it's
> not already known), and then unwind the RIP write and any side effects
> (RFLAGS updates).
>
> Save the computed next_rip and use it to re-stuff next_rip if injection
> doesn't complete. This allows KVM to do the right thing if next_rip was
> known prior to injection, e.g. if L1 injects a soft event into L2, and
> there is no backing INTn instruction, e.g. if L1 is injecting an
> arbitrary event.
>
> Note, it's impossible to guarantee architectural correctness given SVM's
> architectural flaws. E.g. if the guest executes INTn (no KVM injection),
> an exit occurs while vectoring the INTn, and the guest modifies the code
> stream while the exit is being handled, KVM will compute the incorrect
> next_rip due to "skipping" the wrong instruction. A future enhancement
> to make this less awful would be for KVM to detect that the decoded
> instruction is not the correct INTn and drop the to-be-injected soft
> event (retrying is a lesser evil compared to shoving the wrong RIP on the
> exception stack).
>
> Reported-by: Maciej S. Szmigiero <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 28 +++++++-
> arch/x86/kvm/svm/svm.c | 140 +++++++++++++++++++++++++++-----------
> arch/x86/kvm/svm/svm.h | 6 +-
> 3 files changed, 130 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 461c5f247801..0163238aa198 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
> }
> }
>
> +static inline bool is_evtinj_soft(u32 evtinj)
> +{
> + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
> + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
> +
> + if (!(evtinj & SVM_EVTINJ_VALID))
> + return false;
> +
> + /*
> + * Intentionally return false for SOFT events, SVM doesn't yet support
> + * re-injecting soft interrupts.
> + */
> + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
> +}
> +
> static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> unsigned long vmcb12_rip)
> {
> @@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> else if (boot_cpu_has(X86_FEATURE_NRIPS))
> vmcb02->control.next_rip = vmcb12_rip;
>
> + if (is_evtinj_soft(vmcb02->control.event_inj)) {
> + svm->soft_int_injected = true;
> + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> + svm->soft_int_old_rip = vmcb12_rip;
> + if (svm->nrips_enabled)
> + svm->soft_int_next_rip = svm->nested.ctl.next_rip;
> + else
> + svm->soft_int_next_rip = vmcb12_rip;
The above branch can be actually reduced to simply
"svm->soft_int_next_rip = vmcb02->control.next_rip" if NRIPS is required
for (at least) nSVM.
Thanks,
Maciej
On 28.04.2022 11:37, Maxim Levitsky wrote:
> On Sat, 2022-04-23 at 02:14 +0000, Sean Christopherson wrote:
>> Re-inject INT3/INTO instead of retrying the instruction if the CPU
>> encountered an intercepted exception while vectoring the software
>> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
>> shadow paging. Retrying the instruction is architecturally wrong, e.g.
>> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
>> and lack of re-injection also breaks nested virtualization, e.g. if L1
>> injects a software exception and vectoring the injected exception
>> encounters an exception that is intercepted by L0 but not L1.
>>
>> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
>> RIP may require flowing through the emulator even if NRIPS is supported,
>> as the CPU clears next_rip if the VM-Exit is due to an exception other
>> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
>> deal with this, "skip" the instruction to calculate next_rip (if it's
>> not already known), and then unwind the RIP write and any side effects
>> (RFLAGS updates).
>>
>> Save the computed next_rip and use it to re-stuff next_rip if injection
>> doesn't complete. This allows KVM to do the right thing if next_rip was
>> known prior to injection, e.g. if L1 injects a soft event into L2, and
>> there is no backing INTn instruction, e.g. if L1 is injecting an
>> arbitrary event.
>>
>> Note, it's impossible to guarantee architectural correctness given SVM's
>> architectural flaws. E.g. if the guest executes INTn (no KVM injection),
>> an exit occurs while vectoring the INTn, and the guest modifies the code
>> stream while the exit is being handled, KVM will compute the incorrect
>> next_rip due to "skipping" the wrong instruction. A future enhancement
>> to make this less awful would be for KVM to detect that the decoded
>> instruction is not the correct INTn and drop the to-be-injected soft
>> event (retrying is a lesser evil compared to shoving the wrong RIP on the
>> exception stack).
>>
>> Reported-by: Maciej S. Szmigiero <[email protected]>
>> Signed-off-by: Sean Christopherson <[email protected]>
>> ---
>> arch/x86/kvm/svm/nested.c | 28 +++++++-
>> arch/x86/kvm/svm/svm.c | 140 +++++++++++++++++++++++++++-----------
>> arch/x86/kvm/svm/svm.h | 6 +-
>> 3 files changed, 130 insertions(+), 44 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
>> index 461c5f247801..0163238aa198 100644
>> --- a/arch/x86/kvm/svm/nested.c
>> +++ b/arch/x86/kvm/svm/nested.c
>> @@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
>> }
>> }
>>
>> +static inline bool is_evtinj_soft(u32 evtinj)
>> +{
>> + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
>> + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
>> +
>> + if (!(evtinj & SVM_EVTINJ_VALID))
>> + return false;
>> +
>> + /*
>> + * Intentionally return false for SOFT events, SVM doesn't yet support
>> + * re-injecting soft interrupts.
>> + */
>> + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
>> +}
>> +
>> static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>> unsigned long vmcb12_rip)
>> {
>> @@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>> else if (boot_cpu_has(X86_FEATURE_NRIPS))
>> vmcb02->control.next_rip = vmcb12_rip;
>>
>> + if (is_evtinj_soft(vmcb02->control.event_inj)) {
>> + svm->soft_int_injected = true;
>> + svm->soft_int_csbase = svm->vmcb->save.cs.base;
>> + svm->soft_int_old_rip = vmcb12_rip;
>> + if (svm->nrips_enabled)
>> + svm->soft_int_next_rip = svm->nested.ctl.next_rip;
>> + else
>> + svm->soft_int_next_rip = vmcb12_rip;
>> + }
>> +
>> vmcb02->control.virt_ext = vmcb01->control.virt_ext &
>> LBR_CTL_ENABLE_MASK;
>> if (svm->lbrv_enabled)
>> @@ -849,6 +874,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>>
>> out_exit_err:
>> svm->nested.nested_run_pending = 0;
>> + svm->soft_int_injected = false;
>>
>> svm->vmcb->control.exit_code = SVM_EXIT_ERR;
>> svm->vmcb->control.exit_code_hi = 0;
>> @@ -1618,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>> nested_copy_vmcb_control_to_cache(svm, ctl);
>>
>> svm_switch_vmcb(svm, &svm->nested.vmcb02);
>> - nested_vmcb02_prepare_control(svm, save->rip);
>> + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);
>
> Is this change intentional?
It looks to me the final code is correct since "svm->vmcb->save"
contains L2 register save, while "save" has L1 register save.
It was the patch 1 from this series that was incorrect in
using "save->rip" here instead.
Thanks,
Maciej
On Thu, Apr 28, 2022, Maciej S. Szmigiero wrote:
> On 28.04.2022 11:37, Maxim Levitsky wrote:
> > On Sat, 2022-04-23 at 02:14 +0000, Sean Christopherson wrote:
> > > @@ -1618,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> > > nested_copy_vmcb_control_to_cache(svm, ctl);
> > > svm_switch_vmcb(svm, &svm->nested.vmcb02);
> > > - nested_vmcb02_prepare_control(svm, save->rip);
> > > + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);
> >
> > Is this change intentional?
>
> It looks to me the final code is correct since "svm->vmcb->save"
> contains L2 register save, while "save" has L1 register save.
>
> It was the patch 1 from this series that was incorrect in
> using "save->rip" here instead.
Yeah, I botched the fixup.
On Sat, 2022-04-23 at 02:14 +0000, Sean Christopherson wrote:
> Re-inject INT3/INTO instead of retrying the instruction if the CPU
> encountered an intercepted exception while vectoring the software
> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
> shadow paging. Retrying the instruction is architecturally wrong, e.g.
> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
> and lack of re-injection also breaks nested virtualization, e.g. if L1
> injects a software exception and vectoring the injected exception
> encounters an exception that is intercepted by L0 but not L1.
>
> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
> RIP may require flowing through the emulator even if NRIPS is supported,
> as the CPU clears next_rip if the VM-Exit is due to an exception other
> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
> deal with this, "skip" the instruction to calculate next_rip (if it's
> not already known), and then unwind the RIP write and any side effects
> (RFLAGS updates).
>
> Save the computed next_rip and use it to re-stuff next_rip if injection
> doesn't complete. This allows KVM to do the right thing if next_rip was
> known prior to injection, e.g. if L1 injects a soft event into L2, and
> there is no backing INTn instruction, e.g. if L1 is injecting an
> arbitrary event.
>
> Note, it's impossible to guarantee architectural correctness given SVM's
> architectural flaws. E.g. if the guest executes INTn (no KVM injection),
> an exit occurs while vectoring the INTn, and the guest modifies the code
> stream while the exit is being handled, KVM will compute the incorrect
> next_rip due to "skipping" the wrong instruction. A future enhancement
> to make this less awful would be for KVM to detect that the decoded
> instruction is not the correct INTn and drop the to-be-injected soft
> event (retrying is a lesser evil compared to shoving the wrong RIP on the
> exception stack).
>
> Reported-by: Maciej S. Szmigiero <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 28 +++++++-
> arch/x86/kvm/svm/svm.c | 140 +++++++++++++++++++++++++++-----------
> arch/x86/kvm/svm/svm.h | 6 +-
> 3 files changed, 130 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 461c5f247801..0163238aa198 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -609,6 +609,21 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
> }
> }
>
> +static inline bool is_evtinj_soft(u32 evtinj)
> +{
> + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
> + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
> +
> + if (!(evtinj & SVM_EVTINJ_VALID))
> + return false;
> +
> + /*
> + * Intentionally return false for SOFT events, SVM doesn't yet support
> + * re-injecting soft interrupts.
> + */
> + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
> +}
> +
> static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> unsigned long vmcb12_rip)
> {
> @@ -677,6 +692,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
> else if (boot_cpu_has(X86_FEATURE_NRIPS))
> vmcb02->control.next_rip = vmcb12_rip;
>
> + if (is_evtinj_soft(vmcb02->control.event_inj)) {
> + svm->soft_int_injected = true;
> + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> + svm->soft_int_old_rip = vmcb12_rip;
> + if (svm->nrips_enabled)
> + svm->soft_int_next_rip = svm->nested.ctl.next_rip;
> + else
> + svm->soft_int_next_rip = vmcb12_rip;
> + }
> +
> vmcb02->control.virt_ext = vmcb01->control.virt_ext &
> LBR_CTL_ENABLE_MASK;
> if (svm->lbrv_enabled)
> @@ -849,6 +874,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>
> out_exit_err:
> svm->nested.nested_run_pending = 0;
> + svm->soft_int_injected = false;
>
> svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> svm->vmcb->control.exit_code_hi = 0;
> @@ -1618,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> nested_copy_vmcb_control_to_cache(svm, ctl);
>
> svm_switch_vmcb(svm, &svm->nested.vmcb02);
> - nested_vmcb02_prepare_control(svm, save->rip);
> + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip);
Is this change intentional?
>
> /*
> * While the nested guest CR3 is already checked and set by
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 14bc4e87437b..8321f9ce5e35 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -342,9 +342,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
>
> }
>
> -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
> + bool commit_side_effects)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> + unsigned long old_rflags;
>
> /*
> * SEV-ES does not expose the next RIP. The RIP update is controlled by
> @@ -359,18 +361,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> }
>
> if (!svm->next_rip) {
> + if (unlikely(!commit_side_effects))
> + old_rflags = svm->vmcb->save.rflags;
> +
> if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
> return 0;
> +
> + if (unlikely(!commit_side_effects))
> + svm->vmcb->save.rflags = old_rflags;
> } else {
> kvm_rip_write(vcpu, svm->next_rip);
> }
>
> done:
> - svm_set_interrupt_shadow(vcpu, 0);
> + if (likely(commit_side_effects))
> + svm_set_interrupt_shadow(vcpu, 0);
>
> return 1;
> }
>
> +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +{
> + return __svm_skip_emulated_instruction(vcpu, true);
> +}
> +
> +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
> +{
> + unsigned long rip, old_rip = kvm_rip_read(vcpu);
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + /*
> + * Due to architectural shortcomings, the CPU doesn't always provide
> + * NextRIP, e.g. if KVM intercepted an exception that occurred while
> + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
> + * the instruction even if NextRIP is supported to acquire the next
> + * RIP so that it can be shoved into the NextRIP field, otherwise
> + * hardware will fail to advance guest RIP during event injection.
> + * Drop the exception/interrupt if emulation fails and effectively
> + * retry the instruction, it's the least awful option. If NRIPS is
> + * in use, the skip must not commit any side effects such as clearing
> + * the interrupt shadow or RFLAGS.RF.
> + */
> + if (!__svm_skip_emulated_instruction(vcpu, !nrips))
> + return -EIO;
> +
> + rip = kvm_rip_read(vcpu);
> +
> + /*
> + * Save the injection information, even when using next_rip, as the
> + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
> + * doesn't complete due to a VM-Exit occurring while the CPU is
> + * vectoring the event. Decoding the instruction isn't guaranteed to
> + * work as there may be no backing instruction, e.g. if the event is
> + * being injected by L1 for L2, or if the guest is patching INT3 into
> + * a different instruction.
> + */
> + svm->soft_int_injected = true;
> + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> + svm->soft_int_old_rip = old_rip;
> + svm->soft_int_next_rip = rip;
> +
> + if (nrips)
> + kvm_rip_write(vcpu, old_rip);
> +
> + if (static_cpu_has(X86_FEATURE_NRIPS))
> + svm->vmcb->control.next_rip = rip;
> +
> + return 0;
> +}
> +
> static void svm_queue_exception(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -380,25 +439,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
>
> kvm_deliver_exception_payload(vcpu);
>
> - if (nr == BP_VECTOR && !nrips) {
> - unsigned long rip, old_rip = kvm_rip_read(vcpu);
> -
> - /*
> - * For guest debugging where we have to reinject #BP if some
> - * INT3 is guest-owned:
> - * Emulate nRIP by moving RIP forward. Will fail if injection
> - * raises a fault that is not intercepted. Still better than
> - * failing in all cases.
> - */
> - (void)svm_skip_emulated_instruction(vcpu);
> - rip = kvm_rip_read(vcpu);
> -
> - if (boot_cpu_has(X86_FEATURE_NRIPS))
> - svm->vmcb->control.next_rip = rip;
> -
> - svm->int3_rip = rip + svm->vmcb->save.cs.base;
> - svm->int3_injected = rip - old_rip;
> - }
> + if (kvm_exception_is_soft(nr) &&
> + svm_update_soft_interrupt_rip(vcpu))
> + return;
>
> svm->vmcb->control.event_inj = nr
> | SVM_EVTINJ_VALID
> @@ -3671,15 +3714,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
> svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
> }
>
> +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
> + int type)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + /*
> + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
> + * associated with the original soft exception/interrupt. next_rip is
> + * cleared on all exits that can occur while vectoring an event, so KVM
> + * needs to manually set next_rip for re-injection. Unlike the !nrips
> + * case below, this needs to be done if and only if KVM is re-injecting
> + * the same event, i.e. if the event is a soft exception/interrupt,
> + * otherwise next_rip is unused on VMRUN.
> + */
> + if (nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> + kvm_exception_is_soft(vector) &&
> + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
> + svm->vmcb->control.next_rip = svm->soft_int_next_rip;
> + /*
> + * If NRIPS isn't enabled, KVM must manually advance RIP prior to
> + * injecting the soft exception/interrupt. That advancement needs to
> + * be unwound if vectoring didn't complete. Note, the new event may
> + * not be the injected event, e.g. if KVM injected an INTn, the INTn
> + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> + * be the reported vectored event, but RIP still needs to be unwound.
> + */
> + else if (!nrips && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
> + kvm_rip_write(vcpu, svm->soft_int_old_rip);
> +}
> +
> static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> u8 vector;
> int type;
> u32 exitintinfo = svm->vmcb->control.exit_int_info;
> - unsigned int3_injected = svm->int3_injected;
> + bool soft_int_injected = svm->soft_int_injected;
>
> - svm->int3_injected = 0;
> + svm->soft_int_injected = false;
>
> /*
> * If we've made progress since setting HF_IRET_MASK, we've
> @@ -3704,17 +3778,8 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
> type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
>
> - /*
> - * If NextRIP isn't enabled, KVM must manually advance RIP prior to
> - * injecting the soft exception/interrupt. That advancement needs to
> - * be unwound if vectoring didn't complete. Note, the new event may
> - * not be the injected event, e.g. if KVM injected an INTn, the INTn
> - * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> - * be the reported vectored event, but RIP still needs to be unwound.
> - */
> - if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> - kvm_is_linear_rip(vcpu, svm->int3_rip))
> - kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
> + if (soft_int_injected)
> + svm_complete_soft_interrupt(vcpu, vector, type);
>
> switch (type) {
> case SVM_EXITINTINFO_TYPE_NMI:
> @@ -3727,13 +3792,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> if (vector == X86_TRAP_VC)
> break;
>
> - /*
> - * In case of software exceptions, do not reinject the vector,
> - * but re-execute the instruction instead.
> - */
> - if (kvm_exception_is_soft(vector))
> - break;
> -
> if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
> u32 err = svm->vmcb->control.exit_int_info_err;
> kvm_requeue_exception_e(vcpu, vector, err);
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 7d97e4d18c8b..6acb494e3598 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -230,8 +230,10 @@ struct vcpu_svm {
> bool nmi_singlestep;
> u64 nmi_singlestep_guest_rflags;
>
> - unsigned int3_injected;
> - unsigned long int3_rip;
> + unsigned long soft_int_csbase;
> + unsigned long soft_int_old_rip;
> + unsigned long soft_int_next_rip;
> + bool soft_int_injected;
>
> /* optional nested SVM features that are enabled for this guest */
> bool nrips_enabled : 1;
Overall looks good, but I need to give it a bit more though to be sure,
I'll wait for next version.
Best regards,
Maxim Levitsky