2022-04-05 01:04:01

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH 5/8] KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction

On Sat, 2022-04-02 at 01:09 +0000, Sean Christopherson wrote:
> Re-inject INT3/INTO instead of retrying the instruction if the CPU
> encountered an intercepted exception while vectoring the software
> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
> shadow paging. Retrying the instruction is architecturally wrong, e.g.
> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
> and lack of re-injection also breaks nested virtualization, e.g. if L1
> injects a software exception and vectoring the injected exception
> encounters an exception that is intercepted by L0 but not L1.
>
> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
> RIP may require flowing through the emulator even if NRIPS is supported,
> as the CPU clears next_rip if the VM-Exit is due to an exception other
> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
> deal with this, "skip" the instruction to calculate next_rip, and then
> unwind the RIP write and any side effects (RFLAGS updates).
>
> Reported-by: Maciej S. Szmigiero <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/svm/svm.c | 111 ++++++++++++++++++++++++++++-------------
> arch/x86/kvm/svm/svm.h | 4 +-
> 2 files changed, 79 insertions(+), 36 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 6ea8f16e39ac..ecc828d6921e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -341,9 +341,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
>
> }
>
> -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
> + bool commit_side_effects)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> + unsigned long old_rflags;
>
> /*
> * SEV-ES does not expose the next RIP. The RIP update is controlled by
> @@ -358,18 +360,71 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> }
>
> if (!svm->next_rip) {
> + if (unlikely(!commit_side_effects))
> + old_rflags = svm->vmcb->save.rflags;
> +
> if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
> return 0;
> +
> + if (unlikely(!commit_side_effects))
> + svm->vmcb->save.rflags = old_rflags;
> } else {
> kvm_rip_write(vcpu, svm->next_rip);
> }
>
> done:
> - svm_set_interrupt_shadow(vcpu, 0);
> + if (likely(commit_side_effects))
> + svm_set_interrupt_shadow(vcpu, 0);
>
> return 1;
> }
>
> +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
> +{
> + return __svm_skip_emulated_instruction(vcpu, true);
> +}
> +
> +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
> +{
> + unsigned long rip, old_rip = kvm_rip_read(vcpu);
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + /*
> + * Due to architectural shortcomings, the CPU doesn't always provide
> + * NextRIP, e.g. if KVM intercepted an exception that occurred while
> + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
> + * the instruction even if NextRIP is supported to acquire the next
> + * RIP so that it can be shoved into the NextRIP field, otherwise
> + * hardware will fail to advance guest RIP during event injection.
> + * Drop the exception/interrupt if emulation fails and effectively
> + * retry the instruction, it's the least awful option. If NRIPS is
> + * in use, the skip must not commit any side effects such as clearing
> + * the interrupt shadow or RFLAGS.RF.
> + */
> + if (!__svm_skip_emulated_instruction(vcpu, !nrips))
> + return -EIO;
> +
> + rip = kvm_rip_read(vcpu);
> +
> + /*
> + * If NextRIP is supported, rewind RIP and update NextRip. If NextRip
> + * isn't supported, keep the result of the skip as the CPU obviously
> + * won't advance RIP, but stash away the injection information so that
> + * RIP can be unwound if injection fails.
> + */
> + if (nrips) {
> + kvm_rip_write(vcpu, old_rip);
> + svm->vmcb->control.next_rip = rip;
> + } else {
> + if (boot_cpu_has(X86_FEATURE_NRIPS))
> + svm->vmcb->control.next_rip = rip;
> +
> + svm->soft_int_linear_rip = rip + svm->vmcb->save.cs.base;
> + svm->soft_int_injected = rip - old_rip;
> + }
> + return 0;
> +}
> +
> static void svm_queue_exception(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -379,25 +434,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
>
> kvm_deliver_exception_payload(vcpu);
>
> - if (nr == BP_VECTOR && !nrips) {
> - unsigned long rip, old_rip = kvm_rip_read(vcpu);
> -
> - /*
> - * For guest debugging where we have to reinject #BP if some
> - * INT3 is guest-owned:
> - * Emulate nRIP by moving RIP forward. Will fail if injection
> - * raises a fault that is not intercepted. Still better than
> - * failing in all cases.
> - */
> - (void)svm_skip_emulated_instruction(vcpu);
> - rip = kvm_rip_read(vcpu);
> -
> - if (boot_cpu_has(X86_FEATURE_NRIPS))
> - svm->vmcb->control.next_rip = rip;
> -
> - svm->int3_rip = rip + svm->vmcb->save.cs.base;
> - svm->int3_injected = rip - old_rip;
> - }
> + if (kvm_exception_is_soft(nr) &&
> + svm_update_soft_interrupt_rip(vcpu))
> + return;
>
> svm->vmcb->control.event_inj = nr
> | SVM_EVTINJ_VALID
> @@ -3676,9 +3715,9 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> u8 vector;
> int type;
> u32 exitintinfo = svm->vmcb->control.exit_int_info;
> - unsigned int3_injected = svm->int3_injected;
> + unsigned soft_int_injected = svm->soft_int_injected;
>
> - svm->int3_injected = 0;
> + svm->soft_int_injected = 0;
>
> /*
> * If we've made progress since setting HF_IRET_MASK, we've
> @@ -3698,6 +3737,18 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> if (!(exitintinfo & SVM_EXITINTINFO_VALID))
> return;
>
> + /*
> + * If NextRIP isn't enabled, KVM must manually advance RIP prior to
> + * injecting the soft exception/interrupt. That advancement needs to
> + * be unwound if vectoring didn't complete. Note, the _new_ event may
> + * not be the injected event, e.g. if KVM injected an INTn, the INTn
> + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> + * be the reported vectored event, but RIP still needs to be unwound.
> + */
> + if (soft_int_injected &&
> + kvm_is_linear_rip(vcpu, to_svm(vcpu)->soft_int_linear_rip))
> + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - soft_int_injected);
> +
> kvm_make_request(KVM_REQ_EVENT, vcpu);
>
> vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
> @@ -3711,9 +3762,9 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> * be the reported vectored event, but RIP still needs to be unwound.
> */
> - if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> - kvm_is_linear_rip(vcpu, svm->int3_rip))
> - kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
> + if (soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> + kvm_is_linear_rip(vcpu, svm->soft_int_linear_rip))
> + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - soft_int_injected);
>
> switch (type) {
> case SVM_EXITINTINFO_TYPE_NMI:
> @@ -3726,14 +3777,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> if (vector == X86_TRAP_VC)
> break;
>
> - /*
> - * In case of software exceptions, do not reinject the vector,
> - * but re-execute the instruction instead. Rewind RIP first
> - * if we emulated INT3 before.
> - */
> - if (kvm_exception_is_soft(vector))
> - break;
> -
> if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
> u32 err = svm->vmcb->control.exit_int_info_err;
> kvm_requeue_exception_e(vcpu, vector, err);
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 47e7427d0395..a770a1c7ddd2 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -230,8 +230,8 @@ struct vcpu_svm {
> bool nmi_singlestep;
> u64 nmi_singlestep_guest_rflags;
>
> - unsigned int3_injected;
> - unsigned long int3_rip;
> + unsigned soft_int_injected;
> + unsigned long soft_int_linear_rip;
>
> /* optional nested SVM features that are enabled for this guest */
> bool nrips_enabled : 1;


I mostly agree with this patch, but think that it doesn't address the original issue that
Maciej wanted to address:

Suppose that there is *no* instruction in L2 code which caused the software exception,
but rather L1 set arbitrary next_rip, and set EVENTINJ to software exception with some vector,
and that injection got interrupted.

I don't think that this code will support this.

I think that svm_complete_interrupts should store next_rip in some field like VMX does
(vcpu->arch.event_exit_inst_len).

That field also should be migrated, or we must prove that it works anyway.
E.g., what happens when we try to inject an event,
the injection is interrupted by another exception, and then we migrate?

Best regards,
Maxim Levitsky



2022-04-05 02:07:14

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 5/8] KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction

On Mon, Apr 04, 2022, Maxim Levitsky wrote:
> On Sat, 2022-04-02 at 01:09 +0000, Sean Christopherson wrote:
> > Re-inject INT3/INTO instead of retrying the instruction if the CPU
> > encountered an intercepted exception while vectoring the software
> > exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
> > shadow paging. Retrying the instruction is architecturally wrong, e.g.
> > will result in a spurious #DB if there's a code breakpoint on the INT3/O,
> > and lack of re-injection also breaks nested virtualization, e.g. if L1
> > injects a software exception and vectoring the injected exception
> > encounters an exception that is intercepted by L0 but not L1.
> >
> > Due to, ahem, deficiencies in the SVM architecture, acquiring the next
> > RIP may require flowing through the emulator even if NRIPS is supported,
> > as the CPU clears next_rip if the VM-Exit is due to an exception other
> > than "exceptions caused by the INT3, INTO, and BOUND instructions". To
> > deal with this, "skip" the instruction to calculate next_ript, and then
> > unwind the RIP write and any side effects (RFLAGS updates).

...

> > @@ -3698,6 +3737,18 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> > if (!(exitintinfo & SVM_EXITINTINFO_VALID))
> > return;
> >
> > + /*
> > + * If NextRIP isn't enabled, KVM must manually advance RIP prior to
> > + * injecting the soft exception/interrupt. That advancement needs to
> > + * be unwound if vectoring didn't complete. Note, the _new_ event may
> > + * not be the injected event, e.g. if KVM injected an INTn, the INTn
> > + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> > + * be the reported vectored event, but RIP still needs to be unwound.
> > + */
> > + if (soft_int_injected &&
> > + kvm_is_linear_rip(vcpu, to_svm(vcpu)->soft_int_linear_rip))
> > + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - soft_int_injected);

Doh, I botched my last minute rebase. This is duplicate code and needs to be
dropped.

> > +
> > kvm_make_request(KVM_REQ_EVENT, vcpu);
> >
> > vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
> > @@ -3711,9 +3762,9 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> > * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
> > * be the reported vectored event, but RIP still needs to be unwound.
> > */
> > - if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> > - kvm_is_linear_rip(vcpu, svm->int3_rip))
> > - kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
> > + if (soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> > + kvm_is_linear_rip(vcpu, svm->soft_int_linear_rip))
> > + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - soft_int_injected);
> >
> > switch (type) {
> > case SVM_EXITINTINFO_TYPE_NMI:
> > @@ -3726,14 +3777,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
> > if (vector == X86_TRAP_VC)
> > break;
> >
> > - /*
> > - * In case of software exceptions, do not reinject the vector,
> > - * but re-execute the instruction instead. Rewind RIP first
> > - * if we emulated INT3 before.
> > - */
> > - if (kvm_exception_is_soft(vector))
> > - break;
> > -
> > if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
> > u32 err = svm->vmcb->control.exit_int_info_err;
> > kvm_requeue_exception_e(vcpu, vector, err);
> > diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> > index 47e7427d0395..a770a1c7ddd2 100644
> > --- a/arch/x86/kvm/svm/svm.h
> > +++ b/arch/x86/kvm/svm/svm.h
> > @@ -230,8 +230,8 @@ struct vcpu_svm {
> > bool nmi_singlestep;
> > u64 nmi_singlestep_guest_rflags;
> >
> > - unsigned int3_injected;
> > - unsigned long int3_rip;
> > + unsigned soft_int_injected;
> > + unsigned long soft_int_linear_rip;
> >
> > /* optional nested SVM features that are enabled for this guest */
> > bool nrips_enabled : 1;
>
>
> I mostly agree with this patch, but think that it doesn't address the
> original issue that Maciej wanted to address:
>
> Suppose that there is *no* instruction in L2 code which caused the software
> exception, but rather L1 set arbitrary next_rip, and set EVENTINJ to software
> exception with some vector, and that injection got interrupted.
>
> I don't think that this code will support this.

Argh, you're right. Maciej's selftest injects without an instruction, but it doesn't
configure the scenario where that injection fails due to an exception+VM-Exit that
isn't intercepted by L1 and is handled by L0. The event_inj test gets the coverage
for the latter, but always has a backing instruction.

> I think that svm_complete_interrupts should store next_rip it in some field
> like VMX does (vcpu->arch.event_exit_inst_len).

Yeah. The ugly part is that because next_rip is guaranteed to be cleared on exit
(the exit is guaranteed to be due to a fault-like exception), KVM has to snapshot
next_rip during the "original" injection and use the linear_rip matching heuristic
to detect this scenario.

> That field also should be migrated, or we must prove that it works anyway.
> E.g, what happens when we tried to inject event,
> injection was interrupted by other exception, and then we migrate?

Ya, should Just Work if control.next_rip is used to cache the next rip.

Handling this doesn't seem to be too awful (haven't tested yet), it's largely the
same logic as the existing !nrips code.

In svm_update_soft_interrupt_rip(), snapshot all information regardless of whether
or not nrips is enabled:

svm->soft_int_injected = true;
svm->soft_int_csbase = svm->vmcb->save.cs.base;
svm->soft_int_old_rip = old_rip;
svm->soft_int_next_rip = rip;

if (nrips)
kvm_rip_write(vcpu, old_rip);

if (static_cpu_has(X86_FEATURE_NRIPS))
svm->vmcb->control.next_rip = rip;

and then in svm_complete_interrupts(), change the linear RIP matching code to look
for the old rip in the nrips case and stuff svm->vmcb->control.next_rip on match.

bool soft_int_injected = svm->soft_int_injected;
unsigned soft_int_rip;

svm->soft_int_injected = false;

if (soft_int_injected) {
if (nrips)
soft_int_rip = svm->soft_int_old_rip;
else
soft_int_rip = svm->soft_int_next_rip;
}

...

if soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
kvm_is_linear_rip(vcpu, soft_int_rip + svm->soft_int_csbase)) {
if (nrips)
svm->vmcb->control.next_rip = svm->soft_int_next_rip;
else
kvm_rip_write(vcpu, svm->soft_int_old_rip);
}



2022-04-05 02:14:25

by Maciej S. Szmigiero

[permalink] [raw]
Subject: Re: [PATCH 5/8] KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction

On 4.04.2022 18:49, Sean Christopherson wrote:
> On Mon, Apr 04, 2022, Maxim Levitsky wrote:
>> On Sat, 2022-04-02 at 01:09 +0000, Sean Christopherson wrote:
>>> Re-inject INT3/INTO instead of retrying the instruction if the CPU
>>> encountered an intercepted exception while vectoring the software
>>> exception, e.g. if vectoring INT3 encounters a #PF and KVM is using
>>> shadow paging. Retrying the instruction is architecturally wrong, e.g.
>>> will result in a spurious #DB if there's a code breakpoint on the INT3/O,
>>> and lack of re-injection also breaks nested virtualization, e.g. if L1
>>> injects a software exception and vectoring the injected exception
>>> encounters an exception that is intercepted by L0 but not L1.
>>>
>>> Due to, ahem, deficiencies in the SVM architecture, acquiring the next
>>> RIP may require flowing through the emulator even if NRIPS is supported,
>>> as the CPU clears next_rip if the VM-Exit is due to an exception other
>>> than "exceptions caused by the INT3, INTO, and BOUND instructions". To
>>> deal with this, "skip" the instruction to calculate next_ript, and then
>>> unwind the RIP write and any side effects (RFLAGS updates).
>
> ...
>
(..)
>>> +
>>> kvm_make_request(KVM_REQ_EVENT, vcpu);
>>>
>>> vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
>>> @@ -3711,9 +3762,9 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
>>> * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
>>> * be the reported vectored event, but RIP still needs to be unwound.
>>> */
>>> - if (int3_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
>>> - kvm_is_linear_rip(vcpu, svm->int3_rip))
>>> - kvm_rip_write(vcpu, kvm_rip_read(vcpu) - int3_injected);
>>> + if (soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
>>> + kvm_is_linear_rip(vcpu, svm->soft_int_linear_rip))
>>> + kvm_rip_write(vcpu, kvm_rip_read(vcpu) - soft_int_injected);
>>>
>>> switch (type) {
>>> case SVM_EXITINTINFO_TYPE_NMI:
>>> @@ -3726,14 +3777,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
>>> if (vector == X86_TRAP_VC)
>>> break;
>>>
>>> - /*
>>> - * In case of software exceptions, do not reinject the vector,
>>> - * but re-execute the instruction instead. Rewind RIP first
>>> - * if we emulated INT3 before.
>>> - */
>>> - if (kvm_exception_is_soft(vector))
>>> - break;
>>> -
>>> if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
>>> u32 err = svm->vmcb->control.exit_int_info_err;
>>> kvm_requeue_exception_e(vcpu, vector, err);
>>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>>> index 47e7427d0395..a770a1c7ddd2 100644
>>> --- a/arch/x86/kvm/svm/svm.h
>>> +++ b/arch/x86/kvm/svm/svm.h
>>> @@ -230,8 +230,8 @@ struct vcpu_svm {
>>> bool nmi_singlestep;
>>> u64 nmi_singlestep_guest_rflags;
>>>
>>> - unsigned int3_injected;
>>> - unsigned long int3_rip;
>>> + unsigned soft_int_injected;
>>> + unsigned long soft_int_linear_rip;
>>>
>>> /* optional nested SVM features that are enabled for this guest */
>>> bool nrips_enabled : 1;
>>
>>
>> I mostly agree with this patch, but think that it doesn't address the
>> original issue that Maciej wanted to address:
>>
>> Suppose that there is *no* instruction in L2 code which caused the software
>> exception, but rather L1 set arbitrary next_rip, and set EVENTINJ to software
>> exception with some vector, and that injection got interrupted.
>>
>> I don't think that this code will support this.
>
> Argh, you're right. Maciej's selftest injects without an instruction, but it doesn't
> configure the scenario where that injection fails due to an exception+VM-Exit that
> isn't intercepted by L1 and is handled by L0. The event_inj test gets the coverage
> for the latter, but always has a backing instruction.
>
>> I think that svm_complete_interrupts should store next_rip it in some field
>> like VMX does (vcpu->arch.event_exit_inst_len).
>
> Yeah. The ugly part is that because next_rip is guaranteed to be cleared on exit
> (the exit is gauranteed to be due to a fault-like exception), KVM has to snapshot
> next_rip during the "original" injection and use the linear_rip matching heuristic
> to detect this scenario.
>
>> That field also should be migrated, or we must prove that it works anyway.
>> E.g, what happens when we tried to inject event,
>> injection was interrupted by other exception, and then we migrate?
>
> Ya, should Just Work if control.next_rip is used to cache the next rip.
>
> Handling this doesn't seem to be too awful (haven't tested yet), it's largely the
> same logic as the existing !nrips code.
>
> In svm_update_soft_interrupt_rip(), snapshot all information regardless of whether
> or not nrips is enabled:
>
> svm->soft_int_injected = true;
> svm->soft_int_csbase = svm->vmcb->save.cs.base;
> svm->soft_int_old_rip = old_rip;
> svm->soft_int_next_rip = rip;
>
> if (nrips)
> kvm_rip_write(vcpu, old_rip);
>
> if (static_cpu_has(X86_FEATURE_NRIPS))
> svm->vmcb->control.next_rip = rip;
>
> and then in svm_complete_interrupts(), change the linear RIP matching code to look
> for the old rip in the nrips case and stuff svm->vmcb->control.next_rip on match.
>
> bool soft_int_injected = svm->soft_int_injected;
> unsigned soft_int_rip;
>
> svm->soft_int_injected = false;
>
> if (soft_int_injected) {
> if (nrips)
> soft_int_rip = svm->soft_int_old_rip;
> else
> soft_int_rip = svm->soft_int_next_rip;
> }
>
> ...
>
> if soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> kvm_is_linear_rip(vcpu, soft_int_rip + svm->soft_int_csbase)) {
> if (nrips)
> svm->vmcb->control.next_rip = svm->soft_int_next_rip;
> else
> kvm_rip_write(vcpu, svm->soft_int_old_rip);
> }
>
>
>

Despite what the svm_update_soft_interrupt_rip() name might suggest, this
handles only *soft exceptions*, not *soft interrupts*
(which are injected by svm_inject_irq() and also need proper next_rip
management).

Also, I'm not sure that even the proposed updated code above will
actually restore the L1-requested next_rip correctly on L1 -> L2
re-injection (will review once the full version is available).

Thanks,
Maciej

2022-04-06 16:42:28

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 5/8] KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction

On Mon, Apr 04, 2022, Maciej S. Szmigiero wrote:
> On 4.04.2022 18:49, Sean Christopherson wrote:
> > On Mon, Apr 04, 2022, Maxim Levitsky wrote:
> > In svm_update_soft_interrupt_rip(), snapshot all information regardless of whether
> > or not nrips is enabled:
> >
> > svm->soft_int_injected = true;
> > svm->soft_int_csbase = svm->vmcb->save.cs.base;
> > svm->soft_int_old_rip = old_rip;
> > svm->soft_int_next_rip = rip;
> >
> > if (nrips)
> > kvm_rip_write(vcpu, old_rip);
> >
> > if (static_cpu_has(X86_FEATURE_NRIPS))
> > svm->vmcb->control.next_rip = rip;
> >
> > and then in svm_complete_interrupts(), change the linear RIP matching code to look
> > for the old rip in the nrips case and stuff svm->vmcb->control.next_rip on match.
> >
> > bool soft_int_injected = svm->soft_int_injected;
> > unsigned soft_int_rip;
> >
> > svm->soft_int_injected = false;
> >
> > if (soft_int_injected) {
> > if (nrips)
> > soft_int_rip = svm->soft_int_old_rip;
> > else
> > soft_int_rip = svm->soft_int_next_rip;
> > }
> >
> > ...
> >
> > if soft_int_injected && type == SVM_EXITINTINFO_TYPE_EXEPT &&
> > kvm_is_linear_rip(vcpu, soft_int_rip + svm->soft_int_csbase)) {
> > if (nrips)
> > svm->vmcb->control.next_rip = svm->soft_int_next_rip;
> > else
> > kvm_rip_write(vcpu, svm->soft_int_old_rip);
> > }
> >
> >
> >
>
> Despite what the svm_update_soft_interrupt_rip() name might suggest this
> handles only *soft exceptions*, not *soft interrupts*
> (which are injected by svm_inject_irq() and also need proper next_rip
> management).

Yeah, soft interrupts are handled in the next patch. I couldn't come up with a
less awful name.

> Also, I'm not sure that even the proposed updated code above will
> actually restore the L1-requested next_rip correctly on L1 -> L2
> re-injection (will review once the full version is available).

Spoiler alert, it doesn't. Save yourself the review time. :-)

The missing piece is stashing away the injected event on nested VMRUN. Those
events don't get routed through the normal interrupt/exception injection code and
so the next_rip info is lost on the subsequent #NPF.

Treating soft interrupts/exceptions like they were injected by KVM (which they
are, technically) works and doesn't seem too gross. E.g. when prepping vmcb02

if (svm->nrips_enabled)
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;

if (is_evtinj_soft(vmcb02->control.event_inj)) {
svm->soft_int_injected = true;
svm->soft_int_csbase = svm->vmcb->save.cs.base;
svm->soft_int_old_rip = vmcb12_rip;
if (svm->nrips_enabled)
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
else
svm->soft_int_next_rip = vmcb12_rip;
}

And then the VMRUN error path just needs to clear soft_int_injected.