2022-03-01 17:09:50

by Maxim Levitsky

[permalink] [raw]
Subject: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

Allow L1 to use these settings if L0 disables PAUSE interception
(AKA cpu_pm=on)

Signed-off-by: Maxim Levitsky <[email protected]>
---
arch/x86/kvm/svm/nested.c | 6 ++++++
arch/x86/kvm/svm/svm.c | 17 +++++++++++++++++
arch/x86/kvm/svm/svm.h | 2 ++
3 files changed, 25 insertions(+)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 37510cb206190..4cb0bc49986d5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
if (!nested_vmcb_needs_vls_intercept(svm))
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

+ if (svm->pause_filter_enabled)
+ svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
+
+ if (svm->pause_threshold_enabled)
+ svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
+
nested_svm_transition_tlb_flush(vcpu);

/* Enter Guest-Mode */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6a571eed32ef4..52198e63c5fc4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)

svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);

+ if (kvm_pause_in_guest(vcpu->kvm)) {
+ svm->pause_filter_enabled = pause_filter_count > 0 &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+ svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+ } else {
+ svm->pause_filter_enabled = false;
+ svm->pause_threshold_enabled = false;
+ }
+
svm_recalc_instruction_intercepts(vcpu, svm);

/* For sev guests, the memory encryption bit is not reserved in CR3. */
@@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
if (vls)
kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);

+ if (pause_filter_count)
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (pause_filter_thresh)
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a3c93f9c02847..6fa81eb3ffb78 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -234,6 +234,8 @@ struct vcpu_svm {
bool tsc_scaling_enabled : 1;
bool lbrv_enabled : 1;
bool v_vmload_vmsave_enabled : 1;
+ bool pause_filter_enabled : 1;
+ bool pause_threshold_enabled : 1;

u32 ldr_reg;
u32 dfr_reg;
--
2.26.3


2022-03-09 14:23:16

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On 3/1/22 15:36, Maxim Levitsky wrote:
> Allow L1 to use these settings if L0 disables PAUSE interception
> (AKA cpu_pm=on)
>
> Signed-off-by: Maxim Levitsky <[email protected]>
> ---
> arch/x86/kvm/svm/nested.c | 6 ++++++
> arch/x86/kvm/svm/svm.c | 17 +++++++++++++++++
> arch/x86/kvm/svm/svm.h | 2 ++
> 3 files changed, 25 insertions(+)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index 37510cb206190..4cb0bc49986d5 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> if (!nested_vmcb_needs_vls_intercept(svm))
> svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
>
> + if (svm->pause_filter_enabled)
> + svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
> +
> + if (svm->pause_threshold_enabled)
> + svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;

I think this should be

if (kvm_pause_in_guest(vcpu->kvm)) {
/* copy from VMCB12 if guest has CPUID, else set to 0 */
} else {
/* copy from VMCB01, unconditionally */
}

and likewise it should be copied back to VMCB01 unconditionally on
vmexit if !kvm_pause_in_guest(vcpu->kvm).

> nested_svm_transition_tlb_flush(vcpu);
>
> /* Enter Guest-Mode */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 6a571eed32ef4..52198e63c5fc4 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>
> svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
>
> + if (kvm_pause_in_guest(vcpu->kvm)) {
> + svm->pause_filter_enabled = pause_filter_count > 0 &&
> + guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
> +
> + svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
> + guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);

Why only if the module parameters are >0? The module parameter is
unused if pause-in-guest is active.

> + } else {
> + svm->pause_filter_enabled = false;
> + svm->pause_threshold_enabled = false;
> + }
> +
> svm_recalc_instruction_intercepts(vcpu, svm);
>
> /* For sev guests, the memory encryption bit is not reserved in CR3. */
> @@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
> if (vls)
> kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
>
> + if (pause_filter_count)
> + kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
> +
> + if (pause_filter_thresh)
> + kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

Likewise, this should be set using just boot_cpu_has, not the module
parameters.

Paolo

> /* Nested VM can receive #VMEXIT instead of triggering #GP */
> kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
> }
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index a3c93f9c02847..6fa81eb3ffb78 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -234,6 +234,8 @@ struct vcpu_svm {
> bool tsc_scaling_enabled : 1;
> bool lbrv_enabled : 1;
> bool v_vmload_vmsave_enabled : 1;
> + bool pause_filter_enabled : 1;
> + bool pause_threshold_enabled : 1;
>
> u32 ldr_reg;
> u32 dfr_reg;

2022-03-09 19:34:20

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Tue, Mar 1, 2022 at 6:37 AM Maxim Levitsky <[email protected]> wrote:
>
> Allow L1 to use these settings if L0 disables PAUSE interception
> (AKA cpu_pm=on)
>
> Signed-off-by: Maxim Levitsky <[email protected]>

I didn't think pause filtering was virtualizable, since the value of
the internal counter isn't exposed on VM-exit.

On bare metal, for instance, assuming the hypervisor doesn't intercept
CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
the filter count set to 2.

1:
pause
cpuid
jmp 1

Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
iteration, and when L0 resumes L2, the internal counter will be set to
2 again. L1 will never see a PAUSE #VMEXIT.

How do you handle this?

2022-03-09 21:51:24

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On 3/9/22 19:35, Jim Mattson wrote:
> I didn't think pause filtering was virtualizable, since the value of
> the internal counter isn't exposed on VM-exit.
>
> On bare metal, for instance, assuming the hypervisor doesn't intercept
> CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> the filter count set to 2.
>
> 1:
> pause
> cpuid
> jmp 1
>
> Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> iteration, and when L0 resumes L2, the internal counter will be set to
> 2 again. L1 will never see a PAUSE #VMEXIT.
>
> How do you handle this?
>

I would expect that the same would happen on an SMI or a host interrupt.

1:
pause
outl al, 0xb2
jmp 1

In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
having a partial implementation would be better than disabling it
altogether.

Paolo

2022-03-10 04:21:11

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <[email protected]> wrote:
>
> On 3/9/22 19:35, Jim Mattson wrote:
> > I didn't think pause filtering was virtualizable, since the value of
> > the internal counter isn't exposed on VM-exit.
> >
> > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > the filter count set to 2.
> >
> > 1:
> > pause
> > cpuid
> > jmp 1
> >
> > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > iteration, and when L0 resumes L2, the internal counter will be set to
> > 2 again. L1 will never see a PAUSE #VMEXIT.
> >
> > How do you handle this?
> >
>
> I would expect that the same would happen on an SMI or a host interrupt.
>
> 1:
> pause
> outl al, 0xb2
> jmp 1
>
> In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> having a partial implementation would be better than disabling it
> altogether.

Indeed, the APM does say, "Certain events, including SMI, can cause
the internal count to be reloaded from the VMCB." However, expanding
that set of events so much that some pause loops will *never* trigger
a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
filter may not be triggered, it could always choose to exit on every
PAUSE.

Having a partial implementation is only better than disabling it
altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
L0.

2022-03-21 23:39:37

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Mon, Mar 21, 2022 at 3:11 PM Maxim Levitsky <[email protected]> wrote:
>
> On Mon, 2022-03-21 at 14:59 -0700, Jim Mattson wrote:
> > On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <[email protected]> wrote:
> > > On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > > > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <[email protected]> wrote:
> > > > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > > > I didn't think pause filtering was virtualizable, since the value of
> > > > > > the internal counter isn't exposed on VM-exit.
> > > > > >
> > > > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > > > the filter count set to 2.
> > > > > >
> > > > > > 1:
> > > > > > pause
> > > > > > cpuid
> > > > > > jmp 1
> > > > > >
> > > > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > > > >
> > > > > > How do you handle this?
> > > > > >
> > > > >
> > > > > I would expect that the same would happen on an SMI or a host interrupt.
> > > > >
> > > > > 1:
> > > > > pause
> > > > > outl al, 0xb2
> > > > > jmp 1
> > > > >
> > > > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > > > having a partial implementation would be better than disabling it
> > > > > altogether.
> > > >
> > > > Indeed, the APM does say, "Certain events, including SMI, can cause
> > > > the internal count to be reloaded from the VMCB." However, expanding
> > > > that set of events so much that some pause loops will *never* trigger
> > > > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > > > filter may not be triggered, it could always choose to exit on every
> > > > PAUSE.
> > > >
> > > > Having a partial implementation is only better than disabling it
> > > > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > > > L0.
> > > >
> > >
> > > Hi!
> > >
> > > You bring up a very valid point, which I didn't think about.
> > >
> > > However after thinking about this, I think that in practice,
> > > this isn't a show stopper problem for exposing this feature to the guest.
> > >
> > >
> > > This is what I am thinking:
> > >
> > > First lets assume that the L2 is malicious. In this case no doubt
> > > it can craft such a loop which will not VMexit on PAUSE.
> > > But that isn't a problem - instead of this guest could have just used NOP
> > > which is not possible to intercept anyway - no harm is done.
> > >
> > > Now lets assume a non malicious L2:
> > >
> > >
> > > First of all the problem can only happen when a VM exit is intercepted by L0,
> > > and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> > > likely to intercept both CPUID and IO port access. It is also highly unlikely
> > > to allow L2 direct access to L1's mmio ranges.
> > >
> > > Overall there are very few cases of deterministic vm exit which is intercepted
> > > by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> > > which is not different much from not catching it because of not suitable
> > > thresholds.
> > >
> > > Also note that this is an optimization only - due to count and threshold,
> > > it is not guaranteed to catch all pause loops - in fact hypervisor has
> > > to guess these values, and update them in attempt to catch as many such
> > > loops as it can.
> > >
> > > I think overall it is OK to expose that feature to the guest
> > > and it should even improve performance in some cases - currently
> > > at least nested KVM intercepts every PAUSE otherwise.
> >
> > Can I at least request that this behavior be documented as a KVM
> > virtual CPU erratum?
>
> 100%. Do you have a pointer where to document it?

I think this will be the first KVM virtual CPU erratum documented,
though there are plenty of others that I'd like to see documented
(e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
merged PMU counters are only 48 bits wide, etc.).

Maybe Paolo has some ideas?

> Best regards,
> Maxim Levitsky
>
> >
> > > Best regards,
> > > Maxim Levitsky
> > >
> > >
> > >
> > >
>
>

2022-03-21 23:40:11

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Mon, 2022-03-21 at 14:59 -0700, Jim Mattson wrote:
> On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <[email protected]> wrote:
> > On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <[email protected]> wrote:
> > > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > > I didn't think pause filtering was virtualizable, since the value of
> > > > > the internal counter isn't exposed on VM-exit.
> > > > >
> > > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > > the filter count set to 2.
> > > > >
> > > > > 1:
> > > > > pause
> > > > > cpuid
> > > > > jmp 1
> > > > >
> > > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > > >
> > > > > How do you handle this?
> > > > >
> > > >
> > > > I would expect that the same would happen on an SMI or a host interrupt.
> > > >
> > > > 1:
> > > > pause
> > > > outl al, 0xb2
> > > > jmp 1
> > > >
> > > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > > having a partial implementation would be better than disabling it
> > > > altogether.
> > >
> > > Indeed, the APM does say, "Certain events, including SMI, can cause
> > > the internal count to be reloaded from the VMCB." However, expanding
> > > that set of events so much that some pause loops will *never* trigger
> > > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > > filter may not be triggered, it could always choose to exit on every
> > > PAUSE.
> > >
> > > Having a partial implementation is only better than disabling it
> > > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > > L0.
> > >
> >
> > Hi!
> >
> > You bring up a very valid point, which I didn't think about.
> >
> > However after thinking about this, I think that in practice,
> > this isn't a show stopper problem for exposing this feature to the guest.
> >
> >
> > This is what I am thinking:
> >
> > First lets assume that the L2 is malicious. In this case no doubt
> > it can craft such a loop which will not VMexit on PAUSE.
> > But that isn't a problem - instead of this guest could have just used NOP
> > which is not possible to intercept anyway - no harm is done.
> >
> > Now lets assume a non malicious L2:
> >
> >
> > First of all the problem can only happen when a VM exit is intercepted by L0,
> > and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> > likely to intercept both CPUID and IO port access. It is also highly unlikely
> > to allow L2 direct access to L1's mmio ranges.
> >
> > Overall there are very few cases of deterministic vm exit which is intercepted
> > by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> > which is not different much from not catching it because of not suitable
> > thresholds.
> >
> > Also note that this is an optimization only - due to count and threshold,
> > it is not guaranteed to catch all pause loops - in fact hypervisor has
> > to guess these values, and update them in attempt to catch as many such
> > loops as it can.
> >
> > I think overall it is OK to expose that feature to the guest
> > and it should even improve performance in some cases - currently
> > at least nested KVM intercepts every PAUSE otherwise.
>
> Can I at least request that this behavior be documented as a KVM
> virtual CPU erratum?

100%. Do you have a pointer where to document it?

Best regards,
Maxim Levitsky

>
> > Best regards,
> > Maxim Levitsky
> >
> >
> >
> >


2022-03-21 23:40:30

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Mon, Mar 21, 2022 at 2:36 PM Maxim Levitsky <[email protected]> wrote:
>
> On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> > On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <[email protected]> wrote:
> > > On 3/9/22 19:35, Jim Mattson wrote:
> > > > I didn't think pause filtering was virtualizable, since the value of
> > > > the internal counter isn't exposed on VM-exit.
> > > >
> > > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > > the filter count set to 2.
> > > >
> > > > 1:
> > > > pause
> > > > cpuid
> > > > jmp 1
> > > >
> > > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > > >
> > > > How do you handle this?
> > > >
> > >
> > > I would expect that the same would happen on an SMI or a host interrupt.
> > >
> > > 1:
> > > pause
> > > outl al, 0xb2
> > > jmp 1
> > >
> > > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > > having a partial implementation would be better than disabling it
> > > altogether.
> >
> > Indeed, the APM does say, "Certain events, including SMI, can cause
> > the internal count to be reloaded from the VMCB." However, expanding
> > that set of events so much that some pause loops will *never* trigger
> > a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> > filter may not be triggered, it could always choose to exit on every
> > PAUSE.
> >
> > Having a partial implementation is only better than disabling it
> > altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> > L0.
> >
>
> Hi!
>
> You bring up a very valid point, which I didn't think about.
>
> However after thinking about this, I think that in practice,
> this isn't a show stopper problem for exposing this feature to the guest.
>
>
> This is what I am thinking:
>
> First lets assume that the L2 is malicious. In this case no doubt
> it can craft such a loop which will not VMexit on PAUSE.
> But that isn't a problem - instead of this guest could have just used NOP
> which is not possible to intercept anyway - no harm is done.
>
> Now lets assume a non malicious L2:
>
>
> First of all the problem can only happen when a VM exit is intercepted by L0,
> and not by L1. Both above cases usually don't pass this criteria since L1 is highly
> likely to intercept both CPUID and IO port access. It is also highly unlikely
> to allow L2 direct access to L1's mmio ranges.
>
> Overall there are very few cases of deterministic vm exit which is intercepted
> by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
> which is not different much from not catching it because of not suitable
> thresholds.
>
> Also note that this is an optimization only - due to count and threshold,
> it is not guaranteed to catch all pause loops - in fact hypervisor has
> to guess these values, and update them in attempt to catch as many such
> loops as it can.
>
> I think overall it is OK to expose that feature to the guest
> and it should even improve performance in some cases - currently
> at least nested KVM intercepts every PAUSE otherwise.

Can I at least request that this behavior be documented as a KVM
virtual CPU erratum?

>
> Best regards,
> Maxim Levitsky
>
>
>
>

2022-03-21 23:46:29

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Wed, 2022-03-09 at 11:07 -0800, Jim Mattson wrote:
> On Wed, Mar 9, 2022 at 10:47 AM Paolo Bonzini <[email protected]> wrote:
> > On 3/9/22 19:35, Jim Mattson wrote:
> > > I didn't think pause filtering was virtualizable, since the value of
> > > the internal counter isn't exposed on VM-exit.
> > >
> > > On bare metal, for instance, assuming the hypervisor doesn't intercept
> > > CPUID, the following code would quickly trigger a PAUSE #VMEXIT with
> > > the filter count set to 2.
> > >
> > > 1:
> > > pause
> > > cpuid
> > > jmp 1
> > >
> > > Since L0 intercepts CPUID, however, L2 will exit to L0 on each loop
> > > iteration, and when L0 resumes L2, the internal counter will be set to
> > > 2 again. L1 will never see a PAUSE #VMEXIT.
> > >
> > > How do you handle this?
> > >
> >
> > I would expect that the same would happen on an SMI or a host interrupt.
> >
> > 1:
> > pause
> > outl al, 0xb2
> > jmp 1
> >
> > In general a PAUSE vmexit will mostly benefit the VM that is pausing, so
> > having a partial implementation would be better than disabling it
> > altogether.
>
> Indeed, the APM does say, "Certain events, including SMI, can cause
> the internal count to be reloaded from the VMCB." However, expanding
> that set of events so much that some pause loops will *never* trigger
> a #VMEXIT seems problematic. If the hypervisor knew that the PAUSE
> filter may not be triggered, it could always choose to exit on every
> PAUSE.
>
> Having a partial implementation is only better than disabling it
> altogether if the L2 pause loop doesn't contain a hidden #VMEXIT to
> L0.
>

Hi!

You bring up a very valid point, which I didn't think about.

However after thinking about this, I think that in practice,
this isn't a show stopper problem for exposing this feature to the guest.


This is what I am thinking:

First lets assume that the L2 is malicious. In this case no doubt
it can craft such a loop which will not VMexit on PAUSE.
But that isn't a problem - instead of this guest could have just used NOP
which is not possible to intercept anyway - no harm is done.

Now lets assume a non malicious L2:


First of all the problem can only happen when a VM exit is intercepted by L0,
and not by L1. Both above cases usually don't pass this criteria since L1 is highly
likely to intercept both CPUID and IO port access. It is also highly unlikely
to allow L2 direct access to L1's mmio ranges.

Overall there are very few cases of deterministic vm exit which is intercepted
by L0 but not L1. If that happens then L1 will not catch the PAUSE loop,
which is not different much from not catching it because of not suitable
thresholds.

Also note that this is an optimization only - due to count and threshold,
it is not guaranteed to catch all pause loops - in fact hypervisor has
to guess these values, and update them in attempt to catch as many such
loops as it can.

I think overall it is OK to expose that feature to the guest
and it should even improve performance in some cases - currently
at least nested KVM intercepts every PAUSE otherwise.

Best regards,
Maxim Levitsky




2022-03-22 12:17:18

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On 3/21/22 23:41, Jim Mattson wrote:
>> 100%. Do you have a pointer where to document it?
> I think this will be the first KVM virtual CPU erratum documented,
> though there are plenty of others that I'd like to see documented
> (e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
> merged PMU counters are only 48 bits wide, etc.).
>
> Maybe Paolo has some ideas?

So let's document them, that's a great idea. I can help writing them
down if you have a pointer to prior email discussions. I'll send a
skeleton.

Paolo

2022-03-22 14:14:56

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Tue, 2022-03-22 at 11:12 +0100, Paolo Bonzini wrote:
> On 3/21/22 23:41, Jim Mattson wrote:
> > > 100%. Do you have a pointer where to document it?
> > I think this will be the first KVM virtual CPU erratum documented,
> > though there are plenty of others that I'd like to see documented
> > (e.g. nVMX processes posted interrupts on emulated VM-entry, AMD's
> > merged PMU counters are only 48 bits wide, etc.).
> >
> > Maybe Paolo has some ideas?
>
> So let's document them, that's a great idea. I can help writing them
> down if you have a pointer to prior email discussions. I'll send a
> skeleton.
>
> Paolo
>

Things that I know that don't work 100% correctly in KVM:

* Relocating the APIC base. Changing the APIC ID is also likely broken, at least in some
cases, and surely is with AVIC enabled.

also likely some other obscure bits of the in-kernel emulation of APIC/IO apic/PIC/etc
don't work correctly.

* Emulator is not complete, so if you do unsupported instruction
on mmio, it should fail.
Also without unrestricted guest, emulator has to be used sometimes
for arbitrary code so it will fail fast.

* Shadow mmu doesn't fully reflect real tlb, as tlb is usually
not shared between cpus.
Also KVM's shadow mmu is more speculative vs real mmu - breaks old guests like win9x.

Also no way to disable 1GB pages when NPT/EPT is enabled, since guest paging doesn't
trap into the KVM.

* Various minor issues with debug based on single stepping / DRs, etc,
most of which I don't know well. Most of these can be fixed but it low priority,
and I have seen many fixes in this area recently.
Also proper support for nested monitor trap likely broken.

* Various msrs are hardcoded/not supported - not much specific info on this.
In particular no real support for mtrrs / pat - in fact KVM likes the guest memory to be
always WB to avoid various CPU errata.


Best regards,
Maxim Levitsky

2022-03-23 01:10:30

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH v3 4/7] KVM: x86: nSVM: support PAUSE filter threshold and count when cpu_pm=on

On Wed, 2022-03-09 at 14:12 +0100, Paolo Bonzini wrote:
> On 3/1/22 15:36, Maxim Levitsky wrote:
> > Allow L1 to use these settings if L0 disables PAUSE interception
> > (AKA cpu_pm=on)
> >
> > Signed-off-by: Maxim Levitsky <[email protected]>
> > ---
> > arch/x86/kvm/svm/nested.c | 6 ++++++
> > arch/x86/kvm/svm/svm.c | 17 +++++++++++++++++
> > arch/x86/kvm/svm/svm.h | 2 ++
> > 3 files changed, 25 insertions(+)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index 37510cb206190..4cb0bc49986d5 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -664,6 +664,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
> > if (!nested_vmcb_needs_vls_intercept(svm))
> > svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> >
> > + if (svm->pause_filter_enabled)
> > + svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
> > +
> > + if (svm->pause_threshold_enabled)
> > + svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
>
> I think this should be
>
> if (kvm_pause_in_guest(vcpu->kvm)) {
> /* copy from VMCB12 if guest has CPUID, else set to 0 */
> } else {
> /* copy from VMCB01, unconditionally */
> }

> and likewise it should be copied back to VMCB01 unconditionally on
> vmexit if !kvm_pause_in_guest(vcpu->kvm).


I did something different in the next version of the patches.
Please take a look.


>
> > nested_svm_transition_tlb_flush(vcpu);
> >
> > /* Enter Guest-Mode */
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 6a571eed32ef4..52198e63c5fc4 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -4008,6 +4008,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> >
> > svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
> >
> > + if (kvm_pause_in_guest(vcpu->kvm)) {
> > + svm->pause_filter_enabled = pause_filter_count > 0 &&
> > + guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
> > +
> > + svm->pause_threshold_enabled = pause_filter_thresh > 0 &&
> > + guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
>
> Why only if the module parameters are >0? The module parameter is
> unused if pause-in-guest is active.

Agree, will do.

>
> > + } else {
> > + svm->pause_filter_enabled = false;
> > + svm->pause_threshold_enabled = false;
> > + }
> > +
> > svm_recalc_instruction_intercepts(vcpu, svm);
> >
> > /* For sev guests, the memory encryption bit is not reserved in CR3. */
> > @@ -4763,6 +4774,12 @@ static __init void svm_set_cpu_caps(void)
> > if (vls)
> > kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
> >
> > + if (pause_filter_count)
> > + kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
> > +
> > + if (pause_filter_thresh)
> > + kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
>
> Likewise, this should be set using just boot_cpu_has, not the module
> parameters.

Agree as well + the check above is wrong - it should have been inverted.

>
> Paolo
>
> > /* Nested VM can receive #VMEXIT instead of triggering #GP */
> > kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
> > }
> > diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> > index a3c93f9c02847..6fa81eb3ffb78 100644
> > --- a/arch/x86/kvm/svm/svm.h
> > +++ b/arch/x86/kvm/svm/svm.h
> > @@ -234,6 +234,8 @@ struct vcpu_svm {
> > bool tsc_scaling_enabled : 1;
> > bool lbrv_enabled : 1;
> > bool v_vmload_vmsave_enabled : 1;
> > + bool pause_filter_enabled : 1;
> > + bool pause_threshold_enabled : 1;
> >
> > u32 ldr_reg;
> > u32 dfr_reg;


Thanks for the review!

Best regards,
Maxim Levitsky