Started working on bringing the PLE (pause loop exit) logic from VMX
to SVM. We noticed some improvements in certain cases where numerous
PAUSE instructions are generated.
Please take a look. If you have any suggestions to make things better,
let me know.
Babu Moger (3):
arch/x86/kvm: SVM: Introduce pause filter threshold
arch/x86/kvm: VMX: Bring the common code to header file
arch/x86/kvm: SVM: Introduce pause loop exit logic in SVM
arch/x86/include/asm/svm.h | 3 +-
arch/x86/kvm/svm.c | 116 ++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/vmx.c | 53 ++++-----------------
arch/x86/kvm/x86.h | 35 ++++++++++++++
4 files changed, 162 insertions(+), 45 deletions(-)
--
1.8.3.1
This patch adds support for the pause filter threshold. Support for this
feature is indicated by CPUID Fn8000_000A_EDX. See AMD APM Vol 2 Section
15.14.4 Pause Intercept Filtering for more details.
In this mode, a 16-bit pause filter threshold field is added in VMCB.
The threshold value is a cycle count that is used to reset the pause
counter. As with simple pause filtering, VMRUN loads the pause count
value from VMCB into an internal counter. Then, on each pause instruction
the hardware checks the elapsed number of cycles since the most recent
pause instruction against the pause Filter Threshold. If the elapsed cycle
count is greater than the pause filter threshold, then the internal pause
count is reloaded from VMCB and execution continues. If the elapsed cycle
count is less than the pause filter threshold, then the internal pause
count is decremented. If the count value is less than zero and pause
intercept is enabled, a #VMEXIT is triggered. If advanced pause filtering
is supported and pause Filter Threshold field is set to zero, the filter
will operate in the simpler, count only mode.
Signed-off-by: Babu Moger <[email protected]>
---
arch/x86/include/asm/svm.h | 3 ++-
arch/x86/kvm/svm.c | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 78dd9df..7a3d9c7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -60,7 +60,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[42];
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f40d0da..50a4e95 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4175,6 +4175,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
--
1.8.3.1
This patch brings some of the code from vmx to x86.h so that it can be
shared between vmx and svm. Modified a couple of functions to make them
common. No functional change.
Signed-off-by: Babu Moger <[email protected]>
---
arch/x86/kvm/vmx.c | 53 ++++++++++-------------------------------------------
arch/x86/kvm/x86.h | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 44 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c829d89..6b9fa7e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -156,25 +156,19 @@
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
-#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
-#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
-#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
- INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
-
-static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+
+static int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);
-static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+static int ple_window = KVM_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
/* Default doubles per-vcpu window every exit. */
-static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
+static int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, int, S_IRUGO);
/* Default resets per-vcpu window every exit to ple_window. */
-static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
+static int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, int, S_IRUGO);
/* Default is to compute the maximum so we can never overflow. */
@@ -6640,40 +6634,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return ret;
}
-static int __grow_ple_window(int val)
-{
- if (ple_window_grow < 1)
- return ple_window;
-
- val = min(val, ple_window_actual_max);
-
- if (ple_window_grow < ple_window)
- val *= ple_window_grow;
- else
- val += ple_window_grow;
-
- return val;
-}
-
-static int __shrink_ple_window(int val, int modifier, int minimum)
-{
- if (modifier < 1)
- return ple_window;
-
- if (modifier < ple_window)
- val /= modifier;
- else
- val -= modifier;
-
- return max(val, minimum);
-}
-
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __grow_ple_window(old);
+ vmx->ple_window = __grow_ple_window(old, ple_window, ple_window_grow,
+ ple_window_actual_max);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6686,7 +6653,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __shrink_ple_window(old,
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
ple_window_shrink, ple_window);
if (vmx->ple_window != old)
@@ -6706,8 +6673,8 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
static void update_ple_window_actual_max(void)
{
ple_window_actual_max =
- __shrink_ple_window(max(ple_window_max, ple_window),
- ple_window_grow, INT_MIN);
+ __shrink_ple_window(max(ple_window_max, ple_window),
+ ple_window, ple_window_grow, INT_MIN);
}
/*
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d0b95b7..d1fb7bb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -9,7 +9,41 @@
#include "kvm_cache_regs.h"
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
+ (INT_MAX / KVM_DEFAULT_PLE_WINDOW_GROW)
+
+static inline int __grow_ple_window(int val, int base, int modifier, int max)
+{
+ if (modifier < 1)
+ return base;
+
+ val = min(val, max);
+
+ if (modifier < base)
+ val *= modifier;
+ else
+ val += modifier;
+
+ return val;
+}
+static inline int __shrink_ple_window(int val, int base, int modifier,
+ int minimum)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, minimum);
+}
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.injected = false;
--
1.8.3.1
Bring the PLE(pause loop exit) logic to AMD svm driver.
We have noticed it help in situations where numerous pauses are generated
due to spinlock or other scenarios. Tested it with idle=poll and noticed
pause interceptions go down considerably.
Signed-off-by: Babu Moger <[email protected]>
---
arch/x86/kvm/svm.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.h | 1 +
2 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 50a4e95..30bc851 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -263,6 +263,55 @@ struct amd_svm_iommu_ir {
static bool npt_enabled;
#endif
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_thresh: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_count: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static int pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, int, S_IRUGO);
+
+static int pause_filter_count = KVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, int, S_IRUGO);
+
+/* Default doubles per-vcpu window every exit. */
+static int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, int, S_IRUGO);
+
+/* Default is to compute the maximum so we can never overflow. */
+static int ple_window_actual_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+static int ple_window_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, int, S_IRUGO);
+
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
@@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ ple_window_grow,
+ ple_window_actual_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __shrink_ple_window(old,
+ pause_filter_count,
+ ple_window_shrink,
+ pause_filter_count);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+/*
+ * ple_window_actual_max is computed to be one grow_ple_window() below
+ * ple_window_max. (See __grow_ple_window for the reason.)
+ * This prevents overflows, because ple_window_max is int.
+ * ple_window_max effectively rounded down to a multiple of ple_window_grow in
+ * this process.
+ * ple_window_max is also prevented from setting control->pause_filter_count <
+ * pause_filter_count.
+ */
+static void update_ple_window_actual_max(void)
+{
+ ple_window_actual_max =
+ __shrink_ple_window(max(ple_window_max, pause_filter_count),
+ pause_filter_count,
+ ple_window_grow, SHRT_MIN);
+}
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1101,6 +1202,8 @@ static __init int svm_hardware_setup(void)
} else
kvm_disable_tdp();
+ update_ple_window_actual_max();
+
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
@@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vcpu.arch.hflags = 0;
if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
+ control->pause_filter_count = pause_filter_count;
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ control->pause_filter_thresh = pause_filter_thresh;
+ else
+ pause_filter_thresh = 0;
set_intercept(svm, INTERCEPT_PAUSE);
}
@@ -3802,6 +3909,9 @@ static int pause_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -5424,6 +5534,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
}
static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d1fb7bb..4c4f6b8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -15,6 +15,7 @@
#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
(INT_MAX / KVM_DEFAULT_PLE_WINDOW_GROW)
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
static inline int __grow_ple_window(int val, int base, int modifier, int max)
{
--
1.8.3.1
2018-03-02 11:17-0500, Babu Moger:
> Bring the PLE(pause loop exit) logic to AMD svm driver.
> We have noticed it help in situations where numerous pauses are generated
> due to spinlock or other scenarios. Tested it with idle=poll and noticed
> pause interceptions go down considerably.
>
> Signed-off-by: Babu Moger <[email protected]>
> ---
> arch/x86/kvm/svm.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.h | 1 +
> 2 files changed, 114 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 50a4e95..30bc851 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -263,6 +263,55 @@ struct amd_svm_iommu_ir {
> static bool npt_enabled;
> #endif
>
> +/*
> + * These 2 parameters are used to config the controls for Pause-Loop Exiting:
> + * pause_filter_thresh: On processors that support Pause filtering(indicated
> + * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
> + * count value. On VMRUN this value is loaded into an internal counter.
> + * Each time a pause instruction is executed, this counter is decremented
> + * until it reaches zero at which time a #VMEXIT is generated if pause
> + * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
> + * Intercept Filtering for more details.
> + * This also indicate if ple logic enabled.
> + *
> + * pause_filter_count: In addition, some processor families support advanced
The comment has thresh/count flipped.
> + * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
> + * the amount of time a guest is allowed to execute in a pause loop.
> + * In this mode, a 16-bit pause filter threshold field is added in the
> + * VMCB. The threshold value is a cycle count that is used to reset the
> + * pause counter. As with simple pause filtering, VMRUN loads the pause
> + * count value from VMCB into an internal counter. Then, on each pause
> + * instruction the hardware checks the elapsed number of cycles since
> + * the most recent pause instruction against the pause filter threshold.
> + * If the elapsed cycle count is greater than the pause filter threshold,
> + * then the internal pause count is reloaded from the VMCB and execution
> + * continues. If the elapsed cycle count is less than the pause filter
> + * threshold, then the internal pause count is decremented. If the count
> + * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
> + * triggered. If advanced pause filtering is supported and pause filter
> + * threshold field is set to zero, the filter will operate in the simpler,
> + * count only mode.
> + */
> +
> +static int pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
> +module_param(pause_filter_thresh, int, S_IRUGO);
I think it was a mistake to put signed values in VMX ...
Please use unsigned variants and also properly sized.
(The module param type would be "ushort" instead of "int".)
> +static int pause_filter_count = KVM_DEFAULT_PLE_WINDOW;
> +module_param(pause_filter_count, int, S_IRUGO);
We are going to want a different default for pause_filter_count, because
they have a different meaning. On Intel, it's the number of cycles, on
AMD, it's the number of PAUSE instructions.
The AMD's 3k is a bit high in comparison to Intel's 4k, but I'd keep 3k
unless we have other benchmark results.
> +static int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
The naming would be nicer with a consistent prefix. We're growing
pause_filter_count, so pause_filter_count_grow is easier to understand.
(Albeit unwieldy.)
> +module_param(ple_window_grow, int, S_IRUGO);
(This is better as unsigned too ... VMX should have had that.)
> @@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
> return 0;
> }
>
> +static void grow_ple_window(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + struct vmcb_control_area *control = &svm->vmcb->control;
> + int old = control->pause_filter_count;
> +
> + control->pause_filter_count = __grow_ple_window(old,
> + pause_filter_count,
> + ple_window_grow,
> + ple_window_actual_max);
> +
> + if (control->pause_filter_count != old)
> + mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> +
> + trace_kvm_ple_window_grow(vcpu->vcpu_id,
> + control->pause_filter_count, old);
Please move the tracing into __shrink_ple_window to share the code.
This probably belongs to patch [2/3].
> +/*
> + * ple_window_actual_max is computed to be one grow_ple_window() below
> + * ple_window_max. (See __grow_ple_window for the reason.)
> + * This prevents overflows, because ple_window_max is int.
> + * ple_window_max effectively rounded down to a multiple of ple_window_grow in
> + * this process.
> + * ple_window_max is also prevented from setting control->pause_filter_count <
> + * pause_filter_count.
> + */
> +static void update_ple_window_actual_max(void)
> +{
> + ple_window_actual_max =
> + __shrink_ple_window(max(ple_window_max, pause_filter_count),
(I have no idea what I was thinking when I wrote that for VMX. :[
I'll write a patch to get rid of ple_window_actual_max, because its
benefits are really minuscule and the logic is complicated.)
> + pause_filter_count,
> + ple_window_grow, SHRT_MIN);
> +}
> static __init int svm_hardware_setup(void)
> {
> int cpu;
> @@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> svm->vcpu.arch.hflags = 0;
>
> if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
> - control->pause_filter_count = 3000;
> + control->pause_filter_count = pause_filter_count;
> + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
> + control->pause_filter_thresh = pause_filter_thresh;
> + else
> + pause_filter_thresh = 0;
Please move this to hardware_setup and also clear pause_filter_count if
X86_FEATURE_PAUSEFILTER is not present.
> set_intercept(svm, INTERCEPT_PAUSE);
The intercept should then be disabled iff pause_filter_count == 0.
The functionality looks correct,
thanks!
Radim,
Thanks for the comments. Taken care of most of the comments.
I have few questions/comments. Please see inline.
> -----Original Message-----
> From: Radim Krčmář <[email protected]>
> Sent: Friday, March 9, 2018 12:13 PM
> To: Moger, Babu <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Subject: Re: [RFC 3/3] arch/x86/kvm: SVM: Introduce pause loop exit logic in
> SVM
>
> 2018-03-02 11:17-0500, Babu Moger:
> > Bring the PLE(pause loop exit) logic to AMD svm driver.
> > We have noticed it help in situations where numerous pauses are
> generated
> > due to spinlock or other scenarios. Tested it with idle=poll and noticed
> > pause interceptions go down considerably.
> >
> > Signed-off-by: Babu Moger <[email protected]>
> > ---
> > arch/x86/kvm/svm.c | 114
> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > arch/x86/kvm/x86.h | 1 +
> > 2 files changed, 114 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 50a4e95..30bc851 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -263,6 +263,55 @@ struct amd_svm_iommu_ir {
> > static bool npt_enabled;
> > #endif
> >
> > +/*
> > + * These 2 parameters are used to config the controls for Pause-Loop
> Exiting:
> > + * pause_filter_thresh: On processors that support Pause
> filtering(indicated
> > + * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
> > + * count value. On VMRUN this value is loaded into an internal counter.
> > + * Each time a pause instruction is executed, this counter is
> decremented
> > + * until it reaches zero at which time a #VMEXIT is generated if pause
> > + * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
> > + * Intercept Filtering for more details.
> > + * This also indicate if ple logic enabled.
> > + *
> > + * pause_filter_count: In addition, some processor families support
> advanced
>
> The comment has thresh/count flipped.
Good catch. Thanks
>
> > + * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound
> on
> > + * the amount of time a guest is allowed to execute in a pause loop.
> > + * In this mode, a 16-bit pause filter threshold field is added in the
> > + * VMCB. The threshold value is a cycle count that is used to reset the
> > + * pause counter. As with simple pause filtering, VMRUN loads the
> pause
> > + * count value from VMCB into an internal counter. Then, on each
> pause
> > + * instruction the hardware checks the elapsed number of cycles since
> > + * the most recent pause instruction against the pause filter threshold.
> > + * If the elapsed cycle count is greater than the pause filter threshold,
> > + * then the internal pause count is reloaded from the VMCB and
> execution
> > + * continues. If the elapsed cycle count is less than the pause filter
> > + * threshold, then the internal pause count is decremented. If the
> count
> > + * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
> > + * triggered. If advanced pause filtering is supported and pause filter
> > + * threshold field is set to zero, the filter will operate in the simpler,
> > + * count only mode.
> > + */
> > +
> > +static int pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
> > +module_param(pause_filter_thresh, int, S_IRUGO);
>
> I think it was a mistake to put signed values in VMX ...
> Please use unsigned variants and also properly sized.
> (The module param type would be "ushort" instead of "int".)
Sure. Will take care.
>
> > +static int pause_filter_count = KVM_DEFAULT_PLE_WINDOW;
> > +module_param(pause_filter_count, int, S_IRUGO);
>
> We are going to want a different default for pause_filter_count, because
> they have a different meaning. On Intel, it's the number of cycles, on
> AMD, it's the number of PAUSE instructions.
>
> The AMD's 3k is a bit high in comparison to Intel's 4k, but I'd keep 3k
> unless we have other benchmark results.
Ok. Testing with pause_filter_count = 3k for AMD. If everything goes fine, will make these changes.
>
> > +static int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
>
> The naming would be nicer with a consistent prefix. We're growing
> pause_filter_count, so pause_filter_count_grow is easier to understand.
> (Albeit unwieldy.)
Sure. Will take care.
>
> > +module_param(ple_window_grow, int, S_IRUGO);
>
> (This is better as unsigned too ... VMX should have had that.)
Yes. Will fix it.
>
> > @@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
> > return 0;
> > }
> >
> > +static void grow_ple_window(struct kvm_vcpu *vcpu)
> > +{
> > + struct vcpu_svm *svm = to_svm(vcpu);
> > + struct vmcb_control_area *control = &svm->vmcb->control;
> > + int old = control->pause_filter_count;
> > +
> > + control->pause_filter_count = __grow_ple_window(old,
> > + pause_filter_count,
> > + ple_window_grow,
> > +
> ple_window_actual_max);
> > +
> > + if (control->pause_filter_count != old)
> > + mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> > +
> > + trace_kvm_ple_window_grow(vcpu->vcpu_id,
> > + control->pause_filter_count, old);
>
> Please move the tracing into __shrink_ple_window to share the code.
> This probably belongs to patch [2/3].
I will have to pass vcpu_id, and have to make few changes to display old and new values.
I am afraid it might add few more extra instructions.
>
> > +/*
> > + * ple_window_actual_max is computed to be one grow_ple_window()
> below
> > + * ple_window_max. (See __grow_ple_window for the reason.)
> > + * This prevents overflows, because ple_window_max is int.
> > + * ple_window_max effectively rounded down to a multiple of
> ple_window_grow in
> > + * this process.
> > + * ple_window_max is also prevented from setting control-
> >pause_filter_count <
> > + * pause_filter_count.
> > + */
> > +static void update_ple_window_actual_max(void)
> > +{
> > + ple_window_actual_max =
> > + __shrink_ple_window(max(ple_window_max,
> pause_filter_count),
>
> (I have no idea what I was thinking when I wrote that for VMX. :[
> I'll write a patch to get rid of ple_window_actual_max, because its
> benefits are really minuscule and the logic is complicated.)
If you are thinking of just straight forward removal, I can take care of it.
>
> > + pause_filter_count,
> > + ple_window_grow, SHRT_MIN);
> > +}
> > static __init int svm_hardware_setup(void)
> > {
> > int cpu;
> > @@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> > svm->vcpu.arch.hflags = 0;
> >
> > if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
> > - control->pause_filter_count = 3000;
> > + control->pause_filter_count = pause_filter_count;
> > + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
> > + control->pause_filter_thresh = pause_filter_thresh;
> > + else
> > + pause_filter_thresh = 0;
>
> Please move this to hardware_setup and also clear pause_filter_count if
Moving this to hardware_setup will be a problem. We don't have access to svm data structure in hardware_setup.
> X86_FEATURE_PAUSEFILTER is not present.
Sure. Will clear pause_filter_count if X86_FEATURE_PAUSEFILTER is not present.
>
> > set_intercept(svm, INTERCEPT_PAUSE);
>
> The intercept should then be disabled iff pause_filter_count == 0.
Yes, will disable intercept if pause_filter_count is zero.
>
> The functionality looks correct,
>
> thanks!
2018-03-10 05:07+0000, Moger, Babu:
> Radim,
> Thanks for the comments. Taken care of most of the comments.
> I have few questions/comments. Please see inline.
>
> > -----Original Message-----
> > From: Radim Krčmář <[email protected]>
> > Sent: Friday, March 9, 2018 12:13 PM
> > To: Moger, Babu <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]
> > Subject: Re: [RFC 3/3] arch/x86/kvm: SVM: Introduce pause loop exit logic in
> > SVM
> >
> > 2018-03-02 11:17-0500, Babu Moger:
> > > Bring the PLE(pause loop exit) logic to AMD svm driver.
> > > We have noticed it help in situations where numerous pauses are
> > generated
> > > due to spinlock or other scenarios. Tested it with idle=poll and noticed
> > > pause interceptions go down considerably.
> > >
> > > Signed-off-by: Babu Moger <[email protected]>
> > > ---
> > > @@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
> > > return 0;
> > > }
> > >
> > > +static void grow_ple_window(struct kvm_vcpu *vcpu)
> > > +{
> > > + struct vcpu_svm *svm = to_svm(vcpu);
> > > + struct vmcb_control_area *control = &svm->vmcb->control;
> > > + int old = control->pause_filter_count;
> > > +
> > > + control->pause_filter_count = __grow_ple_window(old,
> > > + pause_filter_count,
> > > + ple_window_grow,
> > > +
> > ple_window_actual_max);
> > > +
> > > + if (control->pause_filter_count != old)
> > > + mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> > > +
> > > + trace_kvm_ple_window_grow(vcpu->vcpu_id,
> > > + control->pause_filter_count, old);
> >
> > Please move the tracing into __shrink_ple_window to share the code.
> > This probably belongs to patch [2/3].
>
> I will have to pass vcpu_id, and have to make few changes to display old and new values.
> I am afraid it might add few more extra instructions.
Right, vcpu_id isn't available in that function.
Keeping it like this is ok.
> >
> > > +/*
> > > + * ple_window_actual_max is computed to be one grow_ple_window()
> > below
> > > + * ple_window_max. (See __grow_ple_window for the reason.)
> > > + * This prevents overflows, because ple_window_max is int.
> > > + * ple_window_max effectively rounded down to a multiple of
> > ple_window_grow in
> > > + * this process.
> > > + * ple_window_max is also prevented from setting control-
> > >pause_filter_count <
> > > + * pause_filter_count.
> > > + */
> > > +static void update_ple_window_actual_max(void)
> > > +{
> > > + ple_window_actual_max =
> > > + __shrink_ple_window(max(ple_window_max,
> > pause_filter_count),
> >
> > (I have no idea what I was thinking when I wrote that for VMX. :[
> > I'll write a patch to get rid of ple_window_actual_max, because its
> > benefits are really minuscule and the logic is complicated.)
>
> If you are thinking of just straight forward removal, I can take care of it.
And tweaking the overflow handling to account for that. Go ahead if
you'd like to.
> >
> > > + pause_filter_count,
> > > + ple_window_grow, SHRT_MIN);
> > > +}
> > > static __init int svm_hardware_setup(void)
> > > {
> > > int cpu;
> > > @@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> > > svm->vcpu.arch.hflags = 0;
> > >
> > > if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
> > > - control->pause_filter_count = 3000;
> > > + control->pause_filter_count = pause_filter_count;
> > > + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
> > > + control->pause_filter_thresh = pause_filter_thresh;
> > > + else
> > > + pause_filter_thresh = 0;
> >
> > Please move this to hardware_setup and also clear pause_filter_count if
>
> Moving this to hardware_setup will be a problem. We don't have access to svm data structure in hardware_setup.
I mean just the pause_filter_thresh = 0 and pause_filter_count = 0 logic
based on boot_cpu_has (it's weird if the user-visible parameters are
corrected after starting a VM); VMCB configuration stays,
thanks.
> -----Original Message-----
> From: Radim Krčmář <[email protected]>
> Sent: Wednesday, March 14, 2018 8:26 AM
> To: Moger, Babu <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]
> Subject: Re: [RFC 3/3] arch/x86/kvm: SVM: Introduce pause loop exit logic in
> SVM
>
> 2018-03-10 05:07+0000, Moger, Babu:
> > Radim,
> > Thanks for the comments. Taken care of most of the comments.
> > I have few questions/comments. Please see inline.
> >
> > > -----Original Message-----
> > > From: Radim Krčmář <[email protected]>
> > > Sent: Friday, March 9, 2018 12:13 PM
> > > To: Moger, Babu <[email protected]>
> > > Cc: [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]
> > > Subject: Re: [RFC 3/3] arch/x86/kvm: SVM: Introduce pause loop exit
> logic in
> > > SVM
> > >
> > > 2018-03-02 11:17-0500, Babu Moger:
> > > > Bring the PLE(pause loop exit) logic to AMD svm driver.
> > > > We have noticed it help in situations where numerous pauses are
> > > generated
> > > > due to spinlock or other scenarios. Tested it with idle=poll and noticed
> > > > pause interceptions go down considerably.
> > > >
> > > > Signed-off-by: Babu Moger <[email protected]>
> > > > ---
> > > > @@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
> > > > return 0;
> > > > }
> > > >
> > > > +static void grow_ple_window(struct kvm_vcpu *vcpu)
> > > > +{
> > > > + struct vcpu_svm *svm = to_svm(vcpu);
> > > > + struct vmcb_control_area *control = &svm->vmcb->control;
> > > > + int old = control->pause_filter_count;
> > > > +
> > > > + control->pause_filter_count = __grow_ple_window(old,
> > > > + pause_filter_count,
> > > > + ple_window_grow,
> > > > +
> > > ple_window_actual_max);
> > > > +
> > > > + if (control->pause_filter_count != old)
> > > > + mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> > > > +
> > > > + trace_kvm_ple_window_grow(vcpu->vcpu_id,
> > > > + control->pause_filter_count, old);
> > >
> > > Please move the tracing into __shrink_ple_window to share the code.
> > > This probably belongs to patch [2/3].
> >
> > I will have to pass vcpu_id, and have to make few changes to display old
> and new values.
> > I am afraid it might add few more extra instructions.
>
> Right, vcpu_id isn't available in that function.
> Keeping it like this is ok.
>
> > >
> > > > +/*
> > > > + * ple_window_actual_max is computed to be one
> grow_ple_window()
> > > below
> > > > + * ple_window_max. (See __grow_ple_window for the reason.)
> > > > + * This prevents overflows, because ple_window_max is int.
> > > > + * ple_window_max effectively rounded down to a multiple of
> > > ple_window_grow in
> > > > + * this process.
> > > > + * ple_window_max is also prevented from setting control-
> > > >pause_filter_count <
> > > > + * pause_filter_count.
> > > > + */
> > > > +static void update_ple_window_actual_max(void)
> > > > +{
> > > > + ple_window_actual_max =
> > > > + __shrink_ple_window(max(ple_window_max,
> > > pause_filter_count),
> > >
> > > (I have no idea what I was thinking when I wrote that for VMX. :[
> > > I'll write a patch to get rid of ple_window_actual_max, because its
> > > benefits are really minuscule and the logic is complicated.)
> >
> > If you are thinking of just straight forward removal, I can take care of it.
>
> And tweaking the overflow handling to account for that. Go ahead if
> you'd like to.
Ok. Will add new patch to the series to handle this. Thanks.
>
> > >
> > > > + pause_filter_count,
> > > > + ple_window_grow, SHRT_MIN);
> > > > +}
> > > > static __init int svm_hardware_setup(void)
> > > > {
> > > > int cpu;
> > > > @@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
> > > > svm->vcpu.arch.hflags = 0;
> > > >
> > > > if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
> > > > - control->pause_filter_count = 3000;
> > > > + control->pause_filter_count = pause_filter_count;
> > > > + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
> > > > + control->pause_filter_thresh = pause_filter_thresh;
> > > > + else
> > > > + pause_filter_thresh = 0;
> > >
> > > Please move this to hardware_setup and also clear pause_filter_count if
> >
> > Moving this to hardware_setup will be a problem. We don't have access to
> svm data structure in hardware_setup.
>
> I mean just the pause_filter_thresh = 0 and pause_filter_count = 0 logic
Sure. Will take care.
> based on boot_cpu_has (it's weird if the user-visible parameters are
> corrected after starting a VM); VMCB configuration stays,
>
> thanks.