The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
"x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
may confuse users especially when turbo is enabled on the host.
Emulate the guest APERF/MPERF capability, basing their values on the host's.
Co-developed-by: Li RongQing <[email protected]>
Signed-off-by: Li RongQing <[email protected]>
Reviewed-by: Chai Wen <[email protected]>
Reviewed-by: Jia Lina <[email protected]>
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 12 ++++++
arch/x86/kvm/cpuid.c | 8 +++-
arch/x86/kvm/x86.c | 76 ++++++++++++++++++++++++++++++++-
3 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f852ee350beb..c48b9a0a086e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -539,6 +539,16 @@ struct kvm_vcpu_hv_stimer {
bool msg_pending;
};
+/* vCPU thermal and power context */
+struct kvm_vcpu_hwp {
+ /* Hardware Coordination Feedback Capability (Presence of APERF/MPERF) */
+ bool hw_coord_fb_cap;
+ /* MPERF increases with a fixed frequency */
+ u64 mperf;
+ /* APERF increases with the current/actual frequency */
+ u64 aperf;
+};
+
/* Hyper-V synthetic interrupt controller (SynIC)*/
struct kvm_vcpu_hv_synic {
u64 version;
@@ -829,6 +839,8 @@ struct kvm_vcpu_arch {
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+ struct kvm_vcpu_hwp hwp;
};
struct kvm_lpage_info {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..7057809e7cfd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -78,6 +78,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer_mode_mask = 1 << 17;
}
+ best = kvm_find_cpuid_entry(vcpu, 0x6, 0);
+ if (best && best->function == 0x6 &&
+ boot_cpu_has(X86_FEATURE_APERFMPERF) && (best->ecx & 0x1))
+ vcpu->arch.hwp.hw_coord_fb_cap = true;
+
best = kvm_find_cpuid_entry(vcpu, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
@@ -561,7 +566,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
- entry->ecx = 0;
+ /* allow aperf/mperf to report the true VCPU frequency. */
+ entry->ecx = boot_cpu_has(X86_FEATURE_APERFMPERF) ? 0x1 : 0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..d220d9cc904a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3056,6 +3056,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
+ case MSR_IA32_MPERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ vcpu->arch.hwp.mperf = 0;
+ return 0;
+ case MSR_IA32_APERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ vcpu->arch.hwp.aperf = 0;
+ return 0;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -3323,6 +3333,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_HWCR:
msr_info->data = vcpu->arch.msr_hwcr;
break;
+ case MSR_IA32_MPERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ msr_info->data = vcpu->arch.hwp.mperf;
+ break;
+ case MSR_IA32_APERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ msr_info->data = vcpu->arch.hwp.aperf;
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -8300,6 +8320,50 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+static inline void get_host_amperf(u64 *mperf, u64 *aperf)
+{
+ rdmsrl(MSR_IA32_MPERF, *mperf);
+ rdmsrl(MSR_IA32_APERF, *aperf);
+}
+
+static inline u64 get_amperf_delta(u64 enter, u64 exit)
+{
+ return (exit >= enter) ? (exit - enter) : (ULONG_MAX - enter + exit);
+}
+
+static inline void vcpu_update_amperf(struct kvm_vcpu *vcpu, u64 adelta, u64 mdelta)
+{
+ u64 aperf_left, mperf_left, delta, tmp;
+
+ aperf_left = ULONG_MAX - vcpu->arch.hwp.aperf;
+ mperf_left = ULONG_MAX - vcpu->arch.hwp.mperf;
+
+ /* fast path when neither MSR overflows */
+ if (adelta <= aperf_left && mdelta <= mperf_left) {
+ vcpu->arch.hwp.aperf += adelta;
+ vcpu->arch.hwp.mperf += mdelta;
+ return;
+ }
+
+ /* when either MSR overflows, both MSRs are reset to zero and continue to increment. */
+ delta = min(adelta, mdelta);
+ if (delta > aperf_left || delta > mperf_left) {
+ tmp = max(vcpu->arch.hwp.aperf, vcpu->arch.hwp.mperf);
+ tmp = delta - (ULONG_MAX - tmp) - 1;
+ vcpu->arch.hwp.aperf = tmp + adelta - delta;
+ vcpu->arch.hwp.mperf = tmp + mdelta - delta;
+ return;
+ }
+
+ if (mdelta > adelta && mdelta > aperf_left) {
+ vcpu->arch.hwp.mperf = mdelta - mperf_left - 1;
+ vcpu->arch.hwp.aperf = 0;
+ } else {
+ vcpu->arch.hwp.mperf = 0;
+ vcpu->arch.hwp.aperf = adelta - aperf_left - 1;
+ }
+}
+
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
-
+ u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
@@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
+ if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
+ get_host_amperf(&enter_mperf, &enter_aperf);
+
exit_fastpath = kvm_x86_ops.run(vcpu);
+ if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
+ get_host_amperf(&exit_mperf, &exit_aperf);
+ vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
+ get_amperf_delta(enter_mperf, exit_mperf));
+ }
+
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -9482,6 +9555,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
+ vcpu->arch.hwp.hw_coord_fb_cap = false;
kvm_hv_vcpu_init(vcpu);
--
2.21.3
On Tue, Jun 23, 2020 at 02:35:30PM +0800, Like Xu wrote:
> The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
> "x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
> kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
> may confuse users especially when turbo is enabled on the host.
>
> Emulate guest APERF/MPERF capability based their values on the host.
>
> Co-developed-by: Li RongQing <[email protected]>
> Signed-off-by: Li RongQing <[email protected]>
> Reviewed-by: Chai Wen <[email protected]>
> Reviewed-by: Jia Lina <[email protected]>
> Signed-off-by: Like Xu <[email protected]>
> ---
...
> @@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> dm_request_for_irq_injection(vcpu) &&
> kvm_cpu_accept_dm_intr(vcpu);
> fastpath_t exit_fastpath;
> -
> + u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
> bool req_immediate_exit = false;
>
> if (kvm_request_pending(vcpu)) {
> @@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
> }
>
> + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
> + get_host_amperf(&enter_mperf, &enter_aperf);
> +
> exit_fastpath = kvm_x86_ops.run(vcpu);
>
> + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
> + get_host_amperf(&exit_mperf, &exit_aperf);
> + vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
> + get_amperf_delta(enter_mperf, exit_mperf));
> + }
> +
Is there an alternative approach that doesn't require 4 RDMSRs on every VMX
round trip? That's literally more expensive than VM-Enter + VM-Exit
combined.
E.g. what about adding KVM_X86_DISABLE_EXITS_APERF_MPERF and exposing the
MSRs for read when that capability is enabled?
On Tue, Jun 23, 2020 at 11:29 AM Sean Christopherson
<[email protected]> wrote:
>
> On Tue, Jun 23, 2020 at 02:35:30PM +0800, Like Xu wrote:
> > The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
> > "x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
> > kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
> > may confuse users especially when turbo is enabled on the host.
> >
> > Emulate guest APERF/MPERF capability based their values on the host.
> >
> > Co-developed-by: Li RongQing <[email protected]>
> > Signed-off-by: Li RongQing <[email protected]>
> > Reviewed-by: Chai Wen <[email protected]>
> > Reviewed-by: Jia Lina <[email protected]>
> > Signed-off-by: Like Xu <[email protected]>
> > ---
>
> ...
>
> > @@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > dm_request_for_irq_injection(vcpu) &&
> > kvm_cpu_accept_dm_intr(vcpu);
> > fastpath_t exit_fastpath;
> > -
> > + u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
> > bool req_immediate_exit = false;
> >
> > if (kvm_request_pending(vcpu)) {
> > @@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
> > }
> >
> > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
> > + get_host_amperf(&enter_mperf, &enter_aperf);
> > +
> > exit_fastpath = kvm_x86_ops.run(vcpu);
> >
> > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
> > + get_host_amperf(&exit_mperf, &exit_aperf);
> > + vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
> > + get_amperf_delta(enter_mperf, exit_mperf));
> > + }
> > +
>
> Is there an alternative approach that doesn't require 4 RDMSRs on every VMX
> round trip? That's literally more expensive than VM-Enter + VM-Exit
> combined.
>
> E.g. what about adding KVM_X86_DISABLE_EXITS_APERF_MPERF and exposing the
> MSRs for read when that capability is enabled?
When would you load the hardware MSRs with the guest/host values?
On Tue, Jun 23, 2020 at 11:39:16AM -0700, Jim Mattson wrote:
> On Tue, Jun 23, 2020 at 11:29 AM Sean Christopherson
> <[email protected]> wrote:
> >
> > On Tue, Jun 23, 2020 at 02:35:30PM +0800, Like Xu wrote:
> > > The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
> > > "x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
> > > kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
> > > may confuse users especially when turbo is enabled on the host.
> > >
> > > Emulate guest APERF/MPERF capability based their values on the host.
> > >
> > > Co-developed-by: Li RongQing <[email protected]>
> > > Signed-off-by: Li RongQing <[email protected]>
> > > Reviewed-by: Chai Wen <[email protected]>
> > > Reviewed-by: Jia Lina <[email protected]>
> > > Signed-off-by: Like Xu <[email protected]>
> > > ---
> >
> > ...
> >
> > > @@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > > dm_request_for_irq_injection(vcpu) &&
> > > kvm_cpu_accept_dm_intr(vcpu);
> > > fastpath_t exit_fastpath;
> > > -
> > > + u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
> > > bool req_immediate_exit = false;
> > >
> > > if (kvm_request_pending(vcpu)) {
> > > @@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
> > > }
> > >
> > > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
> > > + get_host_amperf(&enter_mperf, &enter_aperf);
> > > +
> > > exit_fastpath = kvm_x86_ops.run(vcpu);
> > >
> > > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
> > > + get_host_amperf(&exit_mperf, &exit_aperf);
> > > + vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
> > > + get_amperf_delta(enter_mperf, exit_mperf));
> > > + }
> > > +
> >
> > Is there an alternative approach that doesn't require 4 RDMSRs on every VMX
> > round trip? That's literally more expensive than VM-Enter + VM-Exit
> > combined.
> >
> > E.g. what about adding KVM_X86_DISABLE_EXITS_APERF_MPERF and exposing the
> > MSRs for read when that capability is enabled?
>
> When would you load the hardware MSRs with the guest/host values?
Ugh, I was thinking the MSRs were read-only.
Doesn't this also interact with TSC scaling?
On Tue, Jun 23, 2020 at 12:05 PM Sean Christopherson
<[email protected]> wrote:
>
> On Tue, Jun 23, 2020 at 11:39:16AM -0700, Jim Mattson wrote:
> > On Tue, Jun 23, 2020 at 11:29 AM Sean Christopherson
> > <[email protected]> wrote:
> > >
> > > On Tue, Jun 23, 2020 at 02:35:30PM +0800, Like Xu wrote:
> > > > The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
> > > > "x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
> > > > kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
> > > > may confuse users especially when turbo is enabled on the host.
> > > >
> > > > Emulate guest APERF/MPERF capability based their values on the host.
> > > >
> > > > Co-developed-by: Li RongQing <[email protected]>
> > > > Signed-off-by: Li RongQing <[email protected]>
> > > > Reviewed-by: Chai Wen <[email protected]>
> > > > Reviewed-by: Jia Lina <[email protected]>
> > > > Signed-off-by: Like Xu <[email protected]>
> > > > ---
> > >
> > > ...
> > >
> > > > @@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > > > dm_request_for_irq_injection(vcpu) &&
> > > > kvm_cpu_accept_dm_intr(vcpu);
> > > > fastpath_t exit_fastpath;
> > > > -
> > > > + u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
> > > > bool req_immediate_exit = false;
> > > >
> > > > if (kvm_request_pending(vcpu)) {
> > > > @@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
> > > > }
> > > >
> > > > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
> > > > + get_host_amperf(&enter_mperf, &enter_aperf);
> > > > +
> > > > exit_fastpath = kvm_x86_ops.run(vcpu);
> > > >
> > > > + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
> > > > + get_host_amperf(&exit_mperf, &exit_aperf);
> > > > + vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
> > > > + get_amperf_delta(enter_mperf, exit_mperf));
> > > > + }
> > > > +
> > >
> > > Is there an alternative approach that doesn't require 4 RDMSRs on every VMX
> > > round trip? That's literally more expensive than VM-Enter + VM-Exit
> > > combined.
> > >
> > > E.g. what about adding KVM_X86_DISABLE_EXITS_APERF_MPERF and exposing the
> > > MSRs for read when that capability is enabled?
> >
> > When would you load the hardware MSRs with the guest/host values?
>
> Ugh, I was thinking the MSRs were read-only.
Even if they were read-only, they should power on to zero, and they
will most likely not be zero when a guest powers on.
> Doesn't this also interact with TSC scaling?
Yes, it should!
On 24/6/2020 4:34 am, Jim Mattson wrote:
> On Tue, Jun 23, 2020 at 12:05 PM Sean Christopherson
> <[email protected]> wrote:
>>
>> On Tue, Jun 23, 2020 at 11:39:16AM -0700, Jim Mattson wrote:
>>> On Tue, Jun 23, 2020 at 11:29 AM Sean Christopherson
>>> <[email protected]> wrote:
>>>>
>>>> On Tue, Jun 23, 2020 at 02:35:30PM +0800, Like Xu wrote:
>>>>> The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
>>>>> "x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
>>>>> kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
>>>>> may confuse users especially when turbo is enabled on the host.
>>>>>
>>>>> Emulate guest APERF/MPERF capability based their values on the host.
>>>>>
>>>>> Co-developed-by: Li RongQing <[email protected]>
>>>>> Signed-off-by: Li RongQing <[email protected]>
>>>>> Reviewed-by: Chai Wen <[email protected]>
>>>>> Reviewed-by: Jia Lina <[email protected]>
>>>>> Signed-off-by: Like Xu <[email protected]>
>>>>> ---
>>>>
>>>> ...
>>>>
>>>>> @@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>>>> dm_request_for_irq_injection(vcpu) &&
>>>>> kvm_cpu_accept_dm_intr(vcpu);
>>>>> fastpath_t exit_fastpath;
>>>>> -
>>>>> + u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
>>>>> bool req_immediate_exit = false;
>>>>>
>>>>> if (kvm_request_pending(vcpu)) {
>>>>> @@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>>>> vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
>>>>> }
>>>>>
>>>>> + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
>>>>> + get_host_amperf(&enter_mperf, &enter_aperf);
>>>>> +
>>>>> exit_fastpath = kvm_x86_ops.run(vcpu);
>>>>>
>>>>> + if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
>>>>> + get_host_amperf(&exit_mperf, &exit_aperf);
>>>>> + vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
>>>>> + get_amperf_delta(enter_mperf, exit_mperf));
>>>>> + }
>>>>> +
>>>>
>>>> Is there an alternative approach that doesn't require 4 RDMSRs on every VMX
>>>> round trip? That's literally more expensive than VM-Enter + VM-Exit
>>>> combined.
It looks like we have quite a few users who are expecting this feature in
different scenarios.
I will add a fast path for RO usage and a slow path if the guest tries to change
the AMPERF values.
>>>>
>>>> E.g. what about adding KVM_X86_DISABLE_EXITS_APERF_MPERF and exposing the
>>>> MSRs for read when that capability is enabled?
>>>
>>> When would you load the hardware MSRs with the guest/host values?
>>
>> Ugh, I was thinking the MSRs were read-only.
>
> EVen if they were read-only, they should power on to zero, and they
> will most likely not be zero when a guest powers on.
Can we assume that "not zero when the guest is on" will not harm any guests?
>
>> Doesn't this also interact with TSC scaling?
>
> Yes, it should!
We have too much of a historical burden on TSC emulations.
For practical reasons, what if we only expose the AMPERF cap
when the host/guest has both CONSTANT_TSC and NONSTOP_TSC?
One more design concern: I wonder whether it is *safe* for the guest to
read amperf on pCPU[x] the first time and on pCPU[y] the next time.
Any input?
Thanks,
Like Xu