v2:
- Rename x86_get_freq() to x86_get_cpufreq_khz() to make function
specific. (Suggested by Thomas Gleixner)
- Use cpu_feature_enabled(X86_FEATURE_TSC) instead of
cpu_has(&cpu_data(cpu), X86_FEATURE_TSC) to test feature.
(Suggested by Thomas Gleixner)
- Spell fixes. (Suggested by Thomas Gleixner)
v1:
- Introduce x86_get_freq(), hide detailed implemention from proc
routine.
- KVM uses x86_get_freq() to get CPU frequecy.
zhenwei pi (2):
x86/cpu: Introduce x86_get_cpufreq_khz()
KVM: x86: use x86_get_freq to get freq for kvmclock
arch/x86/include/asm/processor.h | 2 ++
arch/x86/kernel/cpu/common.c | 19 +++++++++++++++++++
arch/x86/kernel/cpu/proc.c | 13 +++----------
arch/x86/kvm/x86.c | 4 +---
4 files changed, 25 insertions(+), 13 deletions(-)
--
2.25.1
Wrapper function x86_get_cpufreq_khz() to get frequency on a x86
platform, hide detailed implementation from proc routine.
Also export this function for the further use, a typical case is that
kvm module gets the frequency of the host and tell the guest side by
kvmclock.
Signed-off-by: zhenwei pi <[email protected]>
---
arch/x86/include/asm/processor.h | 2 ++
arch/x86/kernel/cpu/common.c | 19 +++++++++++++++++++
arch/x86/kernel/cpu/proc.c | 13 +++----------
3 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 355d38c0cf60..22f183dee593 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -855,4 +855,6 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+unsigned int x86_get_cpufreq_khz(unsigned int cpu);
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0083464de5e3..997026fedbb4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,6 +22,7 @@
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
+#include <linux/cpufreq.h>
#include <asm/cmdline.h>
#include <asm/stackprotector.h>
@@ -2104,3 +2105,21 @@ void arch_smt_update(void)
/* Check whether IPI broadcasting can be enabled */
apic_smt_update();
}
+
+unsigned int x86_get_cpufreq_khz(unsigned int cpu)
+{
+ unsigned int freq = 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_TSC))
+ return 0;
+
+ freq = aperfmperf_get_khz(cpu);
+ if (!freq)
+ freq = cpufreq_quick_get(cpu);
+
+ if (!freq)
+ freq = cpu_khz;
+
+ return freq;
+}
+EXPORT_SYMBOL_GPL(x86_get_cpufreq_khz);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 4eec8889b0ff..8ed17f969f72 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -3,7 +3,6 @@
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
-#include <linux/cpufreq.h>
#include "cpu.h"
@@ -61,7 +60,7 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu;
+ unsigned int cpu, freq;
int i;
cpu = c->cpu_index;
@@ -83,16 +82,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
- if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = aperfmperf_get_khz(cpu);
-
- if (!freq)
- freq = cpufreq_quick_get(cpu);
- if (!freq)
- freq = cpu_khz;
+ freq = x86_get_cpufreq_khz(cpu);
+ if (freq)
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
- }
/* Cache size */
if (c->x86_cache_size)
--
2.25.1
If the host side supports APERF&MPERF feature, the guest side may get
mismatched frequency.
KVM uses x86_get_cpufreq_khz() to get the same frequency for guest side.
Signed-off-by: zhenwei pi <[email protected]>
---
arch/x86/kvm/x86.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..125ed3c8b21a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8305,10 +8305,8 @@ static void tsc_khz_changed(void *data)
if (data)
khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
- khz = tsc_khz;
+ khz = x86_get_cpufreq_khz(raw_smp_processor_id());
__this_cpu_write(cpu_tsc_khz, khz);
}
--
2.25.1
On Wed, Dec 01 2021 at 10:46, zhenwei pi wrote:
> If the host side supports APERF&MPERF feature, the guest side may get
> mismatched frequency.
>
> KVM uses x86_get_cpufreq_khz() to get the same frequency for guest side.
>
> Signed-off-by: zhenwei pi <[email protected]>
> ---
> arch/x86/kvm/x86.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5a403d92833f..125ed3c8b21a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8305,10 +8305,8 @@ static void tsc_khz_changed(void *data)
>
> if (data)
> khz = freq->new;
> - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
> - khz = cpufreq_quick_get(raw_smp_processor_id());
> if (!khz)
> - khz = tsc_khz;
> + khz = x86_get_cpufreq_khz(raw_smp_processor_id());
my brain compiler tells me that this is broken.
On 12/2/21 10:48 AM, Thomas Gleixner wrote:
> On Wed, Dec 01 2021 at 10:46, zhenwei pi wrote:
>> If the host side supports APERF&MPERF feature, the guest side may get
>> mismatched frequency.
>>
>> KVM uses x86_get_cpufreq_khz() to get the same frequency for guest side.
>>
>> Signed-off-by: zhenwei pi <[email protected]>
>> ---
>> arch/x86/kvm/x86.c | 4 +---
>> 1 file changed, 1 insertion(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 5a403d92833f..125ed3c8b21a 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -8305,10 +8305,8 @@ static void tsc_khz_changed(void *data)
>>
>> if (data)
>> khz = freq->new;
>> - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
>> - khz = cpufreq_quick_get(raw_smp_processor_id());
>> if (!khz)
>> - khz = tsc_khz;
>> + khz = x86_get_cpufreq_khz(raw_smp_processor_id());
>
> my brain compiler tells me that this is broken.
> Without this patch:
1, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is true:
no kvmclock_cpufreq_notifier, and khz = tsc_khz;
2, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is false:
during installing kmod, try cpufreq_quick_get(), or use tsc_khz;
and get changed by kvmclock_cpufreq_notifier.
With this patch:
1, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is true:
no kvmclock_cpufreq_notifier, try aperf/mperf, or try
cpufreq_quick_get(), or use cpu_khz
2, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is false:
during installing kmod, try aperf/mperf, or try cpufreq_quick_get(), or
use cpu_khz;
and get changed by kvmclock_cpufreq_notifier.
I tested on Skylake&Icelake CPU, and got different CPU frequency from
host & guest, the main purpose of this patch is to get the same frequency.
--
zhenwei pi
On Thu, 2021-12-02 at 13:26 +0800, zhenwei pi wrote:
> On 12/2/21 10:48 AM, Thomas Gleixner wrote:
> > On Wed, Dec 01 2021 at 10:46, zhenwei pi wrote:
> > > If the host side supports APERF&MPERF feature, the guest side may get
> > > mismatched frequency.
> > >
> > > KVM uses x86_get_cpufreq_khz() to get the same frequency for guest side.
> > >
> > > Signed-off-by: zhenwei pi <[email protected]>
> > > ---
> > > arch/x86/kvm/x86.c | 4 +---
> > > 1 file changed, 1 insertion(+), 3 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 5a403d92833f..125ed3c8b21a 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -8305,10 +8305,8 @@ static void tsc_khz_changed(void *data)
> > >
> > > if (data)
> > > khz = freq->new;
> > > - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
> > > - khz = cpufreq_quick_get(raw_smp_processor_id());
> > > if (!khz)
> > > - khz = tsc_khz;
> > > + khz = x86_get_cpufreq_khz(raw_smp_processor_id());
> >
> > my brain compiler tells me that this is broken.
> > Without this patch:
> 1, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is true:
> no kvmclock_cpufreq_notifier, and khz = tsc_khz;
>
> 2, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is false:
> during installing kmod, try cpufreq_quick_get(), or use tsc_khz;
> and get changed by kvmclock_cpufreq_notifier.
>
> With this patch:
> 1, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is true:
> no kvmclock_cpufreq_notifier, try aperf/mperf, or try
> cpufreq_quick_get(), or use cpu_khz
>
> 2, boot_cpu_has(X86_FEATURE_CONSTANT_TSC) is false:
> during installing kmod, try aperf/mperf, or try cpufreq_quick_get(), or
> use cpu_khz;
> and get changed by kvmclock_cpufreq_notifier.
>
> I tested on Skylake&Icelake CPU, and got different CPU frequency from
> host & guest, the main purpose of this patch is to get the same frequency.
>
Note that on my Zen2 machine (3970X), aperf/mperf returns current cpu freqency,
as now see in /proc/cpuinfo, while TSC is always running with base CPU clock frequency (3.7 GHZ)
(that is max frequency that CPU is guranteed to run with, anything above is boost 'bonus')
[mlevitsk@starship ~/Kernel/br-vm-64/src]$cat /proc/cpuinfo | grep "cpu MHz"
cpu MHz : 3685.333
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2761.946
cpu MHz : 2200.000
cpu MHz : 2200.000
cpu MHz : 2200.000
...
[mlevitsk@starship ~/Kernel/master/src]$dmesg | grep tsc
[ 0.000000] tsc: Fast TSC calibration using PIT
[ 0.000000] tsc: Detected 3700.230 MHz processor
...
Before I forget about it I do want to point out few things
that are not 100% related to this thread but do related to TSC:
1. It sucks that on AMD, the TSC frequency is calibrated from other
clocksources like PIT/HPET, since the result is not exact and varies
from boot to boot. I do wonder if they have something like that
APERF/MPERF thing which sadly is not what I was looking for.
2. In the guest on AMD, we mark the TSC as unsynchronized always due to the code
in unsynchronized_tsc, unless invariant tsc is used in guest cpuid,
which is IMHO not fair to AMD as we don't do this for Intel cpus.
(look at unsynchronized_tsc function)
3. I wish the kernel would export the tsc frequency it found to userspace
somewhere in /sys or /proc, as this would be very useful for userspace applications.
Currently it can only be found in dmesg if I am not mistaken..
I don't mind if such frequency would only be exported if the TSC is stable,
always running, not affected by CPUfreq, etc.
Best regards,
Maxim Levitsky
On Wed, Dec 01, 2021 at 10:46:49AM +0800, zhenwei pi wrote:
> Wrapper function x86_get_cpufreq_khz() to get frequency on a x86
> platform, hide detailed implementation from proc routine.
>
> Also export this function for the further use, a typical case is that
> kvm module gets the frequency of the host and tell the guest side by
> kvmclock.
This function was already complete crap, and now you want to export it,
*WHY* ?!
What possible use does KVM have for this random number generator?
On Thu, Dec 02, 2021 at 09:19:25AM +0200, Maxim Levitsky wrote:
> On Thu, 2021-12-02 at 13:26 +0800, zhenwei pi wrote:
> Note that on my Zen2 machine (3970X), aperf/mperf returns current cpu freqency,
Correct, and it computes it over a random period of history. IOW, it's a
random number generator.
> 1. It sucks that on AMD, the TSC frequency is calibrated from other
> clocksources like PIT/HPET, since the result is not exact and varies
> from boot to boot.
CPUID.15h is supposed to tell us the actual frequency; except even Intel
find it very hard to actually put the right (or any, really) number in
there :/ Bribe your friendly AMD engineer with beers or something.
> 2. In the guest on AMD, we mark the TSC as unsynchronized always due to the code
> in unsynchronized_tsc, unless invariant tsc is used in guest cpuid,
> which is IMHO not fair to AMD as we don't do this for Intel cpus.
> (look at unsynchronized_tsc function)
Possibly we could treat >= Zen similar to Intel there. Also that comment
there is hillarious, it talks about multi-socket and then tests
num_possible_cpus(). Clearly that code hasn't been touched in like
forever.
> 3. I wish the kernel would export the tsc frequency it found to userspace
> somewhere in /sys or /proc, as this would be very useful for userspace applications.
> Currently it can only be found in dmesg if I am not mistaken..
> I don't mind if such frequency would only be exported if the TSC is stable,
> always running, not affected by CPUfreq, etc.
Perf exposes it, it's not really convenient if you're not using perf,
but it can be found there.
---
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..09da2935534a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -29,6 +29,7 @@
#include <asm/intel-family.h>
#include <asm/i8259.h>
#include <asm/uv/uv.h>
+#include <asm/topology.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -1221,9 +1222,20 @@ int unsynchronized_tsc(void)
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
*/
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /* Really only Core and later */
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ if (boot_cpu_data.x86 >= 0x17) /* >= Zen */
+ break;
+ fallthrough;
+
+ default:
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
+ if (topology_max_packages() > 1)
return 1;
}
On 12/3/21 6:25 AM, Peter Zijlstra wrote:
> On Wed, Dec 01, 2021 at 10:46:49AM +0800, zhenwei pi wrote:
>> Wrapper function x86_get_cpufreq_khz() to get frequency on a x86
>> platform, hide detailed implementation from proc routine.
>>
>> Also export this function for the further use, a typical case is that
>> kvm module gets the frequency of the host and tell the guest side by
>> kvmclock.
>
> This function was already complete crap, and now you want to export it,
> *WHY* ?!
>
> What possible use does KVM have for this random number generator?
>
A KVM guest overwrites the '.calibrate_tsc' and '.calibrate_cpu' if
kvmclock is supported:
in function kvmclock_init(void) (linux/arch/x86/kernel/kvmclock.c)
...
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.calibrate_cpu = kvm_get_tsc_khz;
...
And kvm_get_tsc_khz reads PV data from host side. Before guest reads
this, KVM should writes the frequency into the PV data structure.
And the problem is that KVM gets tsc_khz directly without aperf/mperf
detection. So user may gets different frequency(cat /proc/cpuinfo) from
guest & host.
Or is that possible to export function 'aperfmperf_get_khz'?
--
zhenwei pi
On Thu, 2021-12-02 at 23:45 +0100, Peter Zijlstra wrote:
> On Thu, Dec 02, 2021 at 09:19:25AM +0200, Maxim Levitsky wrote:
> > On Thu, 2021-12-02 at 13:26 +0800, zhenwei pi wrote:
> > Note that on my Zen2 machine (3970X), aperf/mperf returns current cpu freqency,
>
> Correct, and it computes it over a random period of history. IOW, it's a
> random number generator.
>
> > 1. It sucks that on AMD, the TSC frequency is calibrated from other
> > clocksources like PIT/HPET, since the result is not exact and varies
> > from boot to boot.
>
> CPUID.15h is supposed to tell us the actual frequency; except even Intel
> find it very hard to actually put the right (or any, really) number in
> there :/ Bribe your friendly AMD engineer with beers or something.
That what I thought. I asked just in case maybe AMD does have some vendor specific msrs
you know about but we didn't bother to support it.
I didn't find any in their PRM.
>
> > 2. In the guest on AMD, we mark the TSC as unsynchronized always due to the code
> > in unsynchronized_tsc, unless invariant tsc is used in guest cpuid,
> > which is IMHO not fair to AMD as we don't do this for Intel cpus.
> > (look at unsynchronized_tsc function)
>
> Possibly we could treat >= Zen similar to Intel there. Also that comment
> there is hillarious, it talks about multi-socket and then tests
> num_possible_cpus(). Clearly that code hasn't been touched in like
> forever.
Thank you!
>
> > 3. I wish the kernel would export the tsc frequency it found to userspace
> > somewhere in /sys or /proc, as this would be very useful for userspace applications.
> > Currently it can only be found in dmesg if I am not mistaken..
> > I don't mind if such frequency would only be exported if the TSC is stable,
> > always running, not affected by CPUfreq, etc.
>
> Perf exposes it, it's not really convenient if you're not using perf,
> but it can be found there.
That is good to know! I will check out the source but if you remember,
is there cli option in perf to show it, or it only uses it for internal
purposes?
>
>
> ---
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 2e076a459a0c..09da2935534a 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -29,6 +29,7 @@
> #include <asm/intel-family.h>
> #include <asm/i8259.h>
> #include <asm/uv/uv.h>
> +#include <asm/topology.h>
>
> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
> EXPORT_SYMBOL(cpu_khz);
> @@ -1221,9 +1222,20 @@ int unsynchronized_tsc(void)
> * Intel systems are normally all synchronized.
> * Exceptions must mark TSC as unstable:
> */
> - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
> + switch (boot_cpu_data.x86_vendor) {
> + case X86_VENDOR_INTEL:
> + /* Really only Core and later */
> + break;
> +
> + case X86_VENDOR_AMD:
> + case X86_VENDOR_HYGON:
> + if (boot_cpu_data.x86 >= 0x17) /* >= Zen */
> + break;
> + fallthrough;
> +
> + default:
> /* assume multi socket systems are not synchronized: */
> - if (num_possible_cpus() > 1)
> + if (topology_max_packages() > 1)
> return 1;
> }
>
This makes sense!
>
Best regards,
Maxim Levitsky
On Fri, Dec 03, 2021 at 09:53:53AM +0200, Maxim Levitsky wrote:
> > Perf exposes it, it's not really convenient if you're not using perf,
> > but it can be found there.
> That is good to know! I will check out the source but if you remember,
> is there cli option in perf to show it, or it only uses it for internal
> purposes?
perf tool doesn't expose it I think, it's stuffed in
perf_event_mmap_page::time_* fields, see
include/uapi/linux/perf_event.h, it has comments on.
On Fri, Dec 03, 2021 at 03:34:04PM +0800, zhenwei pi wrote:
> A KVM guest overwrites the '.calibrate_tsc' and '.calibrate_cpu' if kvmclock
> is supported:
>
> in function kvmclock_init(void) (linux/arch/x86/kernel/kvmclock.c)
> ...
> x86_platform.calibrate_tsc = kvm_get_tsc_khz;
> x86_platform.calibrate_cpu = kvm_get_tsc_khz;
> ...
>
> And kvm_get_tsc_khz reads PV data from host side. Before guest reads this,
> KVM should writes the frequency into the PV data structure.
>
> And the problem is that KVM gets tsc_khz directly without aperf/mperf
> detection. So user may gets different frequency(cat /proc/cpuinfo) from
> guest & host.
>
> Or is that possible to export function 'aperfmperf_get_khz'?
TSC frequency and aperf/mperf are unrelated. You're trying to make apple
juice with carrots.