Intel CPUs have a new alternative MSR range (starting from MSR_IA32_PMC0)
for GP counters that allows writing the full counter width. Enable this
range from a new capability bit (IA32_PERF_CAPABILITIES.FW_WRITE[bit 13]).
The guest would query CPUID to get the counter width, and sign extends
the counter values as needed. The traditional MSRs always limit to 32bit,
even though the counter internally is larger (usually 48 bits).
When the new capability is set, use the alternative range which do not
have these restrictions. This lowers the overhead of perf stat slightly
because it has to do less interrupts to accumulate the counter value.
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx/capabilities.h | 15 ++++++++++++++
arch/x86/kvm/vmx/pmu_intel.c | 35 +++++++++++++++++++++++++++++----
arch/x86/kvm/vmx/vmx.c | 5 +++++
arch/x86/kvm/x86.c | 8 ++++++++
6 files changed, 61 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..1c2e3e79490b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -481,6 +481,7 @@ struct kvm_pmu {
u64 counter_bitmask[2];
u64 global_ctrl_mask;
u64 global_ovf_ctrl_mask;
+ u64 perf_capabilities;
u64 reserved_bits;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 901cd1fdecd9..654ec2718fe4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -297,7 +297,7 @@ void kvm_set_cpu_caps(void)
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8903475f751e..9c4123292656 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -367,4 +367,19 @@ static inline bool vmx_pt_mode_is_host_guest(void)
return pt_mode == PT_MODE_HOST_GUEST;
}
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+
+static inline u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+ /* Currently, KVM only supports Full-Width Writes. */
+ perf_cap &= PMU_CAP_FW_WRITES;
+
+ return perf_cap;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7c857737b438..c15b5b03de38 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -150,6 +150,16 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
+static inline bool fw_writes_is_enabled(struct kvm_pmu *pmu)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return false;
+
+ return pmu->perf_capabilities & PMU_CAP_FW_WRITES;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -162,10 +172,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM);
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) ||
+ (fw_writes_is_enabled(pmu) &&
+ get_gp_pmc(pmu, msr, MSR_IA32_PMC0));
break;
}
@@ -202,8 +217,12 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_IA32_PERF_CAPABILITIES:
+ *data = pmu->perf_capabilities;
+ return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
@@ -258,9 +277,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ return 1; /* RO MSR */
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (!msr_info->host_initiated)
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if (data & ~pmu->counter_bitmask[KVM_PMC_GP])
+ return 1;
+ if (!fw_writes_is_enabled(pmu))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
if (pmc->perf_event)
@@ -300,6 +324,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -312,6 +337,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ pmu->perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c2c6335a998c..f0c70a76f8fd 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1772,6 +1772,11 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!nested)
+ return 1;
+ msr->data = vmx_get_perf_capabilities();
+ return 0;
default:
return 1;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5835f9cb9ad..59b6b272ca17 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1220,6 +1220,13 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+ MSR_IA32_PMC0, MSR_IA32_PMC0 + 1, MSR_IA32_PMC0 + 2,
+ MSR_IA32_PMC0 + 3, MSR_IA32_PMC0 + 4, MSR_IA32_PMC0 + 5,
+ MSR_IA32_PMC0 + 6, MSR_IA32_PMC0 + 7, MSR_IA32_PMC0 + 8,
+ MSR_IA32_PMC0 + 9, MSR_IA32_PMC0 + 10, MSR_IA32_PMC0 + 11,
+ MSR_IA32_PMC0 + 12, MSR_IA32_PMC0 + 13, MSR_IA32_PMC0 + 14,
+ MSR_IA32_PMC0 + 15, MSR_IA32_PMC0 + 16, MSR_IA32_PMC0 + 17,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1314,6 +1321,7 @@ static const u32 msr_based_features_all[] = {
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
};
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
--
2.21.1
On 07/05/20 04:14, Like Xu wrote:
> +static inline u64 vmx_get_perf_capabilities(void)
> +{
> + u64 perf_cap = 0;
> +
> + if (boot_cpu_has(X86_FEATURE_PDCM))
> + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
> +
> + /* Currently, KVM only supports Full-Width Writes. */
> + perf_cap &= PMU_CAP_FW_WRITES;
> +
> + return perf_cap;
> +}
> +
Since counters are virtualized, it seems to me that you can support
PMU_CAP_FW_WRITES unconditionally, even if the host lacks it. So just
return PMU_CAP_FW_WRITES from this function.
> + case MSR_IA32_PERF_CAPABILITIES:
> + return 1; /* RO MSR */
> default:
You need to allow writes from the host if (data &
~vmx_get_perf_capabilities()) == 0.
> - if (!msr_info->host_initiated)
> + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
> + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
> + if (data & ~pmu->counter_bitmask[KVM_PMC_GP])
> + return 1;
> + if (!fw_writes_is_enabled(pmu))
> data = (s64)(s32)data;
You are dropping the test on msr_info->host_initiated here, you should
keep it otherwise you allow full-width write to MSR_IA32_PERFCTR0 as
well. So:
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
> + case MSR_IA32_PERF_CAPABILITIES:
> + if (!nested)
> + return 1;
> + msr->data = vmx_get_perf_capabilities();
> + return 0;
The !nested check is wrong.
>
> +++ b/arch/x86/kvm/x86.c
> @@ -1220,6 +1220,13 @@ static const u32 msrs_to_save_all[] = {
> MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
> MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
> MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
> +
> + MSR_IA32_PMC0, MSR_IA32_PMC0 + 1, MSR_IA32_PMC0 + 2,
> + MSR_IA32_PMC0 + 3, MSR_IA32_PMC0 + 4, MSR_IA32_PMC0 + 5,
> + MSR_IA32_PMC0 + 6, MSR_IA32_PMC0 + 7, MSR_IA32_PMC0 + 8,
> + MSR_IA32_PMC0 + 9, MSR_IA32_PMC0 + 10, MSR_IA32_PMC0 + 11,
> + MSR_IA32_PMC0 + 12, MSR_IA32_PMC0 + 13, MSR_IA32_PMC0 + 14,
> + MSR_IA32_PMC0 + 15, MSR_IA32_PMC0 + 16, MSR_IA32_PMC0 + 17,
> };
This is not needed because the full-width content is already accessible
from the host via MSR_IA32_PERFCTRn.
Given the bugs, it is clear that you should also modify the pmu.c
testcase for kvm-unit-tests to cover full-width writes (and especially
the non-full-width write behavior of MSR_IA32_PERFCTRn). Even before
the QEMU side is begin worked on, you can test it with "-cpu
host,migratable=off".
Thanks,
Paolo
Hi Paolo,
Thanks for your detailed comments.
On 2020/5/7 15:57, Paolo Bonzini wrote:
> On 07/05/20 04:14, Like Xu wrote:
>> +static inline u64 vmx_get_perf_capabilities(void)
>> +{
>> + u64 perf_cap = 0;
>> +
>> + if (boot_cpu_has(X86_FEATURE_PDCM))
>> + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
>> +
>> + /* Currently, KVM only supports Full-Width Writes. */
>> + perf_cap &= PMU_CAP_FW_WRITES;
>> +
>> + return perf_cap;
>> +}
>> +
> Since counters are virtualized, it seems to me that you can support
> PMU_CAP_FW_WRITES unconditionally, even if the host lacks it. So just
> return PMU_CAP_FW_WRITES from this function.
Sure, let's export PMU_CAP_FW_WRITES unconditionally.
>
>> + case MSR_IA32_PERF_CAPABILITIES:
>> + return 1; /* RO MSR */
>> default:
> You need to allow writes from the host if (data &
> ~vmx_get_perf_capabilities()) == 0.
Yes, it makes sense after I understand the intention of host_initiated.
>
>> - if (!msr_info->host_initiated)
>> + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
>> + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
>> + if (data & ~pmu->counter_bitmask[KVM_PMC_GP])
>> + return 1;
>> + if (!fw_writes_is_enabled(pmu))
>> data = (s64)(s32)data;
>
> You are dropping the test on msr_info->host_initiated here, you should
> keep it otherwise you allow full-width write to MSR_IA32_PERFCTR0 as
> well. So:
>
> #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
>
> if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT))
> data = (s64)(s32)data;
Thanks, it looks good to me and I'll apply it.
>
>> + case MSR_IA32_PERF_CAPABILITIES:
>> + if (!nested)
>> + return 1;
>> + msr->data = vmx_get_perf_capabilities();
>> + return 0;
> The !nested check is wrong.
You're absolutely right about this.
>
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1220,6 +1220,13 @@ static const u32 msrs_to_save_all[] = {
>> MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
>> MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
>> MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
>> +
>> + MSR_IA32_PMC0, MSR_IA32_PMC0 + 1, MSR_IA32_PMC0 + 2,
>> + MSR_IA32_PMC0 + 3, MSR_IA32_PMC0 + 4, MSR_IA32_PMC0 + 5,
>> + MSR_IA32_PMC0 + 6, MSR_IA32_PMC0 + 7, MSR_IA32_PMC0 + 8,
>> + MSR_IA32_PMC0 + 9, MSR_IA32_PMC0 + 10, MSR_IA32_PMC0 + 11,
>> + MSR_IA32_PMC0 + 12, MSR_IA32_PMC0 + 13, MSR_IA32_PMC0 + 14,
>> + MSR_IA32_PMC0 + 15, MSR_IA32_PMC0 + 16, MSR_IA32_PMC0 + 17,
>> };
> This is not needed because the full-width content is already accessible
> from the host via MSR_IA32_PERFCTRn.
That's true because they're just alias registers for MSR_IA32_PERFCTRn.
>
> Given the bugs, it is clear that you should also modify the pmu.c
> testcase for kvm-unit-tests to cover full-width writes (and especially
> the non-full-width write behavior of MSR_IA32_PERFCTRn). Even before
> the QEMU side is begin worked on, you can test it with "-cpu
> host,migratable=off".
Sure, I added some testcases in pmu.c to cover this feature.
Please review the v3 patch
https://lore.kernel.org/kvm/[email protected]/
as well as the kvm-unit-tests testcase.
Thanks,
Like Xu
>
> Thanks,
>
> Paolo
>
On 08/05/20 10:42, Xu, Like wrote:
>> Given the bugs, it is clear that you should also modify the pmu.c
>> testcase for kvm-unit-tests to cover full-width writes (and especially
>> the non-full-width write behavior of MSR_IA32_PERFCTRn). Even before
>> the QEMU side is begin worked on, you can test it with "-cpu
>> host,migratable=off".
>
> Sure, I added some testcases in pmu.c to cover this feature.
>
> Please review the v3 patch
> https://lore.kernel.org/kvm/[email protected]/
>
> as well as the kvm-unit-tests testcase.
Awesome, thanks for the quick reply!
Paolo