Intel CPUs have a new alternative MSR range (starting at MSR_IA32_PMC0)
for GP counters that allows writing the full counter width. Enable this
range via a new capability bit (IA32_PERF_CAPABILITIES.FW_WRITE, bit 13).
The guest queries CPUID to get the counter width and sign-extends the
counter values as needed. The traditional MSRs always limit writes to
32 bits, even though the counter itself is wider (usually 48 bits).
When the new capability is set, use the alternative range, which does
not have this restriction. This slightly lowers the overhead of perf
stat, because fewer interrupts are needed to accumulate the counter value.
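For illustration only (not part of the patch), the difference between the
two write paths can be sketched as follows; the helper names and the fixed
48-bit width are assumptions made for the sketch (the real width comes from
CPUID 0xA):

#include <stdint.h>

#define COUNTER_WIDTH	48	/* assumed; guests read the real width from CPUID 0xA */
#define COUNTER_MASK	((1ULL << COUNTER_WIDTH) - 1)

/* Legacy MSR_IA32_PERFCTRn: only bits 31:0 are writable, bit 31 is sign-extended. */
static uint64_t legacy_counter_write(uint64_t data)
{
	return ((uint64_t)(int64_t)(int32_t)data) & COUNTER_MASK;
}

/* Full-width MSR_IA32_PMCn: the whole counter width is written as-is. */
static uint64_t full_width_counter_write(uint64_t data)
{
	return data & COUNTER_MASK;
}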
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx/capabilities.h | 11 +++++++++
arch/x86/kvm/vmx/pmu_intel.c | 42 +++++++++++++++++++++++++++++----
arch/x86/kvm/vmx/vmx.c | 3 +++
arch/x86/kvm/x86.c | 1 +
6 files changed, 55 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35a915787559..8c3ae83f63d9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -599,6 +599,7 @@ struct kvm_vcpu_arch {
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
+ u64 perf_capabilities;
/*
* Paging state of the vcpu
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 35845704cf57..411ce1b58341 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -294,7 +294,7 @@ void kvm_set_cpu_caps(void)
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8903475f751e..4bbd8b448d22 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -18,6 +18,8 @@ extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
#define PT_MODE_HOST_GUEST 1
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+
struct nested_vmx_msrs {
/*
* We only store the "true" versions of the VMX capability MSRs. We
@@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void)
return pt_mode == PT_MODE_HOST_GUEST;
}
+static inline u64 vmx_get_perf_capabilities(void)
+{
+ /*
+ * Since counters are virtualized, KVM would support full
+ * width counting unconditionally, even if the host lacks it.
+ */
+ return PMU_CAP_FW_WRITES;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7c857737b438..008c204306ea 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -18,6 +18,8 @@
#include "nested.h"
#include "pmu.h"
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -150,6 +152,14 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return false;
+
+ return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -162,10 +172,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM);
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) ||
+ (fw_writes_is_enabled(vcpu) &&
+ get_gp_pmc(pmu, msr, MSR_IA32_PMC0));
break;
}
@@ -202,8 +217,12 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_IA32_PERF_CAPABILITIES:
+ *data = vcpu->arch.perf_capabilities;
+ return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
@@ -258,9 +277,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (msr_info->host_initiated &&
+ !(data & ~vmx_get_perf_capabilities())) {
+ vcpu->arch.perf_capabilities = data;
+ return 0;
+ }
+ return 1;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (!msr_info->host_initiated)
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+ (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+ return 1;
+ if (!msr_info->host_initiated &&
+ !(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
if (pmc->perf_event)
@@ -300,6 +331,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -312,6 +344,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bc5e5cf1d4cc..ee94d94e855a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1789,6 +1789,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = vmx_get_perf_capabilities();
+ return 0;
default:
return 1;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e46027f405a..8d94d0b74fbb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1323,6 +1323,7 @@ static const u32 msr_based_features_all[] = {
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
};
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
--
2.21.1
When the full-width writes capability is set, use the alternative MSR
range to write counter values wider than 32 bits (up to the full GP
counter width).
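Sketch of the guest-side selection the test performs (illustration only;
rdmsr() and the u64 type are the usual kvm-unit-tests helpers, and
PMU_CAP_FW_WRITES/MSR_IA32_PMC0 are added by this patch):

/* Pick the GP counter MSR base depending on full-width write support. */
static u64 pick_gp_counter_base(void)
{
	if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
		return MSR_IA32_PMC0;		/* full counter width writable */
	return MSR_IA32_PERFCTR0;		/* only bits 31:0, sign-extended */
}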
Signed-off-by: Like Xu <[email protected]>
---
lib/x86/msr.h | 1 +
x86/pmu.c | 125 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 102 insertions(+), 24 deletions(-)
diff --git a/lib/x86/msr.h b/lib/x86/msr.h
index 8dca964..6ef5502 100644
--- a/lib/x86/msr.h
+++ b/lib/x86/msr.h
@@ -35,6 +35,7 @@
#define MSR_IA32_SPEC_CTRL 0x00000048
#define MSR_IA32_PRED_CMD 0x00000049
+#define MSR_IA32_PMC0 0x000004c1
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
diff --git a/x86/pmu.c b/x86/pmu.c
index f45621a..8644f90 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -91,6 +91,9 @@ struct pmu_event {
{"fixed 3", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+static u64 gp_counter_base = MSR_IA32_PERFCTR0;
+
static int num_counters;
char *buf;
@@ -125,12 +128,13 @@ static bool check_irq(void)
static bool is_gp(pmu_counter_t *evt)
{
- return evt->ctr < MSR_CORE_PERF_FIXED_CTR0;
+ return evt->ctr < MSR_CORE_PERF_FIXED_CTR0 ||
+ evt->ctr >= MSR_IA32_PMC0;
}
static int event_to_global_idx(pmu_counter_t *cnt)
{
- return cnt->ctr - (is_gp(cnt) ? MSR_IA32_PERFCTR0 :
+ return cnt->ctr - (is_gp(cnt) ? gp_counter_base :
(MSR_CORE_PERF_FIXED_CTR0 - FIXED_CNT_INDEX));
}
@@ -226,7 +230,7 @@ static bool verify_counter(pmu_counter_t *cnt)
static void check_gp_counter(struct pmu_event *evt)
{
pmu_counter_t cnt = {
- .ctr = MSR_IA32_PERFCTR0,
+ .ctr = gp_counter_base,
.config = EVNTSEL_OS | EVNTSEL_USR | evt->unit_sel,
};
int i;
@@ -276,7 +280,7 @@ static void check_counters_many(void)
continue;
cnt[n].count = 0;
- cnt[n].ctr = MSR_IA32_PERFCTR0 + n;
+ cnt[n].ctr = gp_counter_base + n;
cnt[n].config = EVNTSEL_OS | EVNTSEL_USR |
gp_events[i % ARRAY_SIZE(gp_events)].unit_sel;
n++;
@@ -302,7 +306,7 @@ static void check_counter_overflow(void)
uint64_t count;
int i;
pmu_counter_t cnt = {
- .ctr = MSR_IA32_PERFCTR0,
+ .ctr = gp_counter_base,
.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */,
.count = 0,
};
@@ -319,6 +323,8 @@ static void check_counter_overflow(void)
int idx;
cnt.count = 1 - count;
+ if (gp_counter_base == MSR_IA32_PMC0)
+ cnt.count &= (1ul << eax.split.bit_width) - 1;
if (i == num_counters) {
cnt.ctr = fixed_events[0].unit_sel;
@@ -346,7 +352,7 @@ static void check_counter_overflow(void)
static void check_gp_counter_cmask(void)
{
pmu_counter_t cnt = {
- .ctr = MSR_IA32_PERFCTR0,
+ .ctr = gp_counter_base,
.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel /* instructions */,
.count = 0,
};
@@ -369,7 +375,7 @@ static void do_rdpmc_fast(void *ptr)
static void check_rdpmc(void)
{
- uint64_t val = 0x1f3456789ull;
+ uint64_t val = 0xff0123456789ull;
bool exc;
int i;
@@ -378,20 +384,23 @@ static void check_rdpmc(void)
for (i = 0; i < num_counters; i++) {
uint64_t x;
pmu_counter_t cnt = {
- .ctr = MSR_IA32_PERFCTR0 + i,
+ .ctr = gp_counter_base + i,
.idx = i
};
- /*
- * Only the low 32 bits are writable, and the value is
- * sign-extended.
- */
- x = (uint64_t)(int64_t)(int32_t)val;
+ /*
+ * Without full-width writes, only the low 32 bits are writable,
+ * and the value is sign-extended.
+ */
+ if (gp_counter_base == MSR_IA32_PERFCTR0)
+ x = (uint64_t)(int64_t)(int32_t)val;
+ else
+ x = (uint64_t)(int64_t)val;
/* Mask according to the number of supported bits */
x &= (1ull << eax.split.bit_width) - 1;
- wrmsr(MSR_IA32_PERFCTR0 + i, val);
+ wrmsr(gp_counter_base + i, val);
report(rdpmc(i) == x, "cntr-%d", i);
exc = test_for_exception(GP_VECTOR, do_rdpmc_fast, &cnt);
@@ -423,8 +432,9 @@ static void check_rdpmc(void)
static void check_running_counter_wrmsr(void)
{
uint64_t status;
+ uint64_t count;
pmu_counter_t evt = {
- .ctr = MSR_IA32_PERFCTR0,
+ .ctr = gp_counter_base,
.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[1].unit_sel,
.count = 0,
};
@@ -433,7 +443,7 @@ static void check_running_counter_wrmsr(void)
start_event(&evt);
loop();
- wrmsr(MSR_IA32_PERFCTR0, 0);
+ wrmsr(gp_counter_base, 0);
stop_event(&evt);
report(evt.count < gp_events[1].min, "cntr");
@@ -443,7 +453,13 @@ static void check_running_counter_wrmsr(void)
evt.count = 0;
start_event(&evt);
- wrmsr(MSR_IA32_PERFCTR0, -1);
+
+ count = -1;
+ if (gp_counter_base == MSR_IA32_PMC0)
+ count &= (1ul << eax.split.bit_width) - 1;
+
+ wrmsr(gp_counter_base, count);
+
loop();
stop_event(&evt);
status = rdmsr(MSR_CORE_PERF_GLOBAL_STATUS);
@@ -452,6 +468,66 @@ static void check_running_counter_wrmsr(void)
report_prefix_pop();
}
+static void check_counters(void)
+{
+ check_gp_counters();
+ check_fixed_counters();
+ check_rdpmc();
+ check_counters_many();
+ check_counter_overflow();
+ check_gp_counter_cmask();
+ check_running_counter_wrmsr();
+}
+
+static void do_unsupported_width_counter_write(void *index)
+{
+ wrmsr(MSR_IA32_PMC0 + *((int *) index), 0xffffff0123456789ull);
+}
+
+static void check_gp_counters_write_width(void)
+{
+ u64 val_64 = 0xffffff0123456789ull;
+ u64 val_32 = val_64 & ((1ul << 32) - 1);
+ u64 val_max_width = val_64 & ((1ul << eax.split.bit_width) - 1);
+ int i;
+
+ /*
+ * MSR_IA32_PERFCTRn supports 64-bit writes,
+ * but only the lowest 32 bits are valid.
+ */
+ for (i = 0; i < num_counters; i++) {
+ wrmsr(MSR_IA32_PERFCTR0 + i, val_32);
+ assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
+ assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
+
+ wrmsr(MSR_IA32_PERFCTR0 + i, val_max_width);
+ assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
+ assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
+
+ wrmsr(MSR_IA32_PERFCTR0 + i, val_64);
+ assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
+ assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
+ }
+
+ /*
+ * MSR_IA32_PMCn supports writing values up to GP counter width,
+ * and only the lowest bits of GP counter width are valid.
+ */
+ for (i = 0; i < num_counters; i++) {
+ wrmsr(MSR_IA32_PMC0 + i, val_32);
+ assert(rdmsr(MSR_IA32_PMC0 + i) == val_32);
+ assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_32);
+
+ wrmsr(MSR_IA32_PMC0 + i, val_max_width);
+ assert(rdmsr(MSR_IA32_PMC0 + i) == val_max_width);
+ assert(rdmsr(MSR_IA32_PERFCTR0 + i) == val_max_width);
+
+ report(test_for_exception(GP_VECTOR,
+ do_unsupported_width_counter_write, &i),
+ "writing unsupported width to MSR_IA32_PMC%d raises #GP", i);
+ }
+}
+
int main(int ac, char **av)
{
struct cpuid id = cpuid(10);
@@ -480,13 +556,14 @@ int main(int ac, char **av)
apic_write(APIC_LVTPC, PC_VECTOR);
- check_gp_counters();
- check_fixed_counters();
- check_rdpmc();
- check_counters_many();
- check_counter_overflow();
- check_gp_counter_cmask();
- check_running_counter_wrmsr();
+ check_counters();
+
+ if (rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) {
+ gp_counter_base = MSR_IA32_PMC0;
+ report_prefix_push("full-width writes");
+ check_counters();
+ check_gp_counters_write_width();
+ }
return report_summary();
}
--
2.21.1
I would just make a couple of small changes to the MSR validity checks.
On 08/05/20 10:32, Like Xu wrote:
> return 0;
> + case MSR_IA32_PERF_CAPABILITIES:
> + *data = vcpu->arch.perf_capabilities;
> + return 0;
This should be:
	if (!msr_info->host_initiated &&
	    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
		return 1;
> default:
> - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
> + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
> + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
> u64 val = pmc_read_counter(pmc);
> *data = val & pmu->counter_bitmask[KVM_PMC_GP];
> return 0;
> @@ -258,9 +277,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 0;
> }
> break;
> + case MSR_IA32_PERF_CAPABILITIES:
> + if (msr_info->host_initiated &&
> + !(data & ~vmx_get_perf_capabilities())) {
Likewise:
	if (!msr_info->host_initiated)
		return 1;
	if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ?
	    (data & ~vmx_get_perf_capabilities()) : data)
		return 1;
Otherwise looks good, I'm going to queue this.
Paolo
From: Wei Wang <[email protected]>
Change kvm_pmu_get_msr() to take a struct msr_data pointer, so that the
host_initiated field of the struct can be used by the get_msr callbacks.
This also makes the API consistent with kvm_pmu_set_msr(). No functional
changes.
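For reference, struct msr_data (paraphrased from
arch/x86/include/asm/kvm_host.h) carries everything a get/set handler
needs:

struct msr_data {
	bool host_initiated;	/* access came from userspace, not the guest */
	u32 index;		/* MSR number */
	u64 data;		/* value to write, or value read back */
};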
Signed-off-by: Wei Wang <[email protected]>
---
arch/x86/kvm/pmu.c | 4 ++--
arch/x86/kvm/pmu.h | 4 ++--
arch/x86/kvm/svm/pmu.c | 7 ++++---
arch/x86/kvm/vmx/pmu_intel.c | 19 +++++++++++--------
arch/x86/kvm/x86.c | 4 ++--
5 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a5078841bdac..b86346903f2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -397,9 +397,9 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
__set_bit(pmc->idx, pmu->pmc_in_use);
}
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr, data);
+ return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a6c78a797cb1..ab85eed8a6cc 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -32,7 +32,7 @@ struct kvm_pmu_ops {
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void (*refresh)(struct kvm_vcpu *vcpu);
void (*init)(struct kvm_vcpu *vcpu);
@@ -147,7 +147,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
void kvm_pmu_reset(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index ce0b10fe5e2b..035da07500e8 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -215,21 +215,22 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
-static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
/* MSR_PERFCTRn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
- *data = pmc_read_counter(pmc);
+ msr_info->data = pmc_read_counter(pmc);
return 0;
}
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- *data = pmc->eventsel;
+ msr_info->data = pmc->eventsel;
return 0;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7c857737b438..e1a303fefc16 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -184,35 +184,38 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
-static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- *data = pmu->fixed_ctr_ctrl;
+ msr_info->data = pmu->fixed_ctr_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_STATUS:
- *data = pmu->global_status;
+ msr_info->data = pmu->global_status;
return 0;
case MSR_CORE_PERF_GLOBAL_CTRL:
- *data = pmu->global_ctrl;
+ msr_info->data = pmu->global_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- *data = pmu->global_ovf_ctrl;
+ msr_info->data = pmu->global_ovf_ctrl;
return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
u64 val = pmc_read_counter(pmc);
- *data = val & pmu->counter_bitmask[KVM_PMC_GP];
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
u64 val = pmc_read_counter(pmc);
- *data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_FIXED];
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
- *data = pmc->eventsel;
+ msr_info->data = pmc->eventsel;
return 0;
}
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e46027f405a..23fe511c6ba0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3106,7 +3106,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ return kvm_pmu_get_msr(vcpu, msr_info);
msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
@@ -3268,7 +3268,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ return kvm_pmu_get_msr(vcpu, msr_info);
if (!ignore_msrs) {
vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
msr_info->index);
--
2.21.3
Intel CPUs have a new alternative MSR range (starting at MSR_IA32_PMC0)
for GP counters that allows writing the full counter width. Enable this
range via a new capability bit (IA32_PERF_CAPABILITIES.FW_WRITE, bit 13).
The guest queries CPUID to get the counter width and sign-extends the
counter values as needed. The traditional MSRs always limit writes to
32 bits, even though the counter itself is wider (usually 48 bits).
When the new capability is set, use the alternative range, which does
not have this restriction. This slightly lowers the overhead of perf
stat, because fewer interrupts are needed to accumulate the counter value.
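One detail worth calling out: the alias range sits exactly 0x400 above the
legacy range (MSR_IA32_PMC0 is 0x4c1, MSR_IA32_PERFCTR0 is 0xc1), so a
single bit test is enough to tell which range a counter MSR belongs to.
A standalone sketch of that check (illustration only, not part of the
patch):

#include <assert.h>
#include <stdint.h>

#define MSR_IA32_PERFCTR0	0x000000c1	/* legacy GP counter 0 */
#define MSR_IA32_PMC0		0x000004c1	/* full-width alias of counter 0 */
#define MSR_PMC_FULL_WIDTH_BIT	(MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)	/* 0x400 */

int main(void)
{
	/* The same bit distinguishes the two ranges for every counter index n. */
	for (uint32_t n = 0; n < 8; n++) {
		assert(!((MSR_IA32_PERFCTR0 + n) & MSR_PMC_FULL_WIDTH_BIT));
		assert((MSR_IA32_PMC0 + n) & MSR_PMC_FULL_WIDTH_BIT);
	}
	return 0;
}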
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx/capabilities.h | 11 ++++++++
arch/x86/kvm/vmx/pmu_intel.c | 46 ++++++++++++++++++++++++++++++---
arch/x86/kvm/vmx/vmx.c | 3 +++
arch/x86/kvm/x86.c | 1 +
6 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35a915787559..8c3ae83f63d9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -599,6 +599,7 @@ struct kvm_vcpu_arch {
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
+ u64 perf_capabilities;
/*
* Paging state of the vcpu
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 35845704cf57..411ce1b58341 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -294,7 +294,7 @@ void kvm_set_cpu_caps(void)
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8903475f751e..4bbd8b448d22 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -18,6 +18,8 @@ extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
#define PT_MODE_HOST_GUEST 1
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+
struct nested_vmx_msrs {
/*
* We only store the "true" versions of the VMX capability MSRs. We
@@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void)
return pt_mode == PT_MODE_HOST_GUEST;
}
+static inline u64 vmx_get_perf_capabilities(void)
+{
+ /*
+ * Since counters are virtualized, KVM would support full
+ * width counting unconditionally, even if the host lacks it.
+ */
+ return PMU_CAP_FW_WRITES;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e1a303fefc16..f66a3e2e42cd 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -18,6 +18,8 @@
#include "nested.h"
#include "pmu.h"
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -150,6 +152,14 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return false;
+
+ return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -162,10 +172,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM);
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) ||
+ (fw_writes_is_enabled(vcpu) &&
+ get_gp_pmc(pmu, msr, MSR_IA32_PMC0));
break;
}
@@ -203,8 +218,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_GP];
@@ -261,9 +283,22 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ?
+ (data & ~vmx_get_perf_capabilities()) : data)
+ return 1;
+ vcpu->arch.perf_capabilities = data;
+ return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (!msr_info->host_initiated)
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+ (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+ return 1;
+ if (!msr_info->host_initiated &&
+ !(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
if (pmc->perf_event)
@@ -303,6 +338,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -315,6 +351,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bc5e5cf1d4cc..ee94d94e855a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1789,6 +1789,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = vmx_get_perf_capabilities();
+ return 0;
default:
return 1;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 23fe511c6ba0..b577fadffb1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1323,6 +1323,7 @@ static const u32 msr_based_features_all[] = {
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
};
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
--
2.21.3