New Intel hardware introduces Precise Event-Based Sampling (PEBS)
extensions that output the PEBS record to the Intel PT stream instead
of the DS area. The PEBS record is packaged in a dedicated format when
written to Intel PT. This patch set enables PEBS functionality in a
KVM guest by directing the PEBS output to Intel PT.
Patch 1 adds an MSR "base" parameter, used to look up the kvm_pmc
structure from the new MSR_RELOAD_FIXED_CTRx registers in the same way
get_gp_pmc() does. Patch 2 implements read/write emulation for the
PEBS MSRs. Patches 5/6/7 expose the PEBS-related capabilities (CPUID
bits and MSRs) to the KVM guest. Patch 3 introduces a "pebs" parameter
used when allocating a perf event counter from the host perf event
framework. On previous platforms the counter backing a PEBS event had
to be disabled before VM-entry; patch 4 skips that operation when PEBS
is enabled in the KVM guest. Patch 8 changes native code so that
aux_event is only required for a non-kernel event (the counter
allocated by KVM is a kernel event).
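For orientation, a rough sketch of the guest-visible flow this series
emulates, written as Linux-style MSR programming (the counter choice,
period and 48-bit counter width are made-up example values; the MSR
and bit names are the ones introduced by this series):

/* Hypothetical guest code: arm fixed counter 0 for PEBS-via-PT. */
u64 period = 100000;

/* Reload value the hardware re-arms the counter with after each
 * PEBS record is written (assuming 48-bit counters).
 */
wrmsrl(MSR_RELOAD_FIXED_CTR0, (-period) & ((1ULL << 48) - 1));

/* Route PEBS output to Intel PT and enable PEBS on fixed counter 0
 * (bit INTEL_PMC_IDX_FIXED + 0).
 */
wrmsrl(MSR_IA32_PEBS_ENABLE, MSR_IA32_PEBS_OUTPUT_PT |
			     BIT_ULL(INTEL_PMC_IDX_FIXED));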
Luwei Kang (8):
KVM: x86: Add base address parameter for get_fixed_pmc function
KVM: x86: PEBS output to Intel PT MSRs emulation
KVM: x86: Allocate performance counter for PEBS event
KVM: x86: Avoid clearing the PEBS counter when PEBS is enabled in guest
KVM: X86: Expose PDCM cpuid to guest
KVM: X86: MSR_IA32_PERF_CAPABILITIES MSR emulation
KVM: x86: Expose PEBS feature to guest
perf/x86: Add event owner check when PEBS output to Intel PT
arch/x86/events/core.c | 3 +-
arch/x86/events/intel/core.c | 19 ++++++----
arch/x86/events/perf_event.h | 2 +-
arch/x86/include/asm/kvm_host.h | 7 ++++
arch/x86/include/asm/msr-index.h | 9 +++++
arch/x86/include/asm/perf_event.h | 5 ++-
arch/x86/kvm/cpuid.c | 3 +-
arch/x86/kvm/pmu.c | 23 ++++++++----
arch/x86/kvm/pmu.h | 10 ++---
arch/x86/kvm/pmu_amd.c | 2 +-
arch/x86/kvm/svm.c | 12 ++++++
arch/x86/kvm/vmx/capabilities.h | 25 +++++++++++++
arch/x86/kvm/vmx/pmu_intel.c | 79 +++++++++++++++++++++++++++++++++++----
arch/x86/kvm/vmx/vmx.c | 19 +++++++++-
arch/x86/kvm/x86.c | 22 ++++++++---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 2 +-
17 files changed, 201 insertions(+), 42 deletions(-)
--
1.8.3.1
PEBS output to Intel PT introduces new MSRs (MSR_RELOAD_FIXED_CTRx)
for the fixed-function counters, which hold the preset value that is
auto-reloaded after a PEBS record is written out.
Introduce a base MSR address parameter so that this function can also
look up the KVM performance monitor counter structure from the
MSR_RELOAD_FIXED_CTRx registers.
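For illustration only (not part of the diff): after this change the
same helper serves both MSR ranges, e.g.:

/* Sketch: one helper, two fixed-counter MSR ranges.
 * MSR_RELOAD_FIXED_CTR0 is added by a later patch in this series.
 */
struct kvm_pmc *pmc;

pmc = get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
if (!pmc)
	pmc = get_fixed_pmc(pmu, msr, MSR_RELOAD_FIXED_CTR0);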
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/kvm/pmu.h | 5 ++---
arch/x86/kvm/vmx/pmu_intel.c | 14 +++++++++-----
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 58265f7..c62a1ff 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -93,10 +93,9 @@ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
}
/* returns fixed PMC with the specified MSR */
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr,
+ int base)
{
- int base = MSR_CORE_PERF_FIXED_CTR0;
-
if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
return &pmu->fixed_counters[msr - base];
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 3e9c059..2a485b5 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -41,7 +41,8 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
struct kvm_pmc *pmc;
- pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i,
+ MSR_CORE_PERF_FIXED_CTR0);
if (old_ctrl == new_ctrl)
continue;
@@ -106,7 +107,8 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
else {
u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0,
+ MSR_CORE_PERF_FIXED_CTR0);
}
}
@@ -155,7 +157,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
break;
}
@@ -185,7 +187,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
- } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ } else if ((pmc = get_fixed_pmc(pmu, msr,
+ MSR_CORE_PERF_FIXED_CTR0))) {
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
return 0;
@@ -243,7 +246,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
pmc->counter = (s32)data;
return 0;
- } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ } else if ((pmc = get_fixed_pmc(pmu, msr,
+ MSR_CORE_PERF_FIXED_CTR0))) {
pmc->counter = data;
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
--
1.8.3.1
New Intel hardware introduces a mechanism to direct PEBS records into
the Intel PT buffer, which can be used to enable PEBS in a KVM guest.
This patch implements read and write emulation for these registers
when PEBS is supported in the KVM guest.
KVM needs to reprogram the counters whenever the value of one of these
MSRs changes, to make sure the new value takes effect in hardware.
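The write-side check added to intel_pmu_set_msr() below can be read as
the following predicate (a restatement for clarity; pebs_enable_valid()
is not a real helper in this patch):

static bool pebs_enable_valid(struct kvm_pmu *pmu, u64 data)
{
	/* Accept the value only if it disables PEBS entirely, or sets
	 * no reserved bits and routes the PEBS output to Intel PT.
	 */
	if (!data)
		return true;

	return !(data & pmu->pebs_enable_mask) &&
	       (data & MSR_IA32_PEBS_OUTPUT_MASK) == MSR_IA32_PEBS_OUTPUT_PT;
}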
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 +++
arch/x86/include/asm/msr-index.h | 6 ++++
arch/x86/kvm/vmx/capabilities.h | 15 ++++++++++
arch/x86/kvm/vmx/pmu_intel.c | 63 ++++++++++++++++++++++++++++++++++++++--
4 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 50eb430..ed01936 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -448,6 +448,7 @@ struct kvm_pmc {
enum pmc_type type;
u8 idx;
u64 counter;
+ u64 reload_cnt;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
@@ -465,7 +466,10 @@ struct kvm_pmu {
u64 global_ctrl_mask;
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
+ u64 pebs_enable;
+ u64 pebs_enable_mask;
u8 version;
+ bool pebs_via_pt; /* PEBS output to Intel PT */
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
struct irq_work irq_work;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 20ce682..d22f8d9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -131,9 +131,13 @@
#define LBR_INFO_ABORT BIT_ULL(61)
#define LBR_INFO_CYCLES 0xffff
+#define MSR_IA32_PEBS_PMI_AFTER_REC BIT_ULL(60)
+#define MSR_IA32_PEBS_OUTPUT_PT BIT_ULL(61)
+#define MSR_IA32_PEBS_OUTPUT_MASK (3ULL << 61)
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT BIT_ULL(16)
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
@@ -665,6 +669,8 @@
#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_BIT 12
+#define MSR_IA32_MISC_ENABLE_PEBS (1ULL << MSR_IA32_MISC_ENABLE_PEBS_BIT)
#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 7aa6971..fc861d4 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -348,4 +348,19 @@ static inline bool cpu_has_vmx_intel_pt(void)
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
+static inline bool cpu_has_vmx_pebs_output_pt(void)
+{
+ u64 misc, perf_cap;
+
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc);
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+ /*
+ * Support Processor Event Based Sampling (PEBS) and
+ * PEBS output to Intel PT.
+ */
+ return (!(misc & MSR_IA32_MISC_ENABLE_PEBS) &&
+ (perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT));
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 2a485b5..3f723a3 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -12,6 +12,7 @@
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
+#include "capabilities.h"
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
@@ -65,6 +66,20 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
+static void pebs_enable_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 mask = (BIT_ULL(pmu->nr_arch_gp_counters) - 1) |
+ ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) <<
+ INTEL_PMC_IDX_FIXED);
+ u64 diff = (pmu->pebs_enable ^ data) & mask;
+
+ pmu->pebs_enable = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_counter(pmu, bit);
+}
+
static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
u8 event_select,
u8 unit_mask)
@@ -154,10 +169,15 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_IA32_PEBS_ENABLE:
+ ret = pmu->pebs_via_pt;
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
+ get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0) ||
+ get_gp_pmc(pmu, msr, MSR_RELOAD_PMC0) ||
+ get_fixed_pmc(pmu, msr, MSR_RELOAD_FIXED_CTR0);
break;
}
@@ -182,6 +202,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_IA32_PEBS_ENABLE:
+ *data = pmu->pebs_enable;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
u64 val = pmc_read_counter(pmc);
@@ -195,6 +218,11 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
*data = pmc->eventsel;
return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_RELOAD_PMC0)) ||
+ (pmc = get_fixed_pmc(pmu, msr,
+ MSR_RELOAD_FIXED_CTR0))) {
+ *data = pmc->reload_cnt;
+ return 0;
}
}
@@ -239,6 +267,16 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (pmu->pebs_enable == data)
+ return 0;
+ if (!data || (!(data & pmu->pebs_enable_mask) &&
+ (data & MSR_IA32_PEBS_OUTPUT_MASK) ==
+ MSR_IA32_PEBS_OUTPUT_PT)) {
+ pebs_enable_changed(pmu, data);
+ return 0;
+ }
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
if (msr_info->host_initiated)
@@ -257,6 +295,18 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_RELOAD_PMC0)) ||
+ (pmc = get_fixed_pmc(pmu, msr,
+ MSR_RELOAD_FIXED_CTR0))) {
+ if (data == pmc->reload_cnt)
+ return 0;
+ if (!(data & ~pmc_bitmask(pmc))) {
+ int pmc_idx = pmc_is_fixed(pmc) ? pmc->idx +
+ INTEL_PMC_IDX_FIXED : pmc->idx;
+ pmc->reload_cnt = data;
+ reprogram_counter(pmu, pmc_idx);
+ return 0;
+ }
}
}
@@ -308,14 +358,23 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+
pmu->global_ctrl_mask = ~pmu->global_ctrl;
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
- if (kvm_x86_ops->pt_supported())
+ if (kvm_x86_ops->pt_supported()) {
pmu->global_ovf_ctrl_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
+ if (cpu_has_vmx_pebs_output_pt()) {
+ pmu->pebs_via_pt = true;
+ pmu->pebs_enable_mask = ~(pmu->global_ctrl |
+ MSR_IA32_PEBS_PMI_AFTER_REC |
+ MSR_IA32_PEBS_OUTPUT_MASK);
+ }
+ }
+
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
--
1.8.3.1
This patch introduces a parameter to avoid clearing the PEBS event
counter while running in the guest. When PEBS is enabled in the guest
via PEBS output to Intel PT, the performance counter used for the PEBS
event can stay enabled across VM-entry.
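In other words (a sketch of the resulting intel_guest_get_msrs()
behaviour, using the variables from the hunk below):

u64 guest_ctrl = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;

if (!pebs)	/* old rule: PEBS counters must be off in the guest */
	guest_ctrl &= ~cpuc->pebs_enabled;
/* pebs == true: the PEBS-via-PT counters stay enabled across VM-entry */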
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/events/intel/core.c | 19 +++++++++++--------
arch/x86/events/perf_event.h | 2 +-
arch/x86/include/asm/perf_event.h | 5 +++--
arch/x86/kvm/vmx/vmx.c | 3 ++-
4 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fcef678..1fcc9fc 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3323,16 +3323,16 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, bool pebs)
{
if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
+ return x86_pmu.guest_get_msrs(nr, pebs);
*nr = 0;
return NULL;
}
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, bool pebs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
@@ -3340,10 +3340,13 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
- if (x86_pmu.flags & PMU_FL_PEBS_ALL)
- arr[0].guest &= ~cpuc->pebs_enabled;
- else
- arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+ if (!pebs) {
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ arr[0].guest &= ~cpuc->pebs_enabled;
+ else
+ arr[0].guest &=
+ ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+ }
*nr = 1;
if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
@@ -3364,7 +3367,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
return arr;
}
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, bool pebs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ecacfbf..57058fe 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -696,7 +696,7 @@ struct x86_pmu {
/*
* Intel host/guest support (KVM)
*/
- struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, bool pebs);
/*
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ee26e92..e29075a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -322,12 +322,13 @@ struct perf_guest_switch_msr {
u64 host, guest;
};
-extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, bool pebs);
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr,
+ bool pebs)
{
*nr = 0;
return NULL;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e7970a2..170afde 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6428,7 +6428,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
- msrs = perf_guest_get_msrs(&nr_msrs);
+ msrs = perf_guest_get_msrs(&nr_msrs,
+ vcpu_to_pmu(&vmx->vcpu)->pebs_enable);
if (!msrs)
return;
--
1.8.3.1
Expose to the KVM guest the capability bits related to enabling PEBS,
in particular the PEBS-via-PT feature.
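The read path below passes through only the PEBS-related bits of the
host IA32_PERF_CAPABILITIES value; everything else reads as zero. The
effective mask is (GUEST_PERF_CAP_MASK is just an illustrative name):

#define GUEST_PERF_CAP_MASK	(MSR_IA32_PERF_CAP_PEBS_TRAP |		\
				 MSR_IA32_PERF_CAP_PEBS_ARCH_REG |	\
				 MSR_IA32_PERF_CAP_PEBS_REC_FMT |	\
				 MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT)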
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/asm/msr-index.h | 3 +++
arch/x86/kvm/vmx/vmx.c | 14 ++++++++++++++
3 files changed, 18 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a987ae1..24a0ab9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -569,6 +569,7 @@ struct kvm_vcpu_arch {
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
+ u64 ia32_perf_capabilities;
/*
* Paging state of the vcpu
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d22f8d9..75c09e4 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,9 @@
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAP_PEBS_TRAP BIT_ULL(6)
+#define MSR_IA32_PERF_CAP_PEBS_ARCH_REG BIT_ULL(7)
+#define MSR_IA32_PERF_CAP_PEBS_REC_FMT (0xfULL << 8)
#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT BIT_ULL(16)
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6c29a57..5c4dd05 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1828,6 +1828,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vcpu->arch.ia32_xss;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!vmx_pdcm_supported() || !vmx_pebs_supported())
+ return 1;
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, msr_info->data);
+ msr_info->data = msr_info->data &
+ (MSR_IA32_PERF_CAP_PEBS_TRAP |
+ MSR_IA32_PERF_CAP_PEBS_ARCH_REG |
+ MSR_IA32_PERF_CAP_PEBS_REC_FMT |
+ MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT);
+ break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
@@ -2082,6 +2092,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!vmx_pdcm_supported() || !vmx_pebs_supported())
+ return 1;
+ break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
--
1.8.3.1
For PEBS output to Intel PT, an Intel PT event should be the group
leader of a PEBS counter event on the host. For Intel PT
virtualization in a KVM guest, however, the PT facilities are passed
through to the guest and no PT event is allocated from the host perf
event framework. This differs from PMU virtualization.
A new Intel hardware feature enables PEBS in a KVM guest by writing
PEBS records to the Intel PT buffer. KVM therefore needs to allocate
an event counter for this PEBS event without an Intel PT event leader.
This patch adds an event owner check so that, for PEBS-output-to-PT
events, only a non-kernel event needs a (PT) group leader.
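The resulting scheduling rule in collect_events() reads (sketch):

/* A PEBS-via-PT event still needs a PT group leader, except when the
 * event is kernel-owned: KVM's counters have no PT leader because
 * guest PT is passed through rather than modelled as a host event.
 */
if (is_pebs_pt(leader) && !leader->aux_event && !is_kernel_event(leader))
	return -EINVAL;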
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/events/core.c | 3 ++-
include/linux/perf_event.h | 1 +
kernel/events/core.c | 2 +-
3 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7b21455..214041a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1014,7 +1014,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
* away, the group was broken down and this singleton event
* can't schedule any more.
*/
- if (is_pebs_pt(leader) && !leader->aux_event)
+ if (is_pebs_pt(leader) && !leader->aux_event &&
+ !is_kernel_event(leader))
return -EINVAL;
/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61448c1..22ef4b0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -928,6 +928,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
+extern bool is_kernel_event(struct perf_event *event);
struct perf_sample_data {
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9ec0b0b..00f943b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -166,7 +166,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#define TASK_TOMBSTONE ((void *)-1L)
-static bool is_kernel_event(struct perf_event *event)
+bool is_kernel_event(struct perf_event *event)
{
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
--
1.8.3.1
PDCM (Perfmon and Debug Capability) indicates the processor
supports the performance and debug feature indication
MSR IA32_PERF_CAPABILITIES.
Enabling PEBS in a KVM guest depends on PEBS via PT, and
PEBS via PT is detected through IA32_PERF_CAPABILITIES[Bit16].
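The dependency chain between the helpers added in this series is
therefore (sketch):

/*
 * vmx_pdcm_supported()         = boot_cpu_has(X86_FEATURE_PDCM) &&
 *                                vmx_pebs_supported();
 * vmx_pebs_supported()         = cpu_has_vmx_pebs_output_pt() &&
 *                                pt_mode == PT_MODE_HOST_GUEST;
 * cpu_has_vmx_pebs_output_pt() = PEBS not disabled in
 *                                MSR_IA32_MISC_ENABLE &&
 *                                IA32_PERF_CAPABILITIES[16] set;
 */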
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/cpuid.c | 3 ++-
arch/x86/kvm/svm.c | 6 ++++++
arch/x86/kvm/vmx/capabilities.h | 10 ++++++++++
arch/x86/kvm/vmx/vmx.c | 1 +
5 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ed01936..a987ae1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1126,6 +1126,7 @@ struct kvm_x86_ops {
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
bool (*pt_supported)(void);
+ bool (*pdcm_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9c5029c..ab2906d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -442,6 +442,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+ unsigned f_pdcm = kvm_x86_ops->pdcm_supported() ? F(PDCM) : 0;
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -470,7 +471,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update */ | f_pdcm |
F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f8ecb6d..7e0a7b3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5975,6 +5975,11 @@ static bool svm_pt_supported(void)
return false;
}
+static bool svm_pdcm_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -7272,6 +7277,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+ .pdcm_supported = svm_pdcm_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index fc861d4..2f502f1 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -363,4 +363,14 @@ static inline bool cpu_has_vmx_pebs_output_pt(void)
(perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT));
}
+static inline bool vmx_pebs_supported(void)
+{
+ return (cpu_has_vmx_pebs_output_pt() && pt_mode == PT_MODE_HOST_GUEST);
+}
+
+static inline bool vmx_pdcm_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PDCM) && vmx_pebs_supported();
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 170afde..6c29a57 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7865,6 +7865,7 @@ static __exit void hardware_unsetup(void)
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+ .pdcm_supported = vmx_pdcm_supported,
.request_immediate_exit = vmx_request_immediate_exit,
--
1.8.3.1
This patch adds a new parameter, "pebs", which makes the host PMU
framework allocate a performance counter for the guest PEBS event.
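Restating the core of the pmc_reprogram_counter() change below in
isolation, the flag steers three perf_event_attr fields:

/* How the "pebs" flag shapes the host perf event. */
attr.precise_ip    = pebs ? 1 : 0;  /* request a PEBS-capable counter */
attr.aux_output    = pebs ? 1 : 0;  /* PEBS records go to the PT buffer */
attr.sample_period = pebs ? (-pmc->reload_cnt) & pmc_bitmask(pmc)
			  : (-pmc->counter) & pmc_bitmask(pmc);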
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/kvm/pmu.c | 23 +++++++++++++++--------
arch/x86/kvm/pmu.h | 5 +++--
arch/x86/kvm/pmu_amd.c | 2 +-
arch/x86/kvm/vmx/pmu_intel.c | 10 +++++-----
4 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 46875bb..5088d1c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -99,7 +99,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
unsigned config, bool exclude_user,
bool exclude_kernel, bool intr,
- bool in_tx, bool in_tx_cp)
+ bool in_tx, bool in_tx_cp, bool pebs)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_user = exclude_user,
.exclude_kernel = exclude_kernel,
.config = config,
+ .precise_ip = pebs ? 1 : 0,
+ .aux_output = pebs ? 1 : 0,
};
- attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+ attr.sample_period = pebs ? (-pmc->reload_cnt) & pmc_bitmask(pmc) :
+ (-pmc->counter) & pmc_bitmask(pmc);
if (in_tx)
attr.config |= HSW_IN_TX;
@@ -140,7 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
}
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs)
{
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
@@ -198,11 +201,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
eventsel & ARCH_PERFMON_EVENTSEL_INT,
(eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ (eventsel & HSW_IN_TX_CHECKPOINTED),
+ pebs);
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx, bool pebs)
{
unsigned en_field = ctrl & 0x3;
bool pmi = ctrl & 0x8;
@@ -228,7 +232,8 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
kvm_x86_ops->pmu_ops->find_fixed_event(idx),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmi, false, false,
+ pebs);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
@@ -240,12 +245,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
return;
if (pmc_is_gp(pmc))
- reprogram_gp_counter(pmc, pmc->eventsel);
+ reprogram_gp_counter(pmc, pmc->eventsel,
+ (pmu->pebs_enable & BIT_ULL(pmc_idx)));
else {
int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
- reprogram_fixed_counter(pmc, ctrl, idx);
+ reprogram_fixed_counter(pmc, ctrl, idx,
+ (pmu->pebs_enable & BIT_ULL(pmc_idx)));
}
}
EXPORT_SYMBOL_GPL(reprogram_counter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index c62a1ff..0c59a15 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -102,8 +102,9 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr,
return NULL;
}
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx,
+ bool pebs);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index c838838..7b3e307 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -248,7 +248,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == pmc->eventsel)
return 0;
if (!(data & pmu->reserved_bits)) {
- reprogram_gp_counter(pmc, data);
+ reprogram_gp_counter(pmc, data, false);
return 0;
}
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 3f723a3..15b5f05 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -48,7 +48,8 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
if (old_ctrl == new_ctrl)
continue;
- reprogram_fixed_counter(pmc, new_ctrl, i);
+ reprogram_fixed_counter(pmc, new_ctrl, i,
+ (pmu->pebs_enable & BIT_ULL(i + INTEL_PMC_IDX_FIXED)));
}
pmu->fixed_ctr_ctrl = data;
@@ -292,7 +293,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == pmc->eventsel)
return 0;
if (!(data & pmu->reserved_bits)) {
- reprogram_gp_counter(pmc, data);
+ reprogram_gp_counter(pmc, data,
+ (pmu->pebs_enable & BIT_ULL(pmc->idx)));
return 0;
}
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_RELOAD_PMC0)) ||
@@ -301,10 +303,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == pmc->reload_cnt)
return 0;
if (!(data & ~pmc_bitmask(pmc))) {
- int pmc_idx = pmc_is_fixed(pmc) ? pmc->idx +
- INTEL_PMC_IDX_FIXED : pmc->idx;
pmc->reload_cnt = data;
- reprogram_counter(pmu, pmc_idx);
+ reprogram_counter(pmu, pmc->idx);
return 0;
}
}
--
1.8.3.1
Expose PEBS feature to guest by IA32_MISC_ENABLE[bit12].
IA32_MISC_ENABLE[bit12] is Processor Event Based Sampling (PEBS)
Unavailable (RO) flag:
1 = PEBS is not supported; 0 = PEBS is supported.
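Note the inverted polarity of the bit. The set/get paths below force
it from host capability regardless of what the guest writes (sketch of
the kvm_set_msr_common() logic):

if (kvm_x86_ops->pebs_supported())
	data &= ~MSR_IA32_MISC_ENABLE_PEBS;	/* 0 => PEBS available */
else
	data |= MSR_IA32_MISC_ENABLE_PEBS;	/* 1 => PEBS unavailable */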
Signed-off-by: Luwei Kang <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/svm.c | 6 ++++++
arch/x86/kvm/vmx/vmx.c | 1 +
arch/x86/kvm/x86.c | 22 +++++++++++++++++-----
4 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24a0ab9..76f5fa5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1127,6 +1127,7 @@ struct kvm_x86_ops {
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
bool (*pt_supported)(void);
+ bool (*pebs_supported)(void);
bool (*pdcm_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7e0a7b3..3a1bbb3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5975,6 +5975,11 @@ static bool svm_pt_supported(void)
return false;
}
+static bool svm_pebs_supported(void)
+{
+ return false;
+}
+
static bool svm_pdcm_supported(void)
{
return false;
@@ -7277,6 +7282,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+ .pebs_supported = svm_pebs_supported,
.pdcm_supported = svm_pdcm_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5c4dd05..3c370a3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7879,6 +7879,7 @@ static __exit void hardware_unsetup(void)
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+ .pebs_supported = vmx_pebs_supported,
.pdcm_supported = vmx_pdcm_supported,
.request_immediate_exit = vmx_request_immediate_exit,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 661e2bf..5f59073 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2591,6 +2591,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
bool pr = false;
+ bool update_cpuid = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -2671,11 +2672,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
- vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid(vcpu);
- } else {
- vcpu->arch.ia32_misc_enable_msr = data;
+ update_cpuid = true;
}
+
+ if (kvm_x86_ops->pebs_supported())
+ data &= ~MSR_IA32_MISC_ENABLE_PEBS;
+ else
+ data |= MSR_IA32_MISC_ENABLE_PEBS;
+
+ vcpu->arch.ia32_misc_enable_msr = data;
+ if (update_cpuid)
+ kvm_update_cpuid(vcpu);
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
@@ -2971,7 +2978,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
- msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ if (kvm_x86_ops->pebs_supported())
+ msr_info->data = (vcpu->arch.ia32_misc_enable_msr &
+ ~MSR_IA32_MISC_ENABLE_PEBS);
+ else
+ msr_info->data = (vcpu->arch.ia32_misc_enable_msr |
+ MSR_IA32_MISC_ENABLE_PEBS);
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
--
1.8.3.1
On Sun, Oct 27, 2019 at 07:11:13PM -0400, Luwei Kang wrote:
> This patch introduces a parameter to avoid clearing the PEBS event
> counter while running in the guest. When PEBS is enabled in the guest
> via PEBS output to Intel PT, the performance counter used for the PEBS
> event can stay enabled across VM-entry.
Please write coherent Changelogs. I have no clue what you're trying to
say.
Also, maybe this atrocious coding style is accepted in KVM, but I'm not
having it. Pretty much all your linebreaks and subsequent indenting are
against style.
On Sun, Oct 27, 2019 at 07:11:16PM -0400, Luwei Kang wrote:
> Expose PEBS feature to guest by IA32_MISC_ENABLE[bit12].
> IA32_MISC_ENABLE[bit12] is Processor Event Based Sampling (PEBS)
> Unavailable (RO) flag:
> 1 = PEBS is not supported; 0 = PEBS is supported.
Why does it make sense to expose this on SVM?
> Signed-off-by: Luwei Kang <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/svm.c | 6 ++++++
> arch/x86/kvm/vmx/vmx.c | 1 +
> arch/x86/kvm/x86.c | 22 +++++++++++++++++-----
> 4 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 24a0ab9..76f5fa5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1127,6 +1127,7 @@ struct kvm_x86_ops {
> bool (*xsaves_supported)(void);
> bool (*umip_emulated)(void);
> bool (*pt_supported)(void);
> + bool (*pebs_supported)(void);
> bool (*pdcm_supported)(void);
>
> int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 7e0a7b3..3a1bbb3 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -5975,6 +5975,11 @@ static bool svm_pt_supported(void)
> return false;
> }
>
> +static bool svm_pebs_supported(void)
> +{
> + return false;
> +}
> +
> static bool svm_pdcm_supported(void)
> {
> return false;
> @@ -7277,6 +7282,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
> .xsaves_supported = svm_xsaves_supported,
> .umip_emulated = svm_umip_emulated,
> .pt_supported = svm_pt_supported,
> + .pebs_supported = svm_pebs_supported,
> .pdcm_supported = svm_pdcm_supported,
>
> .set_supported_cpuid = svm_set_supported_cpuid,
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 5c4dd05..3c370a3 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7879,6 +7879,7 @@ static __exit void hardware_unsetup(void)
> .xsaves_supported = vmx_xsaves_supported,
> .umip_emulated = vmx_umip_emulated,
> .pt_supported = vmx_pt_supported,
> + .pebs_supported = vmx_pebs_supported,
> .pdcm_supported = vmx_pdcm_supported,
>
> .request_immediate_exit = vmx_request_immediate_exit,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 661e2bf..5f59073 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2591,6 +2591,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> {
> bool pr = false;
> + bool update_cpuid = false;
> u32 msr = msr_info->index;
> u64 data = msr_info->data;
>
> @@ -2671,11 +2672,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
> if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
> return 1;
> - vcpu->arch.ia32_misc_enable_msr = data;
> - kvm_update_cpuid(vcpu);
> - } else {
> - vcpu->arch.ia32_misc_enable_msr = data;
> + update_cpuid = true;
> }
> +
> + if (kvm_x86_ops->pebs_supported())
> + data &= ~MSR_IA32_MISC_ENABLE_PEBS;
whitespace damage
> + else
> + data |= MSR_IA32_MISC_ENABLE_PEBS;
> +
> + vcpu->arch.ia32_misc_enable_msr = data;
> + if (update_cpuid)
> + kvm_update_cpuid(vcpu);
> break;
> case MSR_IA32_SMBASE:
> if (!msr_info->host_initiated)
> @@ -2971,7 +2978,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
> break;
> case MSR_IA32_MISC_ENABLE:
> - msr_info->data = vcpu->arch.ia32_misc_enable_msr;
> + if (kvm_x86_ops->pebs_supported())
> + msr_info->data = (vcpu->arch.ia32_misc_enable_msr &
> + ~MSR_IA32_MISC_ENABLE_PEBS);
> + else
> + msr_info->data = (vcpu->arch.ia32_misc_enable_msr |
> + MSR_IA32_MISC_ENABLE_PEBS);
Coding style violation.
> break;
> case MSR_IA32_SMBASE:
> if (!msr_info->host_initiated)
> --
> 1.8.3.1
>
On Sun, Oct 27, 2019 at 07:11:17PM -0400, Luwei Kang wrote:
> For PEBS output to Intel PT, an Intel PT event should be the group
> leader of a PEBS counter event on the host. For Intel PT
> virtualization in a KVM guest, however, the PT facilities are passed
> through to the guest and no PT event is allocated from the host perf
> event framework. This differs from PMU virtualization.
>
> A new Intel hardware feature enables PEBS in a KVM guest by writing
> PEBS records to the Intel PT buffer. KVM therefore needs to allocate
> an event counter for this PEBS event without an Intel PT event leader.
>
> This patch adds an event owner check so that, for PEBS-output-to-PT
> events, only a non-kernel event needs a (PT) group leader.
>
> Signed-off-by: Luwei Kang <[email protected]>
> ---
> arch/x86/events/core.c | 3 ++-
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 2 +-
> 3 files changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 7b21455..214041a 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1014,7 +1014,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
> * away, the group was broken down and this singleton event
> * can't schedule any more.
> */
> - if (is_pebs_pt(leader) && !leader->aux_event)
> + if (is_pebs_pt(leader) && !leader->aux_event &&
> + !is_kernel_event(leader))
indent fail, but also, I'm not sure I buy this.
Surely pt-on-kvm has a perf event to claim PT for the vCPU context?
Even if not, this is not strictly correct. Not even now is KVM the sole
user of perf_event_create_kernel_counter(), so saying any kernel event
is exempt from this scheduling constraint is just wrong.
> return -EINVAL;
>
> /*
On Sun, Oct 27, 2019 at 07:11:12PM -0400, Luwei Kang wrote:
> @@ -99,7 +99,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
> static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> unsigned config, bool exclude_user,
> bool exclude_kernel, bool intr,
> - bool in_tx, bool in_tx_cp)
> + bool in_tx, bool in_tx_cp, bool pebs)
> {
> struct perf_event *event;
> struct perf_event_attr attr = {
> @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> .exclude_user = exclude_user,
> .exclude_kernel = exclude_kernel,
> .config = config,
> + .precise_ip = pebs ? 1 : 0,
> + .aux_output = pebs ? 1 : 0,
srsly?
On Sun, Oct 27, 2019 at 07:11:11PM -0400, Luwei Kang wrote:
> New Intel hardware introduces a mechanism to direct PEBS records into
> the Intel PT buffer, which can be used to enable PEBS in a KVM guest.
> This patch implements read and write emulation for these registers
> when PEBS is supported in the KVM guest.
>
> KVM needs to reprogram the counters whenever the value of one of these
> MSRs changes, to make sure the new value takes effect in hardware.
>
> Signed-off-by: Luwei Kang <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 4 +++
> arch/x86/include/asm/msr-index.h | 6 ++++
> arch/x86/kvm/vmx/capabilities.h | 15 ++++++++++
> arch/x86/kvm/vmx/pmu_intel.c | 63 ++++++++++++++++++++++++++++++++++++++--
> 4 files changed, 86 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 20ce682..d22f8d9 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -131,9 +131,13 @@
> #define LBR_INFO_ABORT BIT_ULL(61)
> #define LBR_INFO_CYCLES 0xffff
>
> +#define MSR_IA32_PEBS_PMI_AFTER_REC BIT_ULL(60)
> +#define MSR_IA32_PEBS_OUTPUT_PT BIT_ULL(61)
> +#define MSR_IA32_PEBS_OUTPUT_MASK (3ULL << 61)
> #define MSR_IA32_PEBS_ENABLE 0x000003f1
> #define MSR_PEBS_DATA_CFG 0x000003f2
> #define MSR_IA32_DS_AREA 0x00000600
> +#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT BIT_ULL(16)
> #define MSR_IA32_PERF_CAPABILITIES 0x00000345
> #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
>
> @@ -665,6 +669,8 @@
> #define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
> #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
> #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
> +#define MSR_IA32_MISC_ENABLE_PEBS_BIT 12
> +#define MSR_IA32_MISC_ENABLE_PEBS (1ULL << MSR_IA32_MISC_ENABLE_PEBS_BIT)
> #define MSR_IA32_MISC_ENABLE_TM2_BIT 13
> #define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
> #define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
Some of these already exist but are local to perf. Don't blindly
introduce more without unifying.
> > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > unsigned config, bool exclude_user,
> > bool exclude_kernel, bool intr,
> > - bool in_tx, bool in_tx_cp)
> > + bool in_tx, bool in_tx_cp, bool pebs)
> > {
> > struct perf_event *event;
> > struct perf_event_attr attr = {
> > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > .exclude_user = exclude_user,
> > .exclude_kernel = exclude_kernel,
> > .config = config,
> > + .precise_ip = pebs ? 1 : 0,
> > + .aux_output = pebs ? 1 : 0,
>
> srsly?
Hi Peter,
Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
For precise_ip, this is the precise level in perf, set on the perf command line in the KVM guest, so the value here may not reflect the exact level (which can be 0~3). Setting it to 1 here just serves to allocate a counter for the PEBS event and program the MSR_IA32_PEBS_ENABLE register. For PMU virtualization, KVM traps the guest's writes to the PMU registers and allocates/frees an event counter on the host when a counter is enabled/disabled in the guest. We can't always deduce the exact perf command-line parameters from the values the guest writes to the register.
Thanks,
Luwei Kang
> > This patch introduces a parameter to avoid clearing the PEBS event
> > counter while running in the guest. When PEBS is enabled in the
> > guest via PEBS output to Intel PT, the performance counter used for
> > the PEBS event can stay enabled across VM-entry.
>
> Please write coherent Changelogs. I have no clue what you're trying to say.
>
> Also, maybe this atrocious coding style is accepted in KVM, but I'm not having it. Pretty much all your linebreaks and subsequent
> indenting are against style.
Sorry. I mean that, at present, the performance counter for a PEBS event must be disabled before VM-entry. Once PEBS is enabled in the guest via PEBS-via-PT, we no longer need to disable the PEBS counters.
It will be corrected in the next version.
Thanks,
Luwei Kang
> > New Intel hardware introduces a mechanism to direct PEBS records
> > into the Intel PT buffer, which can be used to enable PEBS in a KVM
> > guest. This patch implements read and write emulation for these
> > registers when PEBS is supported in the KVM guest.
> >
> > KVM needs to reprogram the counters whenever the value of one of
> > these MSRs changes, to make sure the new value takes effect in
> > hardware.
> >
> > Signed-off-by: Luwei Kang <[email protected]>
> > ---
> > arch/x86/include/asm/kvm_host.h | 4 +++
> > arch/x86/include/asm/msr-index.h | 6 ++++
> > arch/x86/kvm/vmx/capabilities.h | 15 ++++++++++
> > arch/x86/kvm/vmx/pmu_intel.c | 63 ++++++++++++++++++++++++++++++++++++++--
> > 4 files changed, 86 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/msr-index.h
> > b/arch/x86/include/asm/msr-index.h
> > index 20ce682..d22f8d9 100644
> > --- a/arch/x86/include/asm/msr-index.h
> > +++ b/arch/x86/include/asm/msr-index.h
> > @@ -131,9 +131,13 @@
> > #define LBR_INFO_ABORT BIT_ULL(61)
> > #define LBR_INFO_CYCLES 0xffff
> >
> > +#define MSR_IA32_PEBS_PMI_AFTER_REC BIT_ULL(60)
> > +#define MSR_IA32_PEBS_OUTPUT_PT BIT_ULL(61)
> > +#define MSR_IA32_PEBS_OUTPUT_MASK (3ULL << 61)
> > #define MSR_IA32_PEBS_ENABLE 0x000003f1
> > #define MSR_PEBS_DATA_CFG 0x000003f2
> > #define MSR_IA32_DS_AREA 0x00000600
> > +#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT BIT_ULL(16)
> > #define MSR_IA32_PERF_CAPABILITIES 0x00000345
> > #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
> >
> > @@ -665,6 +669,8 @@
> > #define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
> > #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
> > #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
> > +#define MSR_IA32_MISC_ENABLE_PEBS_BIT 12
> > +#define MSR_IA32_MISC_ENABLE_PEBS (1ULL << MSR_IA32_MISC_ENABLE_PEBS_BIT)
> > #define MSR_IA32_MISC_ENABLE_TM2_BIT 13
> > #define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
> > #define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
>
> Some of these already exist but are local to perf. Don't blindly introduce more without unifying.
Got it. Will reuse the existing definitions in perf.
Thanks,
Luwei Kang
> > Expose PEBS feature to guest by IA32_MISC_ENABLE[bit12].
> > IA32_MISC_ENABLE[bit12] is Processor Event Based Sampling (PEBS)
> > Unavailable (RO) flag:
> > 1 = PEBS is not supported; 0 = PEBS is supported.
>
> Why does it make sense to expose this on SVM?
Thanks for the review. This patch does not expose the PEBS feature on SVM; the SVM callback simply returns false.
>
> > Signed-off-by: Luwei Kang <[email protected]>
> > ---
> > arch/x86/include/asm/kvm_host.h | 1 +
> > arch/x86/kvm/svm.c | 6 ++++++
> > arch/x86/kvm/vmx/vmx.c | 1 +
> > arch/x86/kvm/x86.c | 22 +++++++++++++++++-----
> > 4 files changed, 25 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h
> > b/arch/x86/include/asm/kvm_host.h index 24a0ab9..76f5fa5 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1127,6 +1127,7 @@ struct kvm_x86_ops {
> > bool (*xsaves_supported)(void);
> > bool (*umip_emulated)(void);
> > bool (*pt_supported)(void);
> > + bool (*pebs_supported)(void);
> > bool (*pdcm_supported)(void);
> >
> > int (*check_nested_events)(struct kvm_vcpu *vcpu, bool
> > external_intr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 7e0a7b3..3a1bbb3 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -5975,6 +5975,11 @@ static bool svm_pt_supported(void)
> > return false;
> > }
> >
> > +static bool svm_pebs_supported(void)
> > +{
> > + return false;
> > +}
> > +
> > static bool svm_pdcm_supported(void)
> > {
> > return false;
> > @@ -7277,6 +7282,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
> > .xsaves_supported = svm_xsaves_supported,
> > .umip_emulated = svm_umip_emulated,
> > .pt_supported = svm_pt_supported,
> > + .pebs_supported = svm_pebs_supported,
> > .pdcm_supported = svm_pdcm_supported,
> >
> > .set_supported_cpuid = svm_set_supported_cpuid, diff --git
> > a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > 5c4dd05..3c370a3 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -7879,6 +7879,7 @@ static __exit void hardware_unsetup(void)
> > .xsaves_supported = vmx_xsaves_supported,
> > .umip_emulated = vmx_umip_emulated,
> > .pt_supported = vmx_pt_supported,
> > + .pebs_supported = vmx_pebs_supported,
> > .pdcm_supported = vmx_pdcm_supported,
> >
> > .request_immediate_exit = vmx_request_immediate_exit, diff --git
> > a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 661e2bf..5f59073
> > 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2591,6 +2591,7 @@ static void record_steal_time(struct kvm_vcpu
> > *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data
> > *msr_info) {
> > bool pr = false;
> > + bool update_cpuid = false;
> > u32 msr = msr_info->index;
> > u64 data = msr_info->data;
> >
> > @@ -2671,11 +2672,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
> > if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
> > return 1;
> > - vcpu->arch.ia32_misc_enable_msr = data;
> > - kvm_update_cpuid(vcpu);
> > - } else {
> > - vcpu->arch.ia32_misc_enable_msr = data;
> > + update_cpuid = true;
> > }
> > +
> > + if (kvm_x86_ops->pebs_supported())
> > + data &= ~MSR_IA32_MISC_ENABLE_PEBS;
>
> whitespace damage
Yes. Will fix it, along with the coding style violation below.
Thanks,
Luwei Kang
>
> > + else
> > + data |= MSR_IA32_MISC_ENABLE_PEBS;
> > +
> > + vcpu->arch.ia32_misc_enable_msr = data;
> > + if (update_cpuid)
> > + kvm_update_cpuid(vcpu);
> > break;
> > case MSR_IA32_SMBASE:
> > if (!msr_info->host_initiated)
> > @@ -2971,7 +2978,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> > msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
> > break;
> > case MSR_IA32_MISC_ENABLE:
> > - msr_info->data = vcpu->arch.ia32_misc_enable_msr;
> > + if (kvm_x86_ops->pebs_supported())
> > + msr_info->data = (vcpu->arch.ia32_misc_enable_msr &
> > + ~MSR_IA32_MISC_ENABLE_PEBS);
> > + else
> > + msr_info->data = (vcpu->arch.ia32_misc_enable_msr |
> > + MSR_IA32_MISC_ENABLE_PEBS);
>
> Coding style violation.
>
> > break;
> > case MSR_IA32_SMBASE:
> > if (!msr_info->host_initiated)
> > --
> > 1.8.3.1
> >
> > For PEBS output to Intel PT, an Intel PT event should be the group
> > leader of a PEBS counter event on the host. For Intel PT
> > virtualization in a KVM guest, however, the PT facilities are passed
> > through to the guest and no PT event is allocated from the host perf
> > event framework. This differs from PMU virtualization.
> >
> > A new Intel hardware feature enables PEBS in a KVM guest by writing
> > PEBS records to the Intel PT buffer. KVM therefore needs to allocate
> > an event counter for this PEBS event without an Intel PT event
> > leader.
> >
> > This patch adds an event owner check so that, for PEBS-output-to-PT
> > events, only a non-kernel event needs a (PT) group leader.
> >
> > Signed-off-by: Luwei Kang <[email protected]>
> > ---
> > arch/x86/events/core.c | 3 ++-
> > include/linux/perf_event.h | 1 +
> > kernel/events/core.c | 2 +-
> > 3 files changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index
> > 7b21455..214041a 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -1014,7 +1014,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
> > * away, the group was broken down and this singleton event
> > * can't schedule any more.
> > */
> > - if (is_pebs_pt(leader) && !leader->aux_event)
> > + if (is_pebs_pt(leader) && !leader->aux_event &&
> > + !is_kernel_event(leader))
>
> indent fail, but also, I'm not sure I buy this.
>
> Surely pt-on-kvm has a perf event to claim PT for the vCPU context?
Hi Peter,
PT on KVM does not allocate perf events from the host (this is different from the performance counters). The guest PT MSR values are loaded into hardware directly before VM-entry.
Natively, a PEBS event needs a PT event as its group leader. In virtualization, we can allocate a counter for PEBS but can't assign a PT event as the leader of that PEBS event.
>
> Even if not, this is not strictly correct. Not even now is KVM the sole user of perf_event_create_kernel_counter(), so saying any
> kernel event is exempt from this scheduling constraint is just wrong.
KVM is not the sole user of perf_event_create_kernel_counter(), but I think KVM is the only user that needs group-leader events. I found that the kernel event check (63b6da39bb) is from you and hope to get some suggestions on this.
Thanks,
Luwei Kang
>
> > return -EINVAL;
> >
> > /*
"Kang, Luwei" <[email protected]> writes:
>> > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>> > unsigned config, bool exclude_user,
>> > bool exclude_kernel, bool intr,
>> > - bool in_tx, bool in_tx_cp)
>> > + bool in_tx, bool in_tx_cp, bool pebs)
>> > {
>> > struct perf_event *event;
>> > struct perf_event_attr attr = {
>> > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>> > .exclude_user = exclude_user,
>> > .exclude_kernel = exclude_kernel,
>> > .config = config,
>> > + .precise_ip = pebs ? 1 : 0,
>> > + .aux_output = pebs ? 1 : 0,
>>
>> srsly?
>
> Hi Peter,
> Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
attr.aux_output==1 means your group leader should be an intel_pt event
for this to succeed. Luckily for this instance,
perf_event_create_kernel_counter() doesn't actually check the
attr.aux_output.
Also, does 'bool pebs' mean PEBS-via-PT or just a PEBS counter? Or does
it mean that in kvm it's the same thing?
Regards,
--
Alex
> >> > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >> > unsigned config, bool exclude_user,
> >> > bool exclude_kernel, bool intr,
> >> > - bool in_tx, bool in_tx_cp)
> >> > + bool in_tx, bool in_tx_cp, bool pebs)
> >> > {
> >> > struct perf_event *event;
> >> > struct perf_event_attr attr = {
> >> > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >> > .exclude_user = exclude_user,
> >> > .exclude_kernel = exclude_kernel,
> >> > .config = config,
> >> > + .precise_ip = pebs ? 1 : 0,
> >> > + .aux_output = pebs ? 1 : 0,
> >>
> >> srsly?
> >
> > Hi Peter,
> > Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
>
> attr.aux_output==1 means your group leader should be an intel_pt event for this to succeed. Luckily for this instance,
> perf_event_create_kernel_counter() doesn't actually check the attr.aux_output.
>
> Also, does 'bool pebs' mean PEBS-via-PT or just a PEBS counter? Or does it mean that in kvm it's the same thing?
It is the same thing: allocate a counter for a PEBS event and use PEBS-via-PT.
Thanks,
Luwei Kang
>
> Regards,
> --
> Alex
On Wed, Oct 30, 2019 at 04:06:36AM +0000, Kang, Luwei wrote:
> > > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > unsigned config, bool exclude_user,
> > > bool exclude_kernel, bool intr,
> > > - bool in_tx, bool in_tx_cp)
> > > + bool in_tx, bool in_tx_cp, bool pebs)
> > > {
> > > struct perf_event *event;
> > > struct perf_event_attr attr = {
> > > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > .exclude_user = exclude_user,
> > > .exclude_kernel = exclude_kernel,
> > > .config = config,
> > > + .precise_ip = pebs ? 1 : 0,
> > > + .aux_output = pebs ? 1 : 0,
> >
> > srsly?
>
> Hi Peter,
> Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
> For precise_ip, this is the precise level in perf, set on the perf command line in the KVM guest, so the value here may not reflect the exact level (which can be 0~3). Setting it to 1 here just serves to allocate a counter for the PEBS event and program the MSR_IA32_PEBS_ENABLE register. For PMU virtualization, KVM traps the guest's writes to the PMU registers and allocates/frees an event counter on the host when a counter is enabled/disabled in the guest. We can't always deduce the exact perf command-line parameters from the values the guest writes to the register.
Please, teach your MUA to wrap on 78 chars.
The thing I really fell over is the gratuitous 'bool ? 1 : 0'. But yes,
the aux_output without a group leader is dead wrong. We'll go fix
perf_event_create_kernel_counter() to refuse that.
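A minimal sketch of such a refusal, assuming it takes the form of an
early check in perf_event_create_kernel_counter(); the shape of the
eventual fix may well differ:

struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t overflow_handler,
				 void *context)
{
	/*
	 * Kernel counters are created without a group leader, so an
	 * aux_output event can never pair with an aux event; refuse it
	 * up front rather than letting it silently never schedule.
	 */
	if (attr->aux_output)
		return ERR_PTR(-EINVAL);

	/* ... existing body unchanged ... */
}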
On Wed, Oct 30, 2019 at 08:42:10AM +0200, Alexander Shishkin wrote:
> "Kang, Luwei" <[email protected]> writes:
>
> >> > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >> > unsigned config, bool exclude_user,
> >> > bool exclude_kernel, bool intr,
> >> > - bool in_tx, bool in_tx_cp)
> >> > + bool in_tx, bool in_tx_cp, bool pebs)
> >> > {
> >> > struct perf_event *event;
> >> > struct perf_event_attr attr = {
> >> > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> >> > .exclude_user = exclude_user,
> >> > .exclude_kernel = exclude_kernel,
> >> > .config = config,
> >> > + .precise_ip = pebs ? 1 : 0,
> >> > + .aux_output = pebs ? 1 : 0,
> >>
> >> srsly?
> >
> > Hi Peter,
> > Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
>
> attr.aux_output==1 means your group leader should be an intel_pt event
> for this to succeed. Luckily for this instance,
> perf_event_create_kernel_counter() doesn't actually check the
> attr.aux_output.
>
> Also, does 'bool pebs' mean PEBS-via-PT or just a PEBS counter? Or does
> it mean that in kvm it's the same thing?
It means pebs-over-pt; they only allow PEBS if the output-to-PT bit is
set.
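For context, the output-to-PT bit is the PEBS_OUTPUT field of
IA32_PEBS_ENABLE; paraphrasing the host-side definitions in
arch/x86/events/perf_event.h (verify the exact values against the tree):

/* IA32_PEBS_ENABLE, PEBS_OUTPUT field (bits 62:61 per the SDM):
 * 00b = DS area (legacy), 01b = Intel PT trace stream. */
#define PEBS_OUTPUT_OFFSET	61
#define PEBS_OUTPUT_MASK	(3ULL << PEBS_OUTPUT_OFFSET)
#define PEBS_OUTPUT_PT		(1ULL << PEBS_OUTPUT_OFFSET)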
On Wed, Oct 30, 2019 at 06:49:42AM +0000, Kang, Luwei wrote:
> > >> > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > >> > unsigned config, bool exclude_user,
> > >> > bool exclude_kernel, bool intr,
> > >> > - bool in_tx, bool in_tx_cp)
> > >> > + bool in_tx, bool in_tx_cp, bool pebs)
> > >> > {
> > >> > struct perf_event *event;
> > >> > struct perf_event_attr attr = {
> > >> > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > >> > .exclude_user = exclude_user,
> > >> > .exclude_kernel = exclude_kernel,
> > >> > .config = config,
> > >> > + .precise_ip = pebs ? 1 : 0,
> > >> > + .aux_output = pebs ? 1 : 0,
> > >>
> > >> srsly?
> > >
> > > Hi Peter,
> > > Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
> >
> > attr.aux_output==1 means your group leader should be an intel_pt event for this to succeed. Luckily for this instance,
> > perf_event_create_kernel_counter() doesn't actually check the attr.aux_output.
> >
> > Also, does 'bool pebs' mean PEBS-via-PT or just a PEBS counter? Or does it mean that in kvm it's the same thing?
>
> It is the same thing: allocate a counter for the PEBS event and use PEBS-via-PT.
It is not the same thing, obviously. It strictly means pebs-over-pt
here.
On Wed, Oct 30, 2019 at 04:07:31AM +0000, Kang, Luwei wrote:
> > > For PEBS output to Intel PT, an Intel PT event should be the group
> > > leader of a PEBS counter event on the host. For Intel PT virtualization
> > > in a KVM guest, the PT facilities are passed through to the guest and
> > > no PT event is allocated from the host perf event framework. This is
> > > different from PMU virtualization.
> > >
> > > New Intel hardware can enable PEBS in a KVM guest by outputting
> > > PEBS records to the Intel PT buffer. KVM needs to allocate an event
> > > counter for this PEBS event without an Intel PT event leader.
> > >
> > > This patch adds an event owner check for PEBS-output-to-PT events so
> > > that only non-kernel events need a (PT) group leader.
> > >
> > > Signed-off-by: Luwei Kang <[email protected]>
> > > ---
> > > arch/x86/events/core.c | 3 ++-
> > > include/linux/perf_event.h | 1 +
> > > kernel/events/core.c | 2 +-
> > > 3 files changed, 4 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > > index 7b21455..214041a 100644
> > > --- a/arch/x86/events/core.c
> > > +++ b/arch/x86/events/core.c
> > > @@ -1014,7 +1014,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
> > > * away, the group was broken down and this singleton event
> > > * can't schedule any more.
> > > */
> > > - if (is_pebs_pt(leader) && !leader->aux_event)
> > > + if (is_pebs_pt(leader) && !leader->aux_event &&
> > > + !is_kernel_event(leader))
> >
> > indent fail, but also, I'm not sure I buy this.
> >
> > Surely pt-on-kvm has a perf event to claim PT for the vCPU context?
>
> Hi Peter,
> PT on KVM does not allocate perf events from the host (this is different from the performance counters). The guest PT MSR values are loaded into hardware directly before VM-entry.
> On native, a PEBS event needs a PT event as its group leader. In virtualization, we can allocate a counter for PEBS but can't assign a PT event as the leader of this PEBS event.
Please, fix your MUA already.
Then how does KVM deal with the host using PT? You can't just steal PT.
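For reference, the is_kernel_event() test the patch leans on keys off
the owner marker that perf_event_create_kernel_counter() assigns;
paraphrased from kernel/events/core.c of this era:

/* Kernel-created events have their owner set to TASK_TOMBSTONE, which
 * distinguishes them from counters opened from userspace. */
static bool is_kernel_event(struct perf_event *event)
{
	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}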
On Wed, Oct 30, 2019 at 04:07:03AM +0000, Kang, Luwei wrote:
> > > Expose the PEBS feature to the guest via IA32_MISC_ENABLE[bit 12].
> > > IA32_MISC_ENABLE[bit 12] is the Processor Event Based Sampling (PEBS)
> > > Unavailable (RO) flag:
> > > 1 = PEBS is not supported; 0 = PEBS is supported.
> >
> > Why does it make sense to expose this on SVM?
>
> Thanks for the review. This patch doesn't expose the PEBS feature on SVM; it returns not-supported.
AFAICT it exposes/emulates an Intel MSR on AMD, which is just weird.
Peter Zijlstra <[email protected]> writes:
> On Wed, Oct 30, 2019 at 04:06:36AM +0000, Kang, Luwei wrote:
>> > > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>> > > unsigned config, bool exclude_user,
>> > > bool exclude_kernel, bool intr,
>> > > - bool in_tx, bool in_tx_cp)
>> > > + bool in_tx, bool in_tx_cp, bool pebs)
>> > > {
>> > > struct perf_event *event;
>> > > struct perf_event_attr attr = {
>> > > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
>> > > .exclude_user = exclude_user,
>> > > .exclude_kernel = exclude_kernel,
>> > > .config = config,
>> > > + .precise_ip = pebs ? 1 : 0,
>> > > + .aux_output = pebs ? 1 : 0,
>> >
>> > srsly?
>>
>> Hi Peter,
>> Thanks for the review. For aux_output, I think it should be set to 1 when the guest wants to enable PEBS via Intel PT.
>> For precise_ip, it is the precise level in perf, set on the perf command line in the KVM guest; the value here may not be accurate (it can be 0~3). Setting it to 1 here is only used to allocate a counter for the PEBS event and set the MSR_IA32_PEBS_ENABLE register. For PMU virtualization, KVM traps the guest's writes to PMU registers and allocates/frees an event counter on the host when a counter is enabled/disabled in the guest. We can't always deduce the exact perf command-line parameters from the value the guest writes to the register.
>
> Please, teach your MUA to wrap on 78 chars.
>
> The thing I really fell over is the gratuitous 'bool ? 1 : 0'. But yes,
Notice the .exclude_kernel assignment above that does the same thing the
other way around.
Regards,
--
Alex
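In other words, the assignments Peter objected to could simply mirror
the existing style; a minimal sketch of that rewrite, with nothing else
in the initializer changed:

		.exclude_kernel	= exclude_kernel,
		.config		= config,
		.precise_ip	= pebs,	/* bool converts to 0/1, as with exclude_kernel */
		.aux_output	= pebs,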
> > > > Expose the PEBS feature to the guest via IA32_MISC_ENABLE[bit 12].
> > > > IA32_MISC_ENABLE[bit 12] is the Processor Event Based Sampling (PEBS)
> > > > Unavailable (RO) flag:
> > > > 1 = PEBS is not supported; 0 = PEBS is supported.
> > >
> > > Why does it make sense to expose this on SVM?
> >
> > Thanks for the review. This patch doesn't expose the PEBS feature on SVM; it returns not-supported.
>
> AFAICT it exposes/emulates an Intel MSR on AMD, which is just weird.
I checked the "AMD64 Architecture Programmer's Manual" and didn't find this register (IA32_MISC_ENABLE).
Will move the handling of this register to vmx.c in the next version.
Thanks,
Luwei Kang
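A minimal sketch of what the vmx.c handling might look like, assuming a
read path like vmx_get_msr() and a hypothetical guest_pebs_supported()
helper (only MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL is an existing
msr-index.h name; the rest is illustration):

	case MSR_IA32_MISC_ENABLE:
		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
		/* Bit 12 is PEBS Unavailable (RO); clear it only when
		 * the guest may use PEBS.  guest_pebs_supported() is a
		 * hypothetical helper for this sketch. */
		if (guest_pebs_supported(vcpu))
			msr_info->data &= ~MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
		else
			msr_info->data |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
		break;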
> > > > For PEBS output to Intel PT, an Intel PT event should be the group
> > > > leader of a PEBS counter event on the host. For Intel PT
> > > > virtualization in a KVM guest, the PT facilities are passed through
> > > > to the guest and no PT event is allocated from the host perf event
> > > > framework. This is different from PMU virtualization.
> > > >
> > > > New Intel hardware can enable PEBS in a KVM guest by outputting
> > > > PEBS records to the Intel PT buffer. KVM needs to allocate an event
> > > > counter for this PEBS event without an Intel PT event leader.
> > > >
> > > > This patch adds an event owner check for PEBS-output-to-PT events
> > > > so that only non-kernel events need a (PT) group leader.
> > > >
> > > > Signed-off-by: Luwei Kang <[email protected]>
> > > > ---
> > > > arch/x86/events/core.c | 3 ++-
> > > > include/linux/perf_event.h | 1 +
> > > > kernel/events/core.c | 2 +-
> > > > 3 files changed, 4 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > > > index 7b21455..214041a 100644
> > > > --- a/arch/x86/events/core.c
> > > > +++ b/arch/x86/events/core.c
> > > > @@ -1014,7 +1014,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
> > > > * away, the group was broken down and this singleton event
> > > > * can't schedule any more.
> > > > */
> > > > - if (is_pebs_pt(leader) && !leader->aux_event)
> > > > + if (is_pebs_pt(leader) && !leader->aux_event &&
> > > > + !is_kernel_event(leader))
> > >
> > > indent fail, but also, I'm not sure I buy this.
> > >
> > > Surely pt-on-kvm has a perf event to claim PT for the vCPU context?
> >
> > Hi Peter,
> > PT on KVM does not allocate perf events from the host (this is different from the performance counters). The guest PT MSR values are loaded into hardware directly before VM-entry.
> > On native, a PEBS event needs a PT event as its group leader. In virtualization, we can allocate a counter for PEBS but can't assign a PT event as the leader of this PEBS event.
>
> Please, fix your MUA already.
>
> Then how does KVM deal with the host using PT? You can't just steal PT.
Intel PT in virtualization can work in system mode or host_guest mode.
In system mode (the default), the trace produced by both host and guest is saved in the host PT buffer. Intel PT is not exposed to the guest in this mode.
In host_guest mode, Intel PT is exposed to the guest and the guest can use PT as on native hardware. The host PT register values are saved and the guest PT register values are restored at VM-entry. Host and guest traces go to their respective PT buffers; the host PT buffer does not include the guest trace in this mode.
Thanks,
Luwei Kang
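The two modes correspond to the pt_mode module parameter of kvm_intel;
paraphrasing the definitions in arch/x86/kvm/vmx/capabilities.h and
vmx.c:

/* arch/x86/kvm/vmx/capabilities.h */
#define PT_MODE_SYSTEM		0
#define PT_MODE_HOST_GUEST	1

/* arch/x86/kvm/vmx/vmx.c: system mode is the default */
static int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

so host_guest mode is opted into with e.g. "modprobe kvm_intel pt_mode=1".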
"Kang, Luwei" <[email protected]> writes:
>> Then how does KVM deal with the host using PT? You can't just steal PT.
>
> Intel PT in virtualization can work in system mode or host_guest mode.
> In system mode (the default), the trace produced by both host and guest is saved in the host PT buffer. Intel PT is not exposed to the guest in this mode.
> In host_guest mode, Intel PT is exposed to the guest and the guest can use PT as on native hardware. The host PT register values are saved and the guest PT register values are restored at VM-entry. Host and guest traces go to their respective PT buffers; the host PT buffer does not include the guest trace in this mode.
IOW, it will steal PT from the host.
Regards,
--
Alex
> >> Then how does KVM deal with the host using PT? You can't just steal PT.
> >
> > Intel PT in virtualization can work in system mode or host_guest mode.
> > In system mode (the default), the trace produced by both host and guest is saved in the host PT buffer. Intel PT is not exposed to the guest in this mode.
> > In host_guest mode, Intel PT is exposed to the guest and the guest can use PT as on native hardware. The host PT register values are saved and the guest PT register values are restored at VM-entry. Host and guest traces go to their respective PT buffers; the host PT buffer does not include the guest trace in this mode.
>
> IOW, it will steal PT from the host.
Hi Alexander,
The host buffer does not include guest packets because the guest trace is saved in the guest PT buffer in this mode. You can think of it as stealing.
>
> Regards,
> --
> Alex
> > > > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > > unsigned config, bool exclude_user,
> > > > bool exclude_kernel, bool intr,
> > > > - bool in_tx, bool in_tx_cp)
> > > > + bool in_tx, bool in_tx_cp, bool pebs)
> > > > {
> > > > struct perf_event *event;
> > > > struct perf_event_attr attr = {
> > > > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > > .exclude_user = exclude_user,
> > > > .exclude_kernel = exclude_kernel,
> > > > .config = config,
> > > > + .precise_ip = pebs ? 1 : 0,
> > > > + .aux_output = pebs ? 1 : 0,
> > >
> > > srsly?
> >
> > Hi Peter,
> > Thanks for the review. For aux_output, I think it should be set to 1 when
> > the guest wants to enable PEBS via Intel PT.
> > For precise_ip, it is the precise level in perf, set on the perf command
> > line in the KVM guest; the value here may not be accurate (it can be 0~3).
> > Setting it to 1 here is only used to allocate a counter for the PEBS event
> > and set the MSR_IA32_PEBS_ENABLE register. For PMU virtualization, KVM
> > traps the guest's writes to PMU registers and allocates/frees an event
> > counter on the host when a counter is enabled/disabled in the guest. We
> > can't always deduce the exact perf command-line parameters from the value
> > the guest writes to the register.
>
> Please, teach your MUA to wrap on 78 chars.
>
> The thing I really fell over is the gratuitous 'bool ? 1 : 0'. But yes, the aux_output without
> a group leader is dead wrong. We'll go fix
> perf_event_create_kernel_counter() to refuse that.
Yes, I also think it is a little gratuitous here. But it is a little hard to reconstruct the guest perf parameters from the register value, especially the "precise_ip". Do you have any advice?
About refusing the perf_event_create_kernel_counter() request w/o aux_output, I think I need to allocate the PT event as the group leader here, is that right?
Thanks,
Luwei Kang
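One constraint relevant to that question: perf_event_create_kernel_counter()
takes no group-leader argument at all; its declaration (paraphrased from
include/linux/perf_event.h of this era) is:

extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
				 int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t callback,
				 void *context);

so pairing a kernel-created PEBS event with a PT group leader would need
either a new API or an extension of this one.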
> > > > > static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > > > unsigned config, bool exclude_user,
> > > > > bool exclude_kernel, bool intr,
> > > > > - bool in_tx, bool in_tx_cp)
> > > > > + bool in_tx, bool in_tx_cp, bool pebs)
> > > > > {
> > > > > struct perf_event *event;
> > > > > struct perf_event_attr attr = {
> > > > > @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> > > > > .exclude_user = exclude_user,
> > > > > .exclude_kernel = exclude_kernel,
> > > > > .config = config,
> > > > > + .precise_ip = pebs ? 1 : 0,
> > > > > + .aux_output = pebs ? 1 : 0,
> > > >
> > > > srsly?
> > >
> > > Hi Peter,
> > > Thanks for the review. For aux_output, I think it should be set to 1
> > > when the guest wants to enable PEBS via Intel PT.
> > > For precise_ip, it is the precise level in perf, set on the perf
> > > command line in the KVM guest; the value here may not be accurate (it
> > > can be 0~3). Setting it to 1 here is only used to allocate a counter
> > > for the PEBS event and set the MSR_IA32_PEBS_ENABLE register. For PMU
> > > virtualization, KVM traps the guest's writes to PMU registers and
> > > allocates/frees an event counter on the host when a counter is
> > > enabled/disabled in the guest. We can't always deduce the exact perf
> > > command-line parameters from the value the guest writes to the
> > > register.
> >
> > Please, teach your MUA to wrap on 78 chars.
> >
> > The thing I really fell over is the gratuitous 'bool ? 1 : 0'. But
> > yes, the aux_output without a group leader is dead wrong. We'll go fix
> > perf_event_create_kernel_counter() to refuse that.
>
> Yes, I also think it is a little gratuitous here. But it is a little hard to reconstruct the guest
> perf parameters from the register value, especially the "precise_ip". Do you have any
> advice?
>
> About refusing the perf_event_create_kernel_counter() request w/o aux_output, I think I
> need to allocate the PT event as the group leader here, is that right?
Hi Peter,
What's your opinion?
Thanks,
Luwei Kang