The current amd_event_mapping[] in KVM (duplicated from perf's
"amd_perfmon_event_map") is only valid for "K7 and later, up to and
including Family 16h"; AMD "Family 17h and later" needs
amd_f17h_perfmon_event_map[].
It's proposed to fix it in a more generic approach:
- decouple the available_event_types from the CPUID 0x0A.EBX bit vector;
- always get the right perfmon_event_map[] from the host perf interface;
- dynamically populate {intel|amd}_event_mapping[] during hardware setup;
v1 -> v2 Changelog:
- Drop some merged patches and one misunderstood patch;
- Rename bitmap name from "avail_cpuid_events" to "avail_perf_hw_ids";
- Fix a kernel test robot compiler warning;
Previous:
https://lore.kernel.org/kvm/[email protected]/
Like Xu (3):
KVM: x86/pmu: Replace pmu->available_event_types with a new BITMAP
perf: x86/core: Add interface to query perfmon_event_map[] directly
KVM: x86/pmu: Setup the {intel|amd}_event_mapping[] when hardware_setup
arch/x86/events/core.c | 9 ++++
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/include/asm/perf_event.h | 2 +
arch/x86/kvm/pmu.c | 25 ++++++++++-
arch/x86/kvm/pmu.h | 2 +
arch/x86/kvm/svm/pmu.c | 23 ++--------
arch/x86/kvm/vmx/pmu_intel.c | 72 ++++++++++++++++++++-----------
arch/x86/kvm/x86.c | 1 +
8 files changed, 89 insertions(+), 47 deletions(-)
--
2.33.1
From: Like Xu <[email protected]>
Currently, KVM refuses to create a perf_event for a counter if its
requested hw event is reported as unavailable according to the
Intel CPUID 0x0A.EBX bit vector. Replace the basis for this
validation with the kernel's generic and common enum perf_hw_id.
This helps to remove the use of the static {intel,amd}_arch_events[]
tables later on, as they are not constant across platforms.
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/vmx/pmu_intel.c | 40 +++++++++++++++++++++++++++------
2 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5d97f4adc1cb..03fabf22e167 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -501,7 +501,6 @@ struct kvm_pmc {
struct kvm_pmu {
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
- unsigned available_event_types;
u64 fixed_ctr_ctrl;
u64 global_ctrl;
u64 global_status;
@@ -516,6 +515,7 @@ struct kvm_pmu {
DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(avail_perf_hw_ids, PERF_COUNT_HW_MAX);
/*
* The gate to release perf_events not marked in
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index ffccfd9823c0..1ba8f0f0098b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -74,6 +74,7 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
+ unsigned int event_type = PERF_COUNT_HW_MAX;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
if (intel_arch_events[i].eventsel != event_select ||
@@ -81,16 +82,14 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
continue;
/* disable event that reported as not present by cpuid */
- if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+ event_type = intel_arch_events[i].event_type;
+ if (!test_bit(event_type, pmu->avail_perf_hw_ids))
return PERF_COUNT_HW_MAX + 1;
break;
}
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
-
- return intel_arch_events[i].event_type;
+ return event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -469,6 +468,25 @@ static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
}
}
+/* Mapping between CPUID 0x0A.EBX bit vector and enum perf_hw_id. */
+static inline int map_unavail_bit_to_perf_hw_id(int bit)
+{
+ switch (bit) {
+ case 0:
+ case 1:
+ return bit;
+ case 2:
+ return PERF_COUNT_HW_BUS_CYCLES;
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ return --bit;
+ }
+
+ return PERF_COUNT_HW_MAX;
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -478,6 +496,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
+ unsigned long available_cpuid_events;
+ int bit;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -503,8 +523,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
- pmu->available_event_types = ~entry->ebx &
- ((1ull << eax.split.mask_length) - 1);
+ /*
+ * The number of valid EBX bits should be less than the number of valid perf_hw_ids.
+ * Otherwise, we need to additionally determine if the event is rejected by KVM.
+ */
+ available_cpuid_events = ~entry->ebx & ((1ull << eax.split.mask_length) - 1);
+ bitmap_fill(pmu->avail_perf_hw_ids, PERF_COUNT_HW_MAX);
+ for_each_clear_bit(bit, (unsigned long *)&available_cpuid_events, eax.split.mask_length)
+ __clear_bit(map_unavail_bit_to_perf_hw_id(bit), pmu->avail_perf_hw_ids);
if (pmu->version == 1) {
pmu->nr_arch_fixed_counters = 0;
--
2.33.1
From: Like Xu <[email protected]>
Currently, we have [intel|knc|p4|p6]_perfmon_event_map on the Intel
platforms and amd_[f17h]_perfmon_event_map on the AMD platforms.
Early clumsy KVM code or other potential perf_event users may have
hard-coded these perfmon_maps (e.g., arch/x86/kvm/svm/pmu.c), so
it would not make sense to program a common hardware event based
on the generic "enum perf_hw_id" once the two tables do not match.
Let's provide an interface for callers outside the perf subsystem to get
the counter config based on the perfmon_event_map currently in use,
and it also helps to save bytes.
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/events/core.c | 9 +++++++++
arch/x86/include/asm/perf_event.h | 2 ++
2 files changed, 11 insertions(+)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..751048f4cc97 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -693,6 +693,15 @@ void x86_pmu_disable_all(void)
}
}
+u64 perf_get_hw_event_config(int perf_hw_id)
+{
+ if (perf_hw_id < x86_pmu.max_events)
+ return x86_pmu.event_map(perf_hw_id);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
+
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{
return static_call(x86_pmu_guest_get_msrs)(nr);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8fc1b5003713..d1e325517b74 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -492,9 +492,11 @@ static inline void perf_check_microcode(void) { }
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern u64 perf_get_hw_event_config(int perf_hw_id);
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+u64 perf_get_hw_event_config(int perf_hw_id);
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
return -1;
--
2.33.1
From: Like Xu <[email protected]>
The current amd_event_mapping[] is only valid for "K7 and later, up to and
including Family 16h", and amd_f17h_perfmon_event_map[] is needed for
"Family 17h and later". It's proposed to fix it in a more generic approach.
For both Intel and AMD platforms, the newly introduced interface
perf_get_hw_event_config() can be applied to fill the new global
kernel_hw_events[] array.
When kernel_hw_events[] is initialized, the original 8-element
array is replaced by a new 10-element array; the eventsel and
unit_mask of the two new members will be zero, and their event_type
value is invalid. In this case, KVM will not query kernel_hw_events[]
when the trapped event_select and unit_mask are both 0; it will fall
back to the PERF_TYPE_RAW type to program the perf_event.
Fixes: 3fe3331bb2857 ("perf/x86/amd: Add event map for AMD Family 17h")
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/kvm/pmu.c | 25 ++++++++++++++++++++++++-
arch/x86/kvm/pmu.h | 2 ++
arch/x86/kvm/svm/pmu.c | 23 ++++-------------------
arch/x86/kvm/vmx/pmu_intel.c | 34 ++++++++++++++--------------------
arch/x86/kvm/x86.c | 1 +
5 files changed, 45 insertions(+), 40 deletions(-)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e632693a2266..3b68fb4b3ece 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -22,6 +22,10 @@
/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
+/* The event_type value is invalid if event_select and unit_mask are both 0. */
+struct kvm_event_hw_type_mapping kernel_hw_events[PERF_COUNT_HW_MAX];
+EXPORT_SYMBOL_GPL(kernel_hw_events);
+
/* NOTE:
* - Each perf counter is defined as "struct kvm_pmc";
* - There are two types of perf counters: general purpose (gp) and fixed.
@@ -206,7 +210,9 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (!allow_event)
return;
- if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
+ /* Fall back to PERF_TYPE_RAW type if event_select and unit_mask are both 0. */
+ if ((pmc->eventsel & 0xFFFFULL) &&
+ !(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
@@ -545,6 +551,23 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+/* Initialize common hardware events[] by iterating over the enum perf_hw_id. */
+void kvm_pmu_hw_events_mapping_setup(void)
+{
+ u64 config;
+ int i;
+
+ for (i = 0; i < PERF_COUNT_HW_MAX; i++) {
+ config = perf_get_hw_event_config(i) & 0xFFFFULL;
+
+ kernel_hw_events[i] = (struct kvm_event_hw_type_mapping){
+ .eventsel = config & ARCH_PERFMON_EVENTSEL_EVENT,
+ .unit_mask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8,
+ .event_type = i,
+ };
+ }
+}
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7a7b8d5b775e..0d8a63c5b20a 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -160,7 +160,9 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
+void kvm_pmu_hw_events_mapping_setup(void);
extern struct kvm_pmu_ops intel_pmu_ops;
extern struct kvm_pmu_ops amd_pmu_ops;
+extern struct kvm_event_hw_type_mapping kernel_hw_events[];
#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 12d8b301065a..a39ef7e7302a 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -33,18 +33,6 @@ enum index {
INDEX_ERROR,
};
-/* duplicated from amd_perfmon_event_map, K7 and above should work. */
-static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
- [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
- [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
- [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
- [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
-};
-
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -148,15 +136,12 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
if (WARN_ON(pmc_is_fixed(pmc)))
return PERF_COUNT_HW_MAX;
- for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (amd_event_mapping[i].eventsel == event_select
- && amd_event_mapping[i].unit_mask == unit_mask)
+ for (i = 0; i < PERF_COUNT_HW_MAX; i++)
+ if (kernel_hw_events[i].eventsel == event_select &&
+ kernel_hw_events[i].unit_mask == unit_mask)
break;
- if (i == ARRAY_SIZE(amd_event_mapping))
- return PERF_COUNT_HW_MAX;
-
- return amd_event_mapping[i].event_type;
+ return (i == PERF_COUNT_HW_MAX) ? i : kernel_hw_events[i].event_type;
}
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 1ba8f0f0098b..0fc6507cb72d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -20,20 +20,14 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- /* The above index must match CPUID 0x0A.EBX bit vector */
- [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
-};
-
-/* mapping between fixed pmc index and intel_arch_events array */
-static int fixed_pmc_events[] = {1, 0, 7};
+/*
+ * Mapping between fixed pmc index and kernel_hw_events array
+ *
+ * Fixed pmc 0 is PERF_COUNT_HW_INSTRUCTIONS,
+ * Fixed pmc 1 is PERF_COUNT_HW_CPU_CYCLES,
+ * Fixed pmc 2 is PERF_COUNT_HW_REF_CPU_CYCLES.
+ */
+static int fixed_pmc_events[] = {1, 0, 9};
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
@@ -76,13 +70,13 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
int i;
unsigned int event_type = PERF_COUNT_HW_MAX;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
- if (intel_arch_events[i].eventsel != event_select ||
- intel_arch_events[i].unit_mask != unit_mask)
+ for (i = 0; i < PERF_COUNT_HW_MAX; i++) {
+ if (kernel_hw_events[i].eventsel != event_select ||
+ kernel_hw_events[i].unit_mask != unit_mask)
continue;
/* disable event that reported as not present by cpuid */
- event_type = intel_arch_events[i].event_type;
+ event_type = kernel_hw_events[i].event_type;
if (!test_bit(event_type, pmu->avail_perf_hw_ids))
return PERF_COUNT_HW_MAX + 1;
@@ -463,8 +457,8 @@ static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
pmc = &pmu->fixed_counters[i];
event = fixed_pmc_events[array_index_nospec(i, size)];
- pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
- intel_arch_events[event].eventsel;
+ pmc->eventsel = (kernel_hw_events[event].unit_mask << 8) |
+ kernel_hw_events[event].eventsel;
}
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c194a8cbd25f..8591fd8b42d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11349,6 +11349,7 @@ int kvm_arch_hardware_setup(void *opaque)
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
kvm_ops_static_call_update();
+ kvm_pmu_hw_events_mapping_setup();
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
--
2.33.1
cc AMD folks and ping for any comments.
On 17/1/2022 4:53 pm, Like Xu wrote:
> The current amd_event_mapping[] in KVM (duplicated from perf's
> "amd_perfmon_event_map") is only valid for "K7 and later, up to and
> including Family 16h"; AMD "Family 17h and later" needs
> amd_f17h_perfmon_event_map[].
>
> It's proposed to fix it in a more generic approach:
> - decouple the available_event_types from the CPUID 0x0A.EBX bit vector;
> - always get the right perfmon_event_map[] from the host perf interface;
> - dynamically populate {intel|amd}_event_mapping[] during hardware setup;
>
> v1 -> v2 Changelog:
> - Drop some merged patches and one misunderstood patch;
> - Rename bitmap name from "avail_cpuid_events" to "avail_perf_hw_ids";
> - Fix a kernel test robot compiler warning;
>
> Previous:
> https://lore.kernel.org/kvm/[email protected]/
>
> Like Xu (3):
> KVM: x86/pmu: Replace pmu->available_event_types with a new BITMAP
> perf: x86/core: Add interface to query perfmon_event_map[] directly
> KVM: x86/pmu: Setup the {intel|amd}_event_mapping[] when hardware_setup
>
> arch/x86/events/core.c | 9 ++++
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/include/asm/perf_event.h | 2 +
> arch/x86/kvm/pmu.c | 25 ++++++++++-
> arch/x86/kvm/pmu.h | 2 +
> arch/x86/kvm/svm/pmu.c | 23 ++--------
> arch/x86/kvm/vmx/pmu_intel.c | 72 ++++++++++++++++++++-----------
> arch/x86/kvm/x86.c | 1 +
> 8 files changed, 89 insertions(+), 47 deletions(-)
>
On 1/17/22 09:53, Like Xu wrote:
> From: Like Xu <[email protected]>
>
> Currently, we have [intel|knc|p4|p6]_perfmon_event_map on the Intel
> platforms and amd_[f17h]_perfmon_event_map on the AMD platforms.
>
> Early clumsy KVM code or other potential perf_event users may have
> hard-coded these perfmon_maps (e.g., arch/x86/kvm/svm/pmu.c), so
> it would not make sense to program a common hardware event based
> on the generic "enum perf_hw_id" once the two tables do not match.
>
> Let's provide an interface for callers outside the perf subsystem to get
> the counter config based on the perfmon_event_map currently in use,
> and it also helps to save bytes.
>
> Cc: Peter Zijlstra <[email protected]>
> Signed-off-by: Like Xu <[email protected]>
> ---
> arch/x86/events/core.c | 9 +++++++++
> arch/x86/include/asm/perf_event.h | 2 ++
> 2 files changed, 11 insertions(+)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 38b2c779146f..751048f4cc97 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -693,6 +693,15 @@ void x86_pmu_disable_all(void)
> }
> }
>
> +u64 perf_get_hw_event_config(int perf_hw_id)
> +{
> + if (perf_hw_id < x86_pmu.max_events)
> + return x86_pmu.event_map(perf_hw_id);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
> +
> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
> {
> return static_call(x86_pmu_guest_get_msrs)(nr);
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 8fc1b5003713..d1e325517b74 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -492,9 +492,11 @@ static inline void perf_check_microcode(void) { }
>
> #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
> +extern u64 perf_get_hw_event_config(int perf_hw_id);
> extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
> #else
> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
> +u64 perf_get_hw_event_config(int perf_hw_id);
Should this be an inline that returns 0?
> static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
> {
> return -1;
Peter, please review/ack this.
Paolo
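For illustration, a minimal sketch of the stub Paolo is asking about,
mirroring the x86_perf_get_lbr() stub next to it (an assumption about
the eventual fix, not code from this series):

static inline u64 perf_get_hw_event_config(int perf_hw_id)
{
	return 0;
}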
On 1/17/22 09:53, Like Xu wrote:
> +/* Mapping between CPUID 0x0A.EBX bit vector and enum perf_hw_id. */
> +static inline int map_unavail_bit_to_perf_hw_id(int bit)
> +{
> + switch (bit) {
> + case 0:
> + case 1:
> + return bit;
> + case 2:
> + return PERF_COUNT_HW_BUS_CYCLES;
> + case 3:
> + case 4:
> + case 5:
> + case 6:
> + return --bit;
> + }
> +
> + return PERF_COUNT_HW_MAX;
> +}
> +
Please use an array here (e.g. cpuid_event_to_perf_hw_id_map[]).
Paolo
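For illustration, a sketch of the array form Paolo suggests; the name
cpuid_event_to_perf_hw_id_map[] is his example, and the entries simply
mirror the switch statement above:

/* Index: CPUID 0x0A.EBX bit; value: the corresponding enum perf_hw_id. */
static const int cpuid_event_to_perf_hw_id_map[] = {
	[0] = PERF_COUNT_HW_CPU_CYCLES,
	[1] = PERF_COUNT_HW_INSTRUCTIONS,
	[2] = PERF_COUNT_HW_BUS_CYCLES,
	[3] = PERF_COUNT_HW_CACHE_REFERENCES,
	[4] = PERF_COUNT_HW_CACHE_MISSES,
	[5] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
	[6] = PERF_COUNT_HW_BRANCH_MISSES,
};

A caller would bounds-check the bit against ARRAY_SIZE() and fall back
to PERF_COUNT_HW_MAX, matching the default return of the switch.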
On 1/17/22 09:53, Like Xu wrote:
> +
> + for (i = 0; i < PERF_COUNT_HW_MAX; i++) {
> + config = perf_get_hw_event_config(i) & 0xFFFFULL;
> +
> + kernel_hw_events[i] = (struct kvm_event_hw_type_mapping){
> + .eventsel = config & ARCH_PERFMON_EVENTSEL_EVENT,
> + .unit_mask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8,
> + .event_type = i,
> + };
Should event_type be PERF_COUNT_HW_MAX if config is zero?
Thanks,
Paolo
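If so, one possible shape is the following sketch (an assumption, not
what the patch does; Like Xu explains later in the thread why the
unguarded form was kept):

	kernel_hw_events[i] = (struct kvm_event_hw_type_mapping){
		.eventsel = config & ARCH_PERFMON_EVENTSEL_EVENT,
		.unit_mask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8,
		.event_type = config ? i : PERF_COUNT_HW_MAX,
	};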
On Wed, Feb 2, 2022 at 6:43 AM Peter Zijlstra <[email protected]> wrote:
> Urgh... hate on kvm being a module again. We really need something like
> EXPORT_SYMBOL_KVM() or something.
Perhaps we should reconsider the current approach of treating the
guest as a client of the host perf subsystem via kvm as a proxy. There
are several drawbacks to the current approach:
1) If the guest actually sets the counter mask (and invert counter
mask) or edge detect in the event selector, we ignore it, because we
have no way of requesting that from perf.
2) If a system-wide pinned counter preempts one of kvm's thread-pinned
counters, we have no way of letting the guest know, because the
architectural specification doesn't allow counters to be suspended.
3) TDX is going to pull the rug out from under us anyway. When the TDX
module usurps control of the PMU, any active host counters are going
to stop counting. We are going to need a way of telling the host perf
subsystem what's happening, or other host perf clients are going to
get bogus data.
Given what's coming with TDX, I wonder if we should just bite the
bullet and cede the PMU to the guest while it's running, even for
non-TDX guests. That would solve (1) and (2) as well.
On Mon, Jan 17, 2022 at 04:53:06PM +0800, Like Xu wrote:
> From: Like Xu <[email protected]>
>
> Currently, we have [intel|knc|p4|p6]_perfmon_event_map on the Intel
> platforms and amd_[f17h]_perfmon_event_map on the AMD platforms.
>
> Early clumsy KVM code or other potential perf_event users may have
> hard-coded these perfmon_maps (e.g., arch/x86/kvm/svm/pmu.c), so
> it would not make sense to program a common hardware event based
> on the generic "enum perf_hw_id" once the two tables do not match.
>
> Let's provide an interface for callers outside the perf subsystem to get
> the counter config based on the perfmon_event_map currently in use,
> and it also helps to save bytes.
>
> Cc: Peter Zijlstra <[email protected]>
> Signed-off-by: Like Xu <[email protected]>
> ---
> arch/x86/events/core.c | 9 +++++++++
> arch/x86/include/asm/perf_event.h | 2 ++
> 2 files changed, 11 insertions(+)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 38b2c779146f..751048f4cc97 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -693,6 +693,15 @@ void x86_pmu_disable_all(void)
> }
> }
>
> +u64 perf_get_hw_event_config(int perf_hw_id)
> +{
> + if (perf_hw_id < x86_pmu.max_events)
> + return x86_pmu.event_map(perf_hw_id);
> +
> + return 0;
> +}
Where does perf_hw_id come from? Does this need to be
array_index_nospec() ?
> +EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
Urgh... hate on kvm being a module again. We really need something like
EXPORT_SYMBOL_KVM() or something.
> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
> {
> return static_call(x86_pmu_guest_get_msrs)(nr);
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 8fc1b5003713..d1e325517b74 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -492,9 +492,11 @@ static inline void perf_check_microcode(void) { }
>
> #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
> +extern u64 perf_get_hw_event_config(int perf_hw_id);
> extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
> #else
> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
> +u64 perf_get_hw_event_config(int perf_hw_id);
I think Paolo already spotted this one.
> static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
> {
> return -1;
> --
> 2.33.1
>
Jim,
I agree.
It seems inevitable that the demands on the vPMU will continue to
grow. On the current path, we will keep adding complexity to perf
until it essentially has a raw MSR interface used by KVM.
Your proposal gives a clean separation where KVM notifies perf when
the PMC will stop counting. That handles both vPMU and TDX. And it
allows KVM to provide the full expressiveness to guests.
Dave Dunn
On Wed, Feb 2, 2022 at 2:35 PM Jim Mattson <[email protected]> wrote:
> Given what's coming with TDX, I wonder if we should just bite the
> bullet and cede the PMU to the guest while it's running, even for
> non-TDX guests. That would solve (1) and (2) as well.
On 2/2/2022 10:43 pm, Peter Zijlstra wrote:
> On Mon, Jan 17, 2022 at 04:53:06PM +0800, Like Xu wrote:
>> From: Like Xu <[email protected]>
>>
>> Currently, we have [intel|knc|p4|p6]_perfmon_event_map on the Intel
>> platforms and amd_[f17h]_perfmon_event_map on the AMD platforms.
>>
>> Early clumsy KVM code or other potential perf_event users may have
>> hard-coded these perfmon_maps (e.g., arch/x86/kvm/svm/pmu.c), so
>> it would not make sense to program a common hardware event based
>> on the generic "enum perf_hw_id" once the two tables do not match.
>>
>> Let's provide an interface for callers outside the perf subsystem to get
>> the counter config based on the perfmon_event_map currently in use,
>> and it also helps to save bytes.
>>
>> Cc: Peter Zijlstra <[email protected]>
>> Signed-off-by: Like Xu <[email protected]>
>> ---
>> arch/x86/events/core.c | 9 +++++++++
>> arch/x86/include/asm/perf_event.h | 2 ++
>> 2 files changed, 11 insertions(+)
>>
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 38b2c779146f..751048f4cc97 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -693,6 +693,15 @@ void x86_pmu_disable_all(void)
>> }
>> }
>>
>> +u64 perf_get_hw_event_config(int perf_hw_id)
>> +{
>> + if (perf_hw_id < x86_pmu.max_events)
>> + return x86_pmu.event_map(perf_hw_id);
>> +
>> + return 0;
>> +}
>
> Where does perf_hw_id come from? Does this need to be
> array_index_nospec() ?
A valid incoming parameter will be a member of the generic "enum perf_hw_id" table.
If array_index_nospec() helps, how about:
+u64 perf_get_hw_event_config(int hw_event)
+{
+ int max = x86_pmu.max_events;
+
+ if (hw_event < max)
+ return x86_pmu.event_map(array_index_nospec(hw_event, max));
+
+ return 0;
+}
>
>> +EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
>
> Urgh... hate on kvm being a module again. We really need something like
> EXPORT_SYMBOL_KVM() or something.
Rather than maintaining the obsolete {intel|amd}_event_mapping[] outside
the perf subsystem, a more appropriate method is to set up the table in
KVM through the new perf interface.
Well, I probably need Paolo's decision to trigger more changes, whether it's
introducing EXPORT_SYMBOL_KVM or making a built-in KVM a necessary
prerequisite for vPMU.
>
>
>> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
>> {
>> return static_call(x86_pmu_guest_get_msrs)(nr);
>> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
>> index 8fc1b5003713..d1e325517b74 100644
>> --- a/arch/x86/include/asm/perf_event.h
>> +++ b/arch/x86/include/asm/perf_event.h
>> @@ -492,9 +492,11 @@ static inline void perf_check_microcode(void) { }
>>
>> #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
>> extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
>> +extern u64 perf_get_hw_event_config(int perf_hw_id);
>> extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
>> #else
>> struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
>> +u64 perf_get_hw_event_config(int perf_hw_id);
>
> I think Paolo already spotted this one.
Indeed, I will apply it.
>
>> static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
>> {
>> return -1;
>> --
>> 2.33.1
>>
On 1/2/2022 8:28 pm, Paolo Bonzini wrote:
> On 1/17/22 09:53, Like Xu wrote:
>> +
>> + for (i = 0; i < PERF_COUNT_HW_MAX; i++) {
>> + config = perf_get_hw_event_config(i) & 0xFFFFULL;
>> +
>> + kernel_hw_events[i] = (struct kvm_event_hw_type_mapping){
>> + .eventsel = config & ARCH_PERFMON_EVENTSEL_EVENT,
>> + .unit_mask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8,
>> + .event_type = i,
>> + };
>
> Should event_type be PERF_COUNT_HW_MAX if config is zero?
Emm, we do not assume that a hardware event encoded with "eventsel=0 &&
unit_mask=0" (in this case, config is zero) is illegal.
If perf core puts such an encoded event into the "enum perf_hw_id" table,
which is out of the scope of KVM, we have to set it up with a valid
event_type value instead of PERF_COUNT_HW_MAX.
In this proposal, the perf_hw_id returned from kvm_x86_ops.pmu_ops->pmc_perf_hw_id()
is only valid and used if "pmc->eventsel & 0xFFFFULL" is non-zero; otherwise
reprogram_gp_counter() will fall back to the PERF_TYPE_RAW type.
Please let me know if you need more clarification on this change.
>
> Thanks,
>
> Paolo
>
Changed the subject to attract more attention.
On 3/2/2022 6:35 am, Jim Mattson wrote:
> On Wed, Feb 2, 2022 at 6:43 AM Peter Zijlstra <[email protected]> wrote:
>
>> Urgh... hate on kvm being a module again. We really need something like
>> EXPORT_SYMBOL_KVM() or something.
>
> Perhaps we should reconsider the current approach of treating the
> guest as a client of the host perf subsystem via kvm as a proxy. There
The story of vPMU begins with the perf_event_create_kernel_counter()
interface, which is a generic in-kernel API.
> are several drawbacks to the current approach:
> 1) If the guest actually sets the counter mask (and invert counter
> mask) or edge detect in the event selector, we ignore it, because we
> have no way of requesting that from perf.
We need more guest use cases and voices when it comes to vPMU
capabilities on a case-by-case basis (invert counter mask or edge detect).
KVM may set these bits before vm-entry if doing so does not affect the host.
> 2) If a system-wide pinned counter preempts one of kvm's thread-pinned
> counters, we have no way of letting the guest know, because the
> architectural specification doesn't allow counters to be suspended.
One such case is the NMI watchdog. The truth is that KVM can check the status
of the event before vm-entry to learn that the back-end counter has been
rescheduled to another perf user, but it can't do anything about it.
I had drafted a vPMU notification patch set to synchronize the status of the
back-end counters to the guest, using a PV method with the help of vPMI.
I'm sceptical about this direction and the efficiency of the notification mechanism
I have designed, but I do hope that others have better ideas and quality code.
The number of counters is relatively plentiful, but it's a pain in the arse for LBR,
and I may post a slow path with a high performance cost if you're interested.
> 3) TDX is going to pull the rug out from under us anyway. When the TDX
> module usurps control of the PMU, any active host counters are going
> to stop counting. We are going to need a way of telling the host perf
I presume that the performance counter data of a TDX guest is isolated
from the host, and that host counters (from a host perf agent) will not
stop but keep counting; they cover TDX guests only in debug mode.
Off-topic: not all of the capabilities of the core PMU can or should be
used by TDX guests (given that the firmware behavior for PMU resources
is constantly changing and not even defined).
> subsystem what's happening, or other host perf clients are going to
> get bogus data.
I predict perf core will be patched to sense (via a callback where KVM
notifies perf, smart perf_event running time, or a host stable TSC diff)
and report these kinds of data holes from TDX, SGX, and AMD-SEV in the report.
>
> Given what's coming with TDX, I wonder if we should just bite the
> bullet and cede the PMU to the guest while it's running, even for
> non-TDX guests. That would solve (1) and (2) as well.
The idea of "ceding the PMU to the guest" or "vPMU pass-through" is not really
new to us; there are two main obstacles: one is the isolation of the NMI paths
(including GLOBAL_* MSRs, like STATUS); the other is preventing the guest from
sniffing host data, which would be a perfect method for guest escape attackers.
At one time, we proposed to statically reserve counters from the host
perf view at guest startup, but this option was NAK-ed by PeterZ.
We may have a chance to reserve counters at host startup
until we have a more guest-PMU-friendly hardware design. :D
I'd like to add one more vPMU discussion point: support for
heterogeneous VMX cores (running vPMU on ADL or later).
The current capability of vPMU depends on which physical CPU
the KVM module is initialized on, and the current upstream
solution brings concerns in terms of vCPU compatibility and migration.
Thanks,
Like Xu
On Wed, Feb 09, 2022 at 04:10:48PM +0800, Like Xu wrote:
> On 3/2/2022 6:35 am, Jim Mattson wrote:
> > 3) TDX is going to pull the rug out from under us anyway. When the TDX
> > module usurps control of the PMU, any active host counters are going
> > to stop counting. We are going to need a way of telling the host perf
>
> I presume that performance counters data of TDX guest is isolated for host,
> and host counters (from host perf agent) will not stop and keep counting
> only for TDX guests in debug mode.
Right, lots of people like profiling guests from the host. That allows
including all the other virt gunk that supports the guest.
Guests must not unilaterally steal the PMU.
> At one time, we proposed to statically reserve counters from the host
> perf view at guest startup, but this option was NAK-ed from PeterZ.
Because of counter constraints: if you hand C0 to the guest, the host
can no longer count certain events, which is bad.
On Wed, Feb 02, 2022 at 02:35:45PM -0800, Jim Mattson wrote:
> 3) TDX is going to pull the rug out from under us anyway. When the TDX
> module usurps control of the PMU, any active host counters are going
> to stop counting. We are going to need a way of telling the host perf
> subsystem what's happening, or other host perf clients are going to
> get bogus data.
That's not acceptable behaviour. I'm all for unilaterally killing any
guest that does this.
On 2/9/22 05:21, Peter Zijlstra wrote:
> On Wed, Feb 02, 2022 at 02:35:45PM -0800, Jim Mattson wrote:
>> 3) TDX is going to pull the rug out from under us anyway. When the TDX
>> module usurps control of the PMU, any active host counters are going
>> to stop counting. We are going to need a way of telling the host perf
>> subsystem what's happening, or other host perf clients are going to
>> get bogus data.
> That's not acceptable behaviour. I'm all for unilaterally killing any
> guest that does this.
I'm not sure where the "bogus data" comes from or to what that refers
specifically. But, the host does have some level of control:
> The host VMM controls whether a guest TD can use the performance
> monitoring ISA using the TD’s ATTRIBUTES.PERFMON bit...
So, worst-case, we don't need to threaten to kill guests. The host can
just deny access in the first place.
I'm not too picky about what the PMU does, but the TDX behavior didn't
seem *that* onerous to me. The gory details are all in "On-TD
Performance Monitoring" here:
> https://www.intel.com/content/dam/develop/external/us/en/documents/tdx-module-1.0-public-spec-v0.931.pdf
My read on it is that TDX host _can_ cede the PMU to TDX guests if it
wants. I assume the context-switching model Jim mentioned is along the
lines of what TDX is already doing on host<->guest transitions.
On 2/9/22 10:47, Jim Mattson wrote:
> On Wed, Feb 9, 2022 at 7:41 AM Dave Hansen <[email protected]> wrote:
>>
>> On 2/9/22 05:21, Peter Zijlstra wrote:
>>> On Wed, Feb 02, 2022 at 02:35:45PM -0800, Jim Mattson wrote:
>>>> 3) TDX is going to pull the rug out from under us anyway. When the TDX
>>>> module usurps control of the PMU, any active host counters are going
>>>> to stop counting. We are going to need a way of telling the host perf
>>>> subsystem what's happening, or other host perf clients are going to
>>>> get bogus data.
>>> That's not acceptable behaviour. I'm all for unilaterally killing any
>>> guest that does this.
>>
>> I'm not sure where the "bogus data" comes from or to what that refers
>> specifically. But, the host does have some level of control:
>
> I was referring to gaps in the collection of data that the host perf
> subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
> guest. This can potentially be a problem if someone is trying to
> measure events per unit of time.
Ahh, that makes sense.
Does SGX cause problem for these people? It can create some of the same
collection gaps:
performance monitoring activities are suppressed when entering
an opt-out (of performance monitoring) enclave.
>>> The host VMM controls whether a guest TD can use the performance
>>> monitoring ISA using the TD’s ATTRIBUTES.PERFMON bit...
>>
>> So, worst-case, we don't need to threaten to kill guests. The host can
>> just deny access in the first place.
>>
>> I'm not too picky about what the PMU does, but the TDX behavior didn't
>> seem *that* onerous to me. The gory details are all in "On-TD
>> Performance Monitoring" here:
>>
>>> https://www.intel.com/content/dam/develop/external/us/en/documents/tdx-module-1.0-public-spec-v0.931.pdf
>>
>> My read on it is that TDX host _can_ cede the PMU to TDX guests if it
>> wants. I assume the context-switching model Jim mentioned is along the
>> lines of what TDX is already doing on host<->guest transitions.
>
> Right. If ATTRIBUTES.PERFMON is set, then "perfmon state is
> context-switched by the Intel TDX module across TD entry and exit
> transitions." Furthermore, the VMM has no access to guest perfmon
> state.
>
> If you're saying that setting this bit is unacceptable, then perhaps
> the TDX folks need to redesign their in-guest PMU support.
It's fine with *me*, but I'm not too picky about the PMU. But, it
sounded like Peter was pretty concerned about it.
In any case, if we (Linux folks) need a change, it's *possible* because
most of this policy is implemented in software in the TDX module. It
would just be painful for the folks who came up with the existing mechanism.
On Wed, Feb 9, 2022 at 7:41 AM Dave Hansen <[email protected]> wrote:
>
> On 2/9/22 05:21, Peter Zijlstra wrote:
> > On Wed, Feb 02, 2022 at 02:35:45PM -0800, Jim Mattson wrote:
> >> 3) TDX is going to pull the rug out from under us anyway. When the TDX
> >> module usurps control of the PMU, any active host counters are going
> >> to stop counting. We are going to need a way of telling the host perf
> >> subsystem what's happening, or other host perf clients are going to
> >> get bogus data.
> > That's not acceptable behaviour. I'm all for unilaterally killing any
> > guest that does this.
>
> I'm not sure where the "bogus data" comes from or to what that refers
> specifically. But, the host does have some level of control:
I was referring to gaps in the collection of data that the host perf
subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
guest. This can potentially be a problem if someone is trying to
measure events per unit of time.
> > The host VMM controls whether a guest TD can use the performance
> > monitoring ISA using the TD’s ATTRIBUTES.PERFMON bit...
>
> So, worst-case, we don't need to threaten to kill guests. The host can
> just deny access in the first place.
>
> I'm not too picky about what the PMU does, but the TDX behavior didn't
> seem *that* onerous to me. The gory details are all in "On-TD
> Performance Monitoring" here:
>
> > https://www.intel.com/content/dam/develop/external/us/en/documents/tdx-module-1.0-public-spec-v0.931.pdf
>
> My read on it is that TDX host _can_ cede the PMU to TDX guests if it
> wants. I assume the context-switching model Jim mentioned is along the
> lines of what TDX is already doing on host<->guest transitions.
Right. If ATTRIBUTES.PERFMON is set, then "perfmon state is
context-switched by the Intel TDX module across TD entry and exit
transitions." Furthermore, the VMM has no access to guest perfmon
state.
If you're saying that setting this bit is unacceptable, then perhaps
the TDX folks need to redesign their in-guest PMU support.
Dave,
In my opinion, the right policy depends on what the host owner and
guest owner are trying to achieve.
If the PMU is being used to locate places where performance could be
improved in the system, there are two sub scenarios:
- The host and guest are owned by same entity that is optimizing
overall system. In this case, the guest doesn't need PMU access and
better information is provided by profiling the entire system from the
host.
- The host and guest are owned by different entities. In this
case, profiling from the host can identify perf issues in the guest.
But what action can be taken? The host entity must communicate issues
back to the guest owner through some sort of out-of-band information
channel. On the other hand, preempting the host PMU to give the guest
a fully functional PMU serves this use case well.
TDX and SGX (outside of debug mode) strongly assume different
entities. And Intel is doing this to reduce insight of the host into
guest operations. So in my opinion, preemption makes sense.
There are also scenarios where the host owner is trying to identify
systemwide impacts of guest actions. For example, detecting memory
bandwidth consumption or split locks. In this case, host control
without preemption is necessary.
To address these various scenarios, it seems like the host needs to be
able to have policy control on whether it is willing to have the PMU
preempted by the guest.
But I don't see what scenario is well served by the current situation
in KVM. Currently the guest will either be told it has no PMU (which
is fine) or that it has full control of a PMU. If the guest is told
it has full control of the PMU, it actually doesn't. But instead of
losing counters on well defined events (from the guest perspective),
they simply stop counting depending on what the host is doing with the
PMU.
On the other hand, if we flip it around the semantics are more clear.
A guest will be told it has no PMU (which is fine) or that it has full
control of the PMU. If the guest is told that it has full control of
the PMU, it does. And the host (which is the thing that granted the
full PMU to the guest) knows that events inside the guest are not
being measured. This results in all entities seeing something that
can be reasoned about from their perspective.
Thanks,
Dave Dunn
On Wed, Feb 9, 2022 at 10:57 AM Dave Hansen <[email protected]> wrote:
> > I was referring to gaps in the collection of data that the host perf
> > subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
> > guest. This can potentially be a problem if someone is trying to
> > measure events per unit of time.
>
> Ahh, that makes sense.
>
> Does SGX cause problem for these people? It can create some of the same
> collection gaps:
>
> performance monitoring activities are suppressed when entering
> an opt-out (of performance monitoring) enclave.
On Wed, Feb 09, 2022, Peter Zijlstra wrote:
> On Wed, Feb 09, 2022 at 04:10:48PM +0800, Like Xu wrote:
> > On 3/2/2022 6:35 am, Jim Mattson wrote:
> > > 3) TDX is going to pull the rug out from under us anyway. When the TDX
> > > module usurps control of the PMU, any active host counters are going
> > > to stop counting. We are going to need a way of telling the host perf
> >
> > I presume that performance counters data of TDX guest is isolated for host,
> > and host counters (from host perf agent) will not stop and keep counting
> > only for TDX guests in debug mode.
>
> Right, lots of people like profiling guests from the host. That allows
> including all the other virt gunk that supports the guest.
>
> Guests must not unilaterally steal the PMU.
The proposal is to add an option to allow userspace to gift the PMU to the guest,
not to let the guest steal the PMU at will. Off by default, certain capabilities
required, etc... are all completely ok and desired, e.g. we also have use cases
where we don't want to let the guest touch the PMU.
David's response in the original thread[*] explains things far better than I can do.
[*] https://lore.kernel.org/all/CABOYuvbPL0DeEgV4gsC+v786xfBAo3T6+7XQr7cVVzbaoFoEAg@mail.gmail.com
On 2/9/2022 2:24 PM, David Dunn wrote:
> Dave,
>
> In my opinion, the right policy depends on what the host owner and
> guest owner are trying to achieve.
>
> If the PMU is being used to locate places where performance could be
> improved in the system, there are two sub scenarios:
> - The host and guest are owned by same entity that is optimizing
> overall system. In this case, the guest doesn't need PMU access and
> better information is provided by profiling the entire system from the
> host.
> - The host and guest are owned by different entities. In this
> case, profiling from the host can identify perf issues in the guest.
> But what action can be taken? The host entity must communicate issues
> back to the guest owner through some sort of out-of-band information
> channel. On the other hand, preempting the host PMU to give the guest
> a fully functional PMU serves this use case well.
>
> TDX and SGX (outside of debug mode) strongly assume different
> entities. And Intel is doing this to reduce insight of the host into
> guest operations. So in my opinion, preemption makes sense.
>
> There are also scenarios where the host owner is trying to identify
> systemwide impacts of guest actions. For example, detecting memory
> bandwidth consumption or split locks. In this case, host control
> without preemption is necessary.
>
> To address these various scenarios, it seems like the host needs to be
> able to have policy control on whether it is willing to have the PMU
> preempted by the guest.
>
> But I don't see what scenario is well served by the current situation
> in KVM. Currently the guest will either be told it has no PMU (which
> is fine) or that it has full control of a PMU. If the guest is told
> it has full control of the PMU, it actually doesn't. But instead of
> losing counters on well defined events (from the guest perspective),
> they simply stop counting depending on what the host is doing with the
> PMU.
For the current perf subsystem, a PMU should be shared among different
users via the multiplexing mechanism if the resource is limited. No one
has full control of a PMU for its lifetime. A user can only have the PMU in
its given period. I think the user can understand how long it runs via
total_time_enabled and total_time_running.
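For reference, a minimal userspace sketch (not from this thread) of how
a perf_event_open() user observes that, assuming attr.read_format was
set to PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING:

#include <stdint.h>

struct read_format {
	uint64_t value;		/* raw counter value */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
};

/* time_running < time_enabled means the event was multiplexed;
 * scale the raw value to estimate the full-period count. */
static uint64_t scaled_count(const struct read_format *rf)
{
	if (!rf->time_running)
		return 0;
	return rf->value * rf->time_enabled / rf->time_running;
}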
For a guest, it should rely on the host to tell whether the PMU resource
is available. But unfortunately, I don't think we have such a
notification mechanism in KVM. The guest has the wrong impression that
the guest can have full control of the PMU.
In my opinion, we should add the notification mechanism in KVM. When the
PMU resource is limited, the guest can know whether it's multiplexing or
can choose to reschedule the event.
But it seems the notification mechanism may not work for the TDX case?
>
> On the other hand, if we flip it around the semantics are more clear.
> A guest will be told it has no PMU (which is fine) or that it has full
> control of the PMU. If the guest is told that it has full control of
> the PMU, it does. And the host (which is the thing that granted the
> full PMU to the guest) knows that events inside the guest are not
> being measured. This results in all entities seeing something that
> can be reasoned about from their perspective.
>
I assume that this is for the TDX case (where the notification mechanism
doesn't work). The host still controls all the PMU resources. The TDX
guest is treated as a super-user who can 'own' a PMU. The admin in the
host can configure/change the owned PMUs of the TDX. Personally, I think
it makes sense. But please keep in mind that the counters are not
identical. There are some special events that can only run on a specific
counter. If the special counter is assigned to TDX, other entities can
never run some events. We should let other entities know if it happens.
Or we should never let non-host entities own the special counter.
Thanks,
Kan
> Thanks,
>
> Dave Dunn
>
> On Wed, Feb 9, 2022 at 10:57 AM Dave Hansen <[email protected]> wrote:
>
>>> I was referring to gaps in the collection of data that the host perf
>>> subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
>>> guest. This can potentially be a problem if someone is trying to
>>> measure events per unit of time.
>>
>> Ahh, that makes sense.
>>
>> Does SGX cause problem for these people? It can create some of the same
>> collection gaps:
>>
>> performance monitoring activities are suppressed when entering
>> an opt-out (of performance monitoring) enclave.
On 10/2/2022 3:24 am, David Dunn wrote:
> Dave,
>
> In my opinion, the right policy depends on what the host owner and
> guest owner are trying to achieve.
And more, who is the arbiter of the final resource allocation
and who is the consumer of performance data.
This is not a one-shot deal to cede the PMU to the guest or not.
>
> If the PMU is being used to locate places where performance could be
> improved in the system, there are two sub scenarios:
> - The host and guest are owned by same entity that is optimizing
> overall system. In this case, the guest doesn't need PMU access and
> better information is provided by profiling the entire system from the
> host.
It's not only about who the entity is but more about capabilities:
Can we profile the entire system, including normal guests, while at the same time
some guest PMU features are still available? Isn't that a better thing?
> - The host and guest are owned by different entities. In this
> case, profiling from the host can identify perf issues in the guest.
> But what action can be taken? The host entity must communicate issues
> back to the guest owner through some sort of out-of-band information
Uh, I'd like to publish my pv-PMU idea (via static guest physical memory) and a
web subscription model to share un-core and off-core performance data with
trusted guests.
> channel. On the other hand, preempting the host PMU to give the guest
> a fully functional PMU serves this use case well.
The idea of "preempting PMU from the host" that requires KVM to prioritize
the PMU over any user on the host, was marked as a dead end by PeterZ.
We need to actively restrict host behavior from grabbing guest pmu
and in that case, "a fully functional guest PMU" serves well.
>
> TDX and SGX (outside of debug mode) strongly assume different
> entities. And Intel is doing this to reduce insight of the host into
> guest operations. So in my opinion, preemption makes sense.
Just like the TDX guest's use of pCPU resources, this is isolated from the
host world, but only if the host scheduler allows it to happen.
>
> There are also scenarios where the host owner is trying to identify
> systemwide impacts of guest actions. For example, detecting memory
> bandwidth consumption or split locks. In this case, host control
or auto NUMA balancing, bla bla...
> without preemption is necessary.
I have some POC code that allows both guest and host PMUs to work
at the same time for completely different consumers of performance data.
>
> To address these various scenarios, it seems like the host needs to be
> able to have policy control on whether it is willing to have the PMU
> preempted by the guest.
The policy is the perf scheduling. The order is:
CPU-pinned
Task-pinned
CPU-flexible
Task-flexible
There is nothing special about KVM in the perf semantics, which is an art.
>
> But I don't see what scenario is well served by the current situation
> in KVM. Currently the guest will either be told it has no PMU (which
> is fine) or that it has full control of a PMU. If the guest is told
> it has full control of the PMU, it actually doesn't. But instead of
> losing counters on well defined events (from the guest perspective),
> they simply stop counting depending on what the host is doing with the
> PMU.
I do understand your annoyance very well, just as I did at first.
>
> On the other hand, if we flip it around the semantics are more clear.
> A guest will be told it has no PMU (which is fine) or that it has full
> control of the PMU. If the guest is told that it has full control of
> the PMU, it does. And the host (which is the thing that granted the
> full PMU to the guest) knows that events inside the guest are not
> being measured. This results in all entities seeing something that
> can be reasoned about from their perspective.
If my comments help you deliver some interesting code for vPMU,
I'm looking forward to it. :D
>
> Thanks,
>
> Dave Dunn
>
> On Wed, Feb 9, 2022 at 10:57 AM Dave Hansen <[email protected]> wrote:
>
>>> I was referring to gaps in the collection of data that the host perf
>>> subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
>>> guest. This can potentially be a problem if someone is trying to
>>> measure events per unit of time.
>>
>> Ahh, that makes sense.
>>
>> Does SGX cause problem for these people? It can create some of the same
>> collection gaps:
>>
>> performance monitoring activities are suppressed when entering
>> an opt-out (of performance monitoring) enclave.
On Thu, Feb 10, 2022 at 7:34 AM Liang, Kan <[email protected]> wrote:
>
>
>
> On 2/9/2022 2:24 PM, David Dunn wrote:
> > Dave,
> >
> > In my opinion, the right policy depends on what the host owner and
> > guest owner are trying to achieve.
> >
> > If the PMU is being used to locate places where performance could be
> > improved in the system, there are two sub scenarios:
> > - The host and guest are owned by same entity that is optimizing
> > overall system. In this case, the guest doesn't need PMU access and
> > better information is provided by profiling the entire system from the
> > host.
> > - The host and guest are owned by different entities. In this
> > case, profiling from the host can identify perf issues in the guest.
> > But what action can be taken? The host entity must communicate issues
> > back to the guest owner through some sort of out-of-band information
> > channel. On the other hand, preempting the host PMU to give the guest
> > a fully functional PMU serves this use case well.
> >
> > TDX and SGX (outside of debug mode) strongly assume different
> > entities. And Intel is doing this to reduce insight of the host into
> > guest operations. So in my opinion, preemption makes sense.
> >
> > There are also scenarios where the host owner is trying to identify
> > systemwide impacts of guest actions. For example, detecting memory
> > bandwidth consumption or split locks. In this case, host control
> > without preemption is necessary.
> >
> > To address these various scenarios, it seems like the host needs to be
> > able to have policy control on whether it is willing to have the PMU
> > preempted by the guest.
> >
> > But I don't see what scenario is well served by the current situation
> > in KVM. Currently the guest will either be told it has no PMU (which
> > is fine) or that it has full control of a PMU. If the guest is told
> > it has full control of the PMU, it actually doesn't. But instead of
> > losing counters on well defined events (from the guest perspective),
> > they simply stop counting depending on what the host is doing with the
> > PMU.
>
> For the current perf subsystem, a PMU should be shared among different
> users via the multiplexing mechanism if the resource is limited. No one
> has full control of a PMU for its lifetime. A user can only have the PMU in
> its given period. I think the user can understand how long it runs via
> total_time_enabled and total_time_running.
For most clients, yes. For kvm, no. KVM currently tosses
total_time_enabled and total_time_running in the bitbucket. It could
extrapolate, but that would result in loss of precision. Some guest
uses of the PMU would not be able to cope (e.g.
https://github.com/rr-debugger/rr).
> For a guest, it should rely on the host to tell whether the PMU resource
> is available. But unfortunately, I don't think we have such a
> notification mechanism in KVM. The guest has the wrong impression that
> the guest can have full control of the PMU.
That is the only impression that the architectural specification
allows the guest to have. On Intel, we can mask off individual fixed
counters, and we can reduce the number of GP counters, but AMD offers
us no such freedom. Whatever resources we advertise to the guest must
be available for its use whenever it wants. Otherwise, PMU
virtualization is simply broken.
> In my opinion, we should add the notification mechanism in KVM. When the
> PMU resource is limited, the guest can know whether it's multiplexing or
> can choose to reschedule the event.
That sounds like a paravirtual perf mechanism, rather than PMU
virtualization. Are you suggesting that we not try to virtualize the
PMU? Unfortunately, PMU virtualization is what we have customers
clamoring for. No one is interested in a paravirtual perf mechanism.
For example, when will VTune in the guest know how to use your
proposed paravirtual interface?
> But it seems the notification mechanism may not work for the TDX case?
> >
> > On the other hand, if we flip it around the semantics are more clear.
> > A guest will be told it has no PMU (which is fine) or that it has full
> > control of the PMU. If the guest is told that it has full control of
> > the PMU, it does. And the host (which is the thing that granted the
> > full PMU to the guest) knows that events inside the guest are not
> > being measured. This results in all entities seeing something that
> > can be reasoned about from their perspective.
> >
>
> I assume that this is for the TDX case (where the notification mechanism
> doesn't work). The host still control all the PMU resources. The TDX
> guest is treated as a super-user who can 'own' a PMU. The admin in the
> host can configure/change the owned PMUs of the TDX. Personally, I think
> it makes sense. But please keep in mind that the counters are not
> identical. There are some special events that can only run on a specific
> counter. If the special counter is assigned to TDX, other entities can
> never run some events. We should let other entities know if it happens.
> Or we should never let non-host entities own the special counter.
Right; the counters are not fungible. Ideally, when the guest requests
a particular counter, that is the counter it gets. If it is given a
different counter, the counter it is given must provide the same
behavior as the requested counter for the event in question.
On 10/2/2022 2:57 am, Dave Hansen wrote:
> On 2/9/22 10:47, Jim Mattson wrote:
>> On Wed, Feb 9, 2022 at 7:41 AM Dave Hansen <[email protected]> wrote:
>>>
>>> On 2/9/22 05:21, Peter Zijlstra wrote:
>>>> On Wed, Feb 02, 2022 at 02:35:45PM -0800, Jim Mattson wrote:
>>>>> 3) TDX is going to pull the rug out from under us anyway. When the TDX
>>>>> module usurps control of the PMU, any active host counters are going
>>>>> to stop counting. We are going to need a way of telling the host perf
>>>>> subsystem what's happening, or other host perf clients are going to
>>>>> get bogus data.
> >>>> That's not acceptable behaviour. I'm all for unilaterally killing any
>>>> guest that does this.
>>>
>>> I'm not sure where the "bogus data" comes or to what that refers
>>> specifically. But, the host does have some level of control:
>>
>> I was referring to gaps in the collection of data that the host perf
>> subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
>> guest. This can potentially be a problem if someone is trying to
>> measure events per unit of time.
>
> Ahh, that makes sense.
>
> Does SGX cause problem for these people? It can create some of the same
> collection gaps:
>
> performance monitoring activities are suppressed when entering
> an opt-out (of performance monitoring) enclave.
>
Are end perf users aware of the collection gaps caused by code running
under SGX?
As far as I know there is no such tooling yet; we may need a tool like
"perf-kvm" for SGX enclaves.
>>>> The host VMM controls whether a guest TD can use the performance
>>>> monitoring ISA using the TD’s ATTRIBUTES.PERFMON bit...
>>>
>>> So, worst-case, we don't need to threaten to kill guests. The host can
>>> just deny access in the first place.
The KVM module parameter "enable_pmu" might be respected here,
together with a per-TD-guest user-space control option.
>>>
>>> I'm not too picky about what the PMU does, but the TDX behavior didn't
>>> seem *that* onerous to me. The gory details are all in "On-TD
>>> Performance Monitoring" here:
>>>
>>>> https://www.intel.com/content/dam/develop/external/us/en/documents/tdx-module-1.0-public-spec-v0.931.pdf
>>>
>>> My read on it is that TDX host _can_ cede the PMU to TDX guests if it
>>> wants. I assume the context-switching model Jim mentioned is along the
>>> lines of what TDX is already doing on host<->guest transitions.
>>
>> Right. If ATTRIBUTES.PERFMON is set, then "perfmon state is
>> context-switched by the Intel TDX module across TD entry and exit
>> transitions." Furthermore, the VMM has no access to guest perfmon
>> state.
Even when the guest TD is under off-TD debug and is untrusted?
I think we (host administrators) need to profile off-TD guests to locate
performance bottlenecks with a holistic view, regardless of whether the
ATTRIBUTES.PERFMON bit is cleared or not.
Perhaps shared memory could be a way to pass guest performance data
to the host if PMU activity is suppressed across TD entry and exit
transitions while the guest TD is under off-TD debug and is untrusted.
>>
>> If you're saying that setting this bit is unacceptable, then perhaps
>> the TDX folks need to redesign their in-guest PMU support.
>
> It's fine with *me*, but I'm not too picky about the PMU. But, it
> sounded like Peter was pretty concerned about it.
One principle I've seen is that a (TD or normal) guest must not
compromise the host's access to PMU resources (at least at host run time).
It is fine and expected that performance data within a trusted TDX guest
is logically isolated from host data (without artificial aggregation).
>
> In any case, if we (Linux folks) need a change, it's *possible* because
> most of this policy is implemented in software in the TDX module. It
> would just be painful for the folks who came up with the existing mechanism.
>
Once the code to enable ATTRIBUTES.PERFMON appears on the mailing list,
that will be a good window for further discussion.
On Thu, Feb 10, 2022, Like Xu wrote:
> On 10/2/2022 5:00 am, Sean Christopherson wrote:
> > On Wed, Feb 09, 2022, Peter Zijlstra wrote:
> > > Guests must not unilaterally steal the PMU.
> >
> > The proposal is to add an option to allow userspace to gift the PMU to the guest,
>
> Please define the verb "gift" in more detail.
Add a knob that allows host userspace to toggle between host perf having
sole ownership of the PMU, versus ownership of the PMU being "gifted" to KVM guests
upon VM-Entry and returned back to the host at VM-Exit.
IIUC, it's the same idea as PT's PT_MODE_HOST_GUEST mode, just applied to the PMU.
By default, the host would have sole ownership, and access to the knob would be
restricted appropriately. KVM would disallow creation of any VM that requires
joint ownership, e.g. launching a TDX guest would require the knob to be enabled.
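As a minimal sketch of what such a knob might look like (the parameter
name and the vm_requires_pmu_ownership() helper below are illustrative
only, not existing KVM code; the existing precedent is kvm_intel's
"pt_mode" parameter):

    /* Illustrative sketch, not an existing KVM parameter. */
    static bool gift_pmu;   /* false: host perf owns the PMU (default) */
    module_param(gift_pmu, bool, 0444);

    static int kvm_check_pmu_policy(struct kvm *kvm)
    {
            /* e.g. a TDX guest needs perfmon state context-switched
             * across TD entry/exit, i.e. joint ownership. */
            if (vm_requires_pmu_ownership(kvm) && !gift_pmu)
                    return -EOPNOTSUPP;
            return 0;
    }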
> How do we balance the performance data collection needs of
> 'hypervisor user space' against those of 'system-wide profiler user space'?
If host userspace enables the knob and transitions into a joint ownership mode,
then host userspace is explicitly acknowledging that it will no longer be able
to profile KVM guests.
Balancing between host and guest then gets factored into VM placement, e.g. VMs
that need or are paying for access to the PMU can only land on systems that are
configured for joint ownership. If profiling the guest from the host is important,
then place those guests only on hosts with sole ownership.
Kan,
On Thu, Feb 10, 2022 at 11:46 AM Liang, Kan <[email protected]> wrote:
> No, we don't, at least for Linux, because the host owns everything. It
> doesn't need the MSR to tell which counter is in use; we track that in software.
>
> For a new request from the guest to own a counter, I guess it may be
> worth implementing. But yes, existing/legacy guests never check
> the MSR.
This is the expectation of all software that uses the PMU in every
guest. It isn't just the Linux perf system.
The KVM vPMU model we have today results in PMU-utilizing software
simply not working properly in a guest. The only case that can
consistently "work" today is not giving the guest a PMU at all.
And that's why you are hearing requests to gift the entire PMU to the
guest while it is running. All existing PMU software knows about the
various constraints on exactly how each MSR must be used to get sane
data. And by gifting the entire PMU it allows that software to work
properly. But that has to be controlled by policy at host level such
that the owner of the host knows that they are not going to have PMU
visibility into guests that have control of PMU.
Dave Dunn
On 2/10/2022 2:16 PM, Jim Mattson wrote:
> On Thu, Feb 10, 2022 at 10:30 AM Liang, Kan <[email protected]> wrote:
>>
>>
>>
>> On 2/10/2022 11:34 AM, Jim Mattson wrote:
>>> On Thu, Feb 10, 2022 at 7:34 AM Liang, Kan <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>> On 2/9/2022 2:24 PM, David Dunn wrote:
>>>>> Dave,
>>>>>
>>>>> In my opinion, the right policy depends on what the host owner and
>>>>> guest owner are trying to achieve.
>>>>>
>>>>> If the PMU is being used to locate places where performance could be
>>>>> improved in the system, there are two sub scenarios:
>>>>> - The host and guest are owned by same entity that is optimizing
>>>>> overall system. In this case, the guest doesn't need PMU access and
>>>>> better information is provided by profiling the entire system from the
>>>>> host.
>>>>> - The host and guest are owned by different entities. In this
>>>>> case, profiling from the host can identify perf issues in the guest.
>>>>> But what action can be taken? The host entity must communicate issues
>>>>> back to the guest owner through some sort of out-of-band information
>>>>> channel. On the other hand, preempting the host PMU to give the guest
>>>>> a fully functional PMU serves this use case well.
>>>>>
>>>>> TDX and SGX (outside of debug mode) strongly assume different
>>>>> entities. And Intel is doing this to reduce insight of the host into
>>>>> guest operations. So in my opinion, preemption makes sense.
>>>>>
>>>>> There are also scenarios where the host owner is trying to identify
>>>>> systemwide impacts of guest actions. For example, detecting memory
>>>>> bandwidth consumption or split locks. In this case, host control
>>>>> without preemption is necessary.
>>>>>
>>>>> To address these various scenarios, it seems like the host needs to be
>>>>> able to have policy control on whether it is willing to have the PMU
>>>>> preempted by the guest.
>>>>>
>>>>> But I don't see what scenario is well served by the current situation
>>>>> in KVM. Currently the guest will either be told it has no PMU (which
>>>>> is fine) or that it has full control of a PMU. If the guest is told
>>>>> it has full control of the PMU, it actually doesn't. But instead of
>>>>> losing counters on well defined events (from the guest perspective),
>>>>> they simply stop counting depending on what the host is doing with the
>>>>> PMU.
>>>>
[...]
>> OK. If KVM cannot notify the guest, maybe the guest can query the usage
>> of counters before using one. There is an IA32_PERF_GLOBAL_INUSE MSR,
>> introduced with Arch Perfmon v4, which provides an "InUse" bit for
>> each counter. But it cannot guarantee that the counter will always be
>> owned by the guest unless the host treats the guest as a super-user and
>> agrees not to touch its counters. This would only work on Intel
>> platforms.
>
> Simple question: Do all existing guests (Windows and Linux are my
> primary interest) query that MSR today? If not, then this proposal is
> DOA.
>
No, we don't, at least for Linux, because the host owns everything. It
doesn't need the MSR to tell which counter is in use; we track that in software.
For a new request from the guest to own a counter, I guess it may be
worth implementing. But yes, existing/legacy guests never check
the MSR.
[...]
>>>
>>> Right; the counters are not fungible. Ideally, when the guest requests
>>> a particular counter, that is the counter it gets. If it is given a
>>> different counter, the counter it is given must provide the same
>>> behavior as the requested counter for the event in question.
>>
>> Ideally, yes, but sometimes KVM/the host may not know whether another
>> counter can substitute for the requested one, because KVM/the host
>> cannot retrieve the event constraint information from the guest.
>
> In that case, don't do it. When the guest asks for a specific counter,
> give the guest that counter. This isn't rocket science.
>
Sounds like the guest can own everything if it wants. Maybe that makes
sense from the virtualization perspective, but it sounds too
aggressive to me. :)
Thanks,
Kan
On 10/2/2022 5:00 am, Sean Christopherson wrote:
> On Wed, Feb 09, 2022, Peter Zijlstra wrote:
>> On Wed, Feb 09, 2022 at 04:10:48PM +0800, Like Xu wrote:
>>> On 3/2/2022 6:35 am, Jim Mattson wrote:
>>>> 3) TDX is going to pull the rug out from under us anyway. When the TDX
>>>> module usurps control of the PMU, any active host counters are going
>>>> to stop counting. We are going to need a way of telling the host perf
>>>
>>> I presume that the performance counter data of a TDX guest is isolated
>>> from the host, and that host counters (from a host perf agent) will not
>>> stop and will keep counting only for TDX guests in debug mode.
>>
>> Right, lots of people like profiling guests from the host. That allows
>> including all the other virt gunk that supports the guest.
We (in real-world production environments) have a number of PMU use cases
(system-wide, or zoomed in on each guest) for characterizing the different
guests on a host.
>>
>> Guests must not unilaterally steal the PMU.
>
> The proposal is to add an option to allow userspace to gift the PMU to the guest,
Please define the verb "gift" in more detail.
How do we balance the performance data collection needs of
'hypervisor user space' against those of 'system-wide profiler user space'?
> not to let the guest steal the PMU at will. Off by default, certain capabilities
> required, etc... are all completely ok and desired, e.g. we also have use cases
> where we don't want to let the guest touch the PMU.
One ideological hurdle here (between upstream and vendor-defined Linux) is
whether we let the host's perf subsystem remain the final arbiter of PMU
resource allocation, or instead give up the ability to reclaim this
resource once it has been dispatched or gifted to a (TDX or normal) guest.
>
> David's response in the original thread[*] explains things far better than I can do.
>
> [*] https://lore.kernel.org/all/CABOYuvbPL0DeEgV4gsC+v786xfBAo3T6+7XQr7cVVzbaoFoEAg@mail.gmail.com
On 2/10/2022 11:34 AM, Jim Mattson wrote:
> On Thu, Feb 10, 2022 at 7:34 AM Liang, Kan <[email protected]> wrote:
[...]
>
> That sounds like a paravirtual perf mechanism, rather than PMU
> virtualization. Are you suggesting that we not try to virtualize the
> PMU? Unfortunately, PMU virtualization is what we have customers
> clamoring for. No one is interested in a paravirtual perf mechanism.
> For example, when will VTune in the guest know how to use your
> proposed paravirtual interface?
OK. If KVM cannot notify the guest, maybe the guest can query the usage
of counters before using one. There is an IA32_PERF_GLOBAL_INUSE MSR,
introduced with Arch Perfmon v4, which provides an "InUse" bit for
each counter. But it cannot guarantee that the counter will always be
owned by the guest unless the host treats the guest as a super-user and
agrees not to touch its counters. This would only work on Intel
platforms.
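A sketch of such a guest-side probe, assuming the MSR index given in
the SDM for IA32_PERF_GLOBAL_INUSE (0x392); the helper is illustrative,
not existing kernel code:

    #define MSR_PERF_GLOBAL_INUSE  0x392   /* Arch Perfmon v4 */

    /* Bit n covers IA32_PERFEVTSELn; fixed counters are tracked in the
     * upper bits. A set bit means another agent (e.g. the host) has
     * already claimed the counter. */
    static bool gp_counter_in_use(int idx)
    {
            u64 inuse;

            rdmsrl(MSR_PERF_GLOBAL_INUSE, inuse);
            return inuse & BIT_ULL(idx);
    }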
[...]
>
> Right; the counters are not fungible. Ideally, when the guest requests
> a particular counter, that is the counter it gets. If it is given a
> different counter, the counter it is given must provide the same
> behavior as the requested counter for the event in question.
Ideally, yes, but sometimes KVM/the host may not know whether another
counter can substitute for the requested one, because KVM/the host
cannot retrieve the event constraint information from the guest.
For example, the Precise Distribution (PDist) feature is enabled only
for GP counter 0 on SPR. Perf uses precise level 3 (a SW variable) to
indicate the feature, so KVM/the host never knows whether the guest is
relying on PDist, as the sketch below illustrates.
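A sketch of how a guest perf user expresses that constraint, purely in
software that KVM never sees (only the resulting MSR writes are
intercepted):

    /* "cycles:ppp" -- precise_ip 3 requests PDist-grade precision,
     * which on SPR is only schedulable on GP counter 0. */
    struct perf_event_attr attr = {
            .type       = PERF_TYPE_HARDWARE,
            .size       = sizeof(attr),
            .config     = PERF_COUNT_HW_CPU_CYCLES,
            .precise_ip = 3,
    };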
I have a patch that forces the perf scheduler to start from the regular
counters, which may mitigate the issue but cannot fix it. (I will post
the patch separately.)
Or we should never let the guest own the special counters. Although the
guest would lose some special events, I guess the host would more likely
be willing to let the guest own a regular counter.
Thanks,
Kan
On Thu, Feb 10, 2022 at 10:30 AM Liang, Kan <[email protected]> wrote:
[...]
> OK. If KVM cannot notify the guest, maybe the guest can query the usage
> of counters before using one. There is an IA32_PERF_GLOBAL_INUSE MSR,
> introduced with Arch Perfmon v4, which provides an "InUse" bit for
> each counter. But it cannot guarantee that the counter will always be
> owned by the guest unless the host treats the guest as a super-user and
> agrees not to touch its counters. This would only work on Intel
> platforms.
Simple question: Do all existing guests (Windows and Linux are my
primary interest) query that MSR today? If not, then this proposal is
DOA.
[...]
> >
> > Right; the counters are not fungible. Ideally, when the guest requests
> > a particular counter, that is the counter it gets. If it is given a
> > different counter, the counter it is given must provide the same
> > behavior as the requested counter for the event in question.
>
> Ideally, yes, but sometimes KVM/the host may not know whether another
> counter can substitute for the requested one, because KVM/the host
> cannot retrieve the event constraint information from the guest.
In that case, don't do it. When the guest asks for a specific counter,
give the guest that counter. This isn't rocket science.
On 2/10/2022 2:55 PM, David Dunn wrote:
> Kan,
>
> On Thu, Feb 10, 2022 at 11:46 AM Liang, Kan <[email protected]> wrote:
[...]
> And that's why you are hearing requests to gift the entire PMU to the
> guest while it is running. All existing PMU software knows about the
> various constraints on exactly how each MSR must be used to get sane
> data. And by gifting the entire PMU it allows that software to work
> properly. But that has to be controlled by policy at host level such
> that the owner of the host knows that they are not going to have PMU
> visibility into guests that have control of PMU.
>
I think here is how a guest event works today with KVM and the perf
subsystem.
 - The guest creates an event A.
 - The guest kernel assigns a guest counter M to event A and
configures the related MSRs of guest counter M.
 - KVM intercepts the MSR accesses and creates a host event B. (Host
event B is based on the settings of guest counter M. As I said, at
least for Linux, some SW configuration impacts the counter assignment,
and KVM never knows it. Event B can only be an event similar to A.)
 - The Linux perf subsystem assigns a physical counter N to host event
B according to event B's constraints. (N may not be the same as M,
because A and B may have different event constraints.)
As you can see, even if the entire PMU is given to the guest, we still
cannot guarantee that physical counter M will be assigned to guest
event A. A condensed sketch of the KVM step follows below.
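The KVM step in the middle is, roughly, the following condensed sketch
(modeled on pmc_reprogram_counter() in arch/x86/kvm/pmu.c; most fields
and all error handling are elided, and overflow_fn/pmc stand in for the
real arguments):

    /* The guest's eventsel write becomes an ordinary host perf_event
     * request, with no way to demand a particular hardware counter. */
    struct perf_event_attr attr = {
            .type         = PERF_TYPE_RAW,
            .size         = sizeof(attr),
            .config       = config,   /* derived from the guest's eventsel */
            .exclude_host = 1,        /* count only while the guest runs */
    };

    event = perf_event_create_kernel_counter(&attr, -1 /* any CPU */,
                                             current, overflow_fn, pmc);
    /* perf is then free to place this event on any compatible counter N. */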
How to fix it? The only thing I can imagine is "passthrough": let KVM
directly assign counter M to the guest. To me, this policy sounds like
letting KVM replace perf as the controller of the whole PMU, handing
the resources over to the guest. Is that what we want?
Thanks,
Kan
On Fri, Feb 11, 2022 at 6:11 AM Liang, Kan <[email protected]> wrote:
[...]
>
> I think here is how a guest event works today with KVM and the perf
> subsystem.
>  - The guest creates an event A.
>  - The guest kernel assigns a guest counter M to event A and
> configures the related MSRs of guest counter M.
>  - KVM intercepts the MSR accesses and creates a host event B. (Host
> event B is based on the settings of guest counter M. As I said, at
> least for Linux, some SW configuration impacts the counter assignment,
> and KVM never knows it. Event B can only be an event similar to A.)
>  - The Linux perf subsystem assigns a physical counter N to host event
> B according to event B's constraints. (N may not be the same as M,
> because A and B may have different event constraints.)
>
> As you can see, even if the entire PMU is given to the guest, we still
> cannot guarantee that physical counter M will be assigned to guest
> event A.
All we know about the guest is that it has programmed virtual counter
M. It seems obvious to me that we can satisfy that request by giving
it physical counter M. If, for whatever reason, we give it physical
counter N instead, and M and N are not completely fungible, then we
have failed.
> How to fix it? The only thing I can imagine is "passthrough": let KVM
> directly assign counter M to the guest. To me, this policy sounds like
> letting KVM replace perf as the controller of the whole PMU, handing
> the resources over to the guest. Is that what we want?
We want PMU virtualization to work. There are at least two ways of doing that:
1) Cede the entire PMU to the guest while it's running.
2) Introduce a new "ultimate" priority level in the host perf
subsystem. Only KVM can request events at the ultimate priority, and
these requests supersede any other requests.
Other solutions are welcome.
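For context, the closest mechanism host perf offers today is a pinned
event, which still only fails politely rather than superseding other
users; a user-space sketch (usual perf_event_open() header boilerplate
omitted):

    /* A pinned event refuses multiplexing: if it cannot stay on the
     * PMU, it goes into an error state (read() then returns 0 bytes)
     * instead of silently rotating. An "ultimate" priority would have
     * to go further and actively preempt every competing user. */
    struct perf_event_attr attr = {
            .type   = PERF_TYPE_HARDWARE,
            .size   = sizeof(attr),
            .config = PERF_COUNT_HW_INSTRUCTIONS,
            .pinned = 1,
    };
    int fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
                     -1 /* any cpu */, -1 /* no group */, 0);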
On 2/11/2022 1:08 PM, Jim Mattson wrote:
> On Fri, Feb 11, 2022 at 6:11 AM Liang, Kan <[email protected]> wrote:
[...]
> We want PMU virtualization to work. There are at least two ways of doing that:
> 1) Cede the entire PMU to the guest while it's running.
So the guest will take over control of the entire PMU while it's
running. I know some people want to do system-wide monitoring; that
case will fail.
I'm not sure whether you can fully trust the guest. If malware runs in
the guest, I don't know whether it can harm the entire system. I'm not
a security expert, but it sounds dangerous.
I hope administrators know what they are doing when choosing this policy.
> 2) Introduce a new "ultimate" priority level in the host perf
> subsystem. Only KVM can request events at the ultimate priority, and
> these requests supersede any other requests.
The "ultimate" priority level doesn't help in the above case. The
counter M may not bring the precise which guest requests. I remember you
called it "broken".
KVM can fails the case, but KVM cannot notify the guest. The guest still
see wrong result.
>
> Other solutions are welcome.
I don't have a perfect solution that achieves all your requirements.
From my understanding, the guest has to compromise, either by
tolerating some errors or by dropping some features (e.g., some special
events). With that, we could consider the above "ultimate" priority
level policy. The default policy would be the same as the current
implementation, where host perf treats all users, including the guest,
equally. Administrators could set the "ultimate" priority level policy,
which might let KVM/the guest pin/own some regular counters via the
perf subsystem. That's just my personal opinion, for your reference.
Thanks,
Kan
On Wed, Feb 9, 2022 at 10:57 AM Dave Hansen <[email protected]> wrote:
> Does SGX cause problem for these people? It can create some of the same
> collection gaps:
>
> performance monitoring activities are suppressed when entering
> an opt-out (of performance monitoring) enclave.
There's our precedent!
On Fri, Feb 11, 2022 at 1:47 PM Liang, Kan <[email protected]> wrote:
[...]
> > We want PMU virtualization to work. There are at least two ways of doing that:
> > 1) Cede the entire PMU to the guest while it's running.
>
> So the guest will take over control of the entire PMU while it's
> running. I know some people want to do system-wide monitoring; that
> case will fail.
We have system-wide monitoring for fleet efficiency, but since there's
nothing we can do about the efficiency of the guest (and those cycles
are paid for by the customer, anyway), I don't think our efficiency
experts lose any important insights if guest cycles are a blind spot.
> I'm not sure whether you can fully trust the guest. If malware runs in
> the guest, I don't know whether it can harm the entire system. I'm not
> a security expert, but it sounds dangerous.
> I hope administrators know what they are doing when choosing this policy.
Virtual machines are inherently dangerous. :-)
Despite our concerns about PMU side-channels, Intel is constantly
reminding us that no such attacks are yet known. We would probably
restrict some events to guests that occupy an entire socket, just to
be safe.
Note that on the flip side, TDX and SEV are all about catering to
guests that don't trust the host. Those customers probably don't want
the host to be able to use the PMU to snoop on guest activity.
> > 2) Introduce a new "ultimate" priority level in the host perf
> > subsystem. Only KVM can request events at the ultimate priority, and
> > these requests supersede any other requests.
>
> The "ultimate" priority level doesn't help in the above case. The
> counter M may not bring the precise which guest requests. I remember you
> called it "broken".
Ideally, ultimate priority also comes with the ability to request
specific counters.
> KVM can fail the request, but KVM cannot notify the guest, so the guest
> still sees wrong results.
>
> >
> > Other solutions are welcome.
>
> I don't have a perfect solution that achieves all your requirements.
> From my understanding, the guest has to compromise, either by
> tolerating some errors or by dropping some features (e.g., some special
> events). With that, we could consider the above "ultimate" priority
> level policy. The default policy would be the same as the current
> implementation, where host perf treats all users, including the guest,
> equally. Administrators could set the "ultimate" priority level policy,
> which might let KVM/the guest pin/own some regular counters via the
> perf subsystem. That's just my personal opinion, for your reference.
I disagree. The guest does not have to be compromised. For a proof of
concept, see VMware ESXi. Probably Microsoft Hyper-V as well, though I
haven't checked.
On 2/12/2022 6:31 PM, Jim Mattson wrote:
> On Fri, Feb 11, 2022 at 1:47 PM Liang, Kan <[email protected]> wrote:
[...]
>
> We have system-wide monitoring for fleet efficiency, but since there's
> nothing we can do about the efficiency of the guest (and those cycles
> are paid for by the customer, anyway), I don't think our efficiency
> experts lose any important insights if guest cycles are a blind spot.
Other users, e.g., the NMI watchdog, also occupy a performance counter.
I think the NMI watchdog is enabled by default, at least in the current
Linux kernel. You would have to disable all such users on the host while
the guest is running.
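For example, the perf-based hard lockup detector can be turned off at
run time before ceding the PMU, which releases the counter it occupies:

    echo 0 > /proc/sys/kernel/nmi_watchdog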
[...]
> I disagree. The guest does not have to be compromised. For a proof of
> concept, see VMware ESXi. Probably Microsoft Hyper-V as well, though I
> haven't checked.
As far as I know, VMware ESXi has its own VMkernel, which can own the
entire HW PMU. KVM is part of the Linux kernel, and the HW PMU should
be shared among components/users. I think the cases are different.
Also, from what I searched on the VMware website, they still encounter
the case that a guest VM may not get a performance monitoring counter.
It looks like their solution is to let the guest OS check the availability
of the counter, which is similar to the solution I mentioned (Use
GLOBAL_INUSE MSR).
"If an ESXi host's BIOS uses a performance counter or if Fault Tolerance
is enabled, some virtual performance counters might not be available for
the virtual machine to use."
https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.vm_admin.doc/GUID-F920A3C7-3B42-4E78-8EA7-961E49AF479D.html
"In general, if a physical CPU PMC is in use, the corresponding virtual
CPU PMC is not functional and is unavailable for use by the guest. Guest
OS software detects unavailable general purpose PMCs by checking for a
non-zero event select MSR value when a virtual machine powers on."
https://kb.vmware.com/s/article/2030221
Thanks,
Kan
On Mon, Feb 14, 2022 at 1:55 PM Liang, Kan <[email protected]> wrote:
>
>
>
> On 2/12/2022 6:31 PM, Jim Mattson wrote:
> > On Fri, Feb 11, 2022 at 1:47 PM Liang, Kan <[email protected]> wrote:
> >>
> >>
> >>
> >> On 2/11/2022 1:08 PM, Jim Mattson wrote:
> >>> On Fri, Feb 11, 2022 at 6:11 AM Liang, Kan <[email protected]> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 2/10/2022 2:55 PM, David Dunn wrote:
> >>>>> Kan,
> >>>>>
> >>>>> On Thu, Feb 10, 2022 at 11:46 AM Liang, Kan <[email protected]> wrote:
> >>>>>
> >>>>>> No, we don't, at least for Linux. Because the host owns everything. It
> >>>>>> doesn't need the MSR to tell which one is in use. We track it in an SW way.
> >>>>>>
> >>>>>> For the new request from the guest to own a counter, I guess maybe it is
> >>>>>> worth implementing it. But yes, the existing/legacy guest never checks
> >>>>>> the MSR.
> >>>>>
> >>>>> This is the expectation of all software that uses the PMU in every
> >>>>> guest. It isn't just the Linux perf system.
> >>>>>
> >>>>> The KVM vPMU model we have today results in PMU-utilizing software
> >>>>> simply not working properly in a guest. The only case that can
> >>>>> consistently "work" today is not giving the guest a PMU at all.
> >>>>>
> >>>>> And that's why you are hearing requests to gift the entire PMU to the
> >>>>> guest while it is running. All existing PMU software knows about the
> >>>>> various constraints on exactly how each MSR must be used to get sane
> >>>>> data. And by gifting the entire PMU it allows that software to work
> >>>>> properly. But that has to be controlled by policy at host level such
> >>>>> that the owner of the host knows that they are not going to have PMU
> >>>>> visibility into guests that have control of PMU.
> >>>>>
> >>>>
> >>>> I think here is how a guest event works today with KVM and perf subsystem.
> >>>> - Guest creates an event A
> >>>> - The guest kernel assigns a guest counter M to event A, and configures
> >>>> the related MSRs of the guest counter M.
> >>>> - KVM intercepts the MSR access and creates a host event B. (The
> >>>> host event B is based on the settings of the guest counter M. As I said,
> >>>> at least for Linux, some SW config impacts the counter assignment. KVM
> >>>> never knows it. Event B can only be a similar event to A.)
> >>>> - Linux perf subsystem assigns a physical counter N to a host event
> >>>> B according to event B's constraint. (N may not be the same as M,
> >>>> because A and B may have different event constraints)
> >>>>
> >>>> As you can see, even if the entire PMU is given to the guest, we still
> >>>> cannot guarantee that the physical counter M can be assigned to the
> >>>> guest event A.
> >>>
> >>> All we know about the guest is that it has programmed virtual counter
> >>> M. It seems obvious to me that we can satisfy that request by giving
> >>> it physical counter M. If, for whatever reason, we give it physical
> >>> counter N instead, and M and N are not completely fungible, then we
> >>> have failed.
> >>>
> >>>> How to fix it? The only thing I can imagine is "passthrough". Let KVM
> >>>> directly assign the counter M to the guest. So, to me, this policy sounds
> >>>> like letting KVM replace perf to control the whole PMU resources, and we
> >>>> will then hand them over to our guest. Is that what we want?
> >>>
> >>> We want PMU virtualization to work. There are at least two ways of doing that:
> >>> 1) Cede the entire PMU to the guest while it's running.
> >>
> >> So the guest will take over control of the entire PMU while it's
> >> running. I know someone wants to do system-wide monitoring. This case
> >> will fail.
> >
> > We have system-wide monitoring for fleet efficiency, but since there's
> > nothing we can do about the efficiency of the guest (and those cycles
> > are paid for by the customer, anyway), I don't think our efficiency
> > experts lose any important insights if guest cycles are a blind spot.
>
> Other users, e.g., the NMI watchdog, also occupy a performance counter. I think
> the NMI watchdog is enabled by default at least for the current Linux
> kernel. You have to disable all such cases in the host when the guest is
> running.
It doesn't actually make any sense to run the NMI watchdog while in
the guest, does it?
> >
> >> I'm not sure whether you can fully trust the guest. If malware runs in
> >> the guest, I don't know whether it will harm the entire system. I'm not
> >> a security expert, but it sounds dangerous.
> >> Hope the administrators know what they are doing when choosing this policy.
> >
> > Virtual machines are inherently dangerous. :-)
> >
> > Despite our concerns about PMU side-channels, Intel is constantly
> > reminding us that no such attacks are yet known. We would probably
> > restrict some events to guests that occupy an entire socket, just to
> > be safe.
> >
> > Note that on the flip side, TDX and SEV are all about catering to
> > guests that don't trust the host. Those customers probably don't want
> > the host to be able to use the PMU to snoop on guest activity.
> >
> >>> 2) Introduce a new "ultimate" priority level in the host perf
> >>> subsystem. Only KVM can request events at the ultimate priority, and
> >>> these requests supersede any other requests.
> >>
> >> The "ultimate" priority level doesn't help in the above case. The
> >> counter M may not deliver the precision which the guest requests. I remember you
> >> called it "broken".
> >
> > Ideally, ultimate priority also comes with the ability to request
> > specific counters.
> >
> >> KVM can fail the case, but KVM cannot notify the guest. The guest still
> >> sees wrong results.
> >>
> >>>
> >>> Other solutions are welcome.
> >>
> >> I don't have a perfect solution to achieve all your requirements. Based
> >> on my understanding, the guest has to be compromised by either
> >> tolerating some errors or dropping some features (e.g., some special
> >> events). With that, we may consider the above "ultimate" priority level
> >> policy. The default policy would be the same as the current
> >> implementation, where the host perf treats all the users, including the
> >> guest, equally. The administrators can set the "ultimate" priority level
> >> policy, which may let the KVM/guest pin/own some regular counters via
> >> perf subsystem. That's just my personal opinion for your reference.
> >
> > I disagree. The guest does not have to be compromised. For a proof of
> > concept, see VMware ESXi. Probably Microsoft Hyper-V as well, though I
> > haven't checked.
>
> As far as I know, VMware ESXi has its own VMkernel, which can own the
> entire HW PMU. KVM is part of the Linux kernel, and the HW PMU should
> be shared among components/users. I think the cases are different.
Architecturally, ESXi is not very different from Linux. The VMkernel
is a POSIX-compliant kernel, and VMware's "vmm" is comparable to KVM.
> Also, from what I searched on the VMware website, they still encounter
> the case that a guest VM may not get a performance monitoring counter.
> It looks like their solution is to let the guest OS check the availability
> of the counter, which is similar to the solution I mentioned (Use
> GLOBAL_INUSE MSR).
>
> "If an ESXi host's BIOS uses a performance counter or if Fault Tolerance
> is enabled, some virtual performance counters might not be available for
> the virtual machine to use."
I'm perfectly happy to give up PMCs on Linux under those same conditions.
> https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.vm_admin.doc/GUID-F920A3C7-3B42-4E78-8EA7-961E49AF479D.html
>
> "In general, if a physical CPU PMC is in use, the corresponding virtual
> CPU PMC is not functional and is unavailable for use by the guest. Guest
> OS software detects unavailable general purpose PMCs by checking for a
> non-zero event select MSR value when a virtual machine powers on."
>
> https://kb.vmware.com/s/article/2030221
>
Linux, at least, doesn't do that. Maybe Windows does.
> Thanks,
> Kan
>
On 10/2/2022 11:34 pm, Liang, Kan wrote:
> For the current perf subsystem, a PMU should be shared among different users via
> the multiplexing mechanism if the resource is limited. No one has full control
> of a PMU for its lifetime. A user can only have the PMU in its given period.
Off-topic, does perf have knobs to disable the default multiplexing mechanism
for individual tasks and enforce a first-come, first-served policy for users
of the same priority?
The reported perf data from the multiplexing mechanism may even mislead
the conclusions of subsequent statistically based performance analysis.
On 11/2/2022 1:12 am, Sean Christopherson wrote:
> On Thu, Feb 10, 2022, Like Xu wrote:
>> On 10/2/2022 5:00 am, Sean Christopherson wrote:
>>> On Wed, Feb 09, 2022, Peter Zijlstra wrote:
>>>> Guests must not unilaterally steal the PMU.
>>>
>>> The proposal is to add an option to allow userspace to gift the PMU to the guest,
>>
>> Please define the verb "gift" in more detail.
>
> Add a knob that allows host userspace to control toggle between host perf having
> sole ownership of the PMU, versus ownership of the PMU being "gifted" to KVM guests
> upon VM-Entry and returned back to the host at VM-Exit.
At the vm-entry/exit level of granularity, we're able to do this without the host
perf knob.
At the guest power-on/off level of granularity, perf makes no concessions to KVM.
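For concreteness, the host-side knob described above could be as small as a
module parameter; a purely hypothetical sketch (the name and semantics are
illustrative only, not a proposed ABI):

/*
 * Hypothetical sketch of the ownership knob discussed above; the
 * parameter name and semantics are illustrative only.
 */
#include <linux/moduleparam.h>

static bool gift_pmu_to_guests;
module_param(gift_pmu_to_guests, bool, 0444);
MODULE_PARM_DESC(gift_pmu_to_guests,
    "Cede the PMU to KVM guests at VM-Entry and return it at VM-Exit");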
>
> IIUC, it's the same idea as PT's PT_MODE_HOST_GUEST mode, just applied to the PMU.
TBH, I don't like the design of PT_MODE_HOST_GUEST; it breaks flexibility.
I would prefer to see the use of PT transition to the existing vPMU approach.
>
> By default, the host would have sole ownership, and access to the knob would be
> restricted appropriately. KVM would disallow creating any VM that requires
> joint ownership, e.g. launching a TDX guest would require the knob to be enabled.
The knob implies a per-pCPU control granularity (internal or explicit), for
scalability.
But again, regardless of whether the (TDX) guest has the PMU enabled or not, the
host needs to use the PMU (to profile the host and non-TDX guests) without giving
it away easily at runtime (via the knob).
We should not destroy this kind of hybrid usage by going in a legacy direction;
instead, we can complement the lack of guest PMU functionality with emulation or
full context switching at the vm-entry/exit level of granularity.
>
>> How do we balance the performance data collection needs of the
>> 'hypervisor user space' and the 'system-wide profiler user space' ?
>
> If host userspace enables the knob and transitions into a joint ownership mode,
> then host userspace is explicitly acknowledging that it will no longer be able
> to profile KVM guests.
AFAIK, most cloud providers don't want to lose this flexibility, as it would
leave hundreds of "profile KVM guests" use cases with nowhere to land.
>
> Balancing between host and guest then gets factored into VM placement, e.g. VMs
> that need or are paying for access to the PMU can only land on systems that are
> configured for joint ownership. If profiling the guest from the host is important,
> then place those guests only on hosts with sole ownership.
If the host user space does not reclaim the PMU from the guest (triumphing
through prioritization; controlling this behavior is like controlling VM
placement), then the guest's PMU functionality loses nothing and remains complete.
On 11/2/2022 3:46 am, Liang, Kan wrote:
>
>
> On 2/10/2022 2:16 PM, Jim Mattson wrote:
>> On Thu, Feb 10, 2022 at 10:30 AM Liang, Kan <[email protected]> wrote:
>>>
>>>
>>>
>>> On 2/10/2022 11:34 AM, Jim Mattson wrote:
>>>> On Thu, Feb 10, 2022 at 7:34 AM Liang, Kan <[email protected]> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 2/9/2022 2:24 PM, David Dunn wrote:
>>>>>> Dave,
>>>>>>
>>>>>> In my opinion, the right policy depends on what the host owner and
>>>>>> guest owner are trying to achieve.
>>>>>>
>>>>>> If the PMU is being used to locate places where performance could be
>>>>>> improved in the system, there are two sub scenarios:
>>>>>> - The host and guest are owned by the same entity that is optimizing
>>>>>> overall system. In this case, the guest doesn't need PMU access and
>>>>>> better information is provided by profiling the entire system from the
>>>>>> host.
>>>>>> - The host and guest are owned by different entities. In this
>>>>>> case, profiling from the host can identify perf issues in the guest.
>>>>>> But what action can be taken? The host entity must communicate issues
>>>>>> back to the guest owner through some sort of out-of-band information
>>>>>> channel. On the other hand, preempting the host PMU to give the guest
>>>>>> a fully functional PMU serves this use case well.
>>>>>>
>>>>>> TDX and SGX (outside of debug mode) strongly assume different
>>>>>> entities. And Intel is doing this to reduce insight of the host into
>>>>>> guest operations. So in my opinion, preemption makes sense.
>>>>>>
>>>>>> There are also scenarios where the host owner is trying to identify
>>>>>> systemwide impacts of guest actions. For example, detecting memory
>>>>>> bandwidth consumption or split locks. In this case, host control
>>>>>> without preemption is necessary.
>>>>>>
>>>>>> To address these various scenarios, it seems like the host needs to be
>>>>>> able to have policy control on whether it is willing to have the PMU
>>>>>> preempted by the guest.
>>>>>>
>>>>>> But I don't see what scenario is well served by the current situation
>>>>>> in KVM. Currently the guest will either be told it has no PMU (which
>>>>>> is fine) or that it has full control of a PMU. If the guest is told
>>>>>> it has full control of the PMU, it actually doesn't. But instead of
>>>>>> losing counters on well defined events (from the guest perspective),
>>>>>> they simply stop counting depending on what the host is doing with the
>>>>>> PMU.
>>>>>
>>>>> For the current perf subsystem, a PMU should be shared among different
>>>>> users via the multiplexing mechanism if the resource is limited. No one
>>>>> has full control of a PMU for its lifetime. A user can only have the PMU in
>>>>> its given period. I think the user can understand how long it runs via
>>>>> total_time_enabled and total_time_running.
>>>>
>>>> For most clients, yes. For kvm, no. KVM currently tosses
>>>> total_time_enabled and total_time_running in the bitbucket. It could
>>>> extrapolate, but that would result in loss of precision. Some guest
>>>> uses of the PMU would not be able to cope (e.g.
>>>> https://github.com/rr-debugger/rr).
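For context, the extrapolation in question is the standard perf scaling step; a
sketch of the arithmetic, and of why the result is only an estimate once
time_running < time_enabled:

/*
 * Sketch of the usual perf multiplexing extrapolation.  When
 * time_running < time_enabled the event was scheduled out for part of
 * the window, so the scaled value is a statistical estimate, which is
 * exactly the precision loss noted above.
 */
static __u64 scale_count(__u64 value, __u64 enabled, __u64 running)
{
    if (!running)
        return 0;
    /* 128-bit intermediate (GCC extension) to avoid overflow. */
    return (__u64)(((unsigned __int128)value * enabled) / running);
}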
>>>>
>>>>> For a guest, it should rely on the host to tell whether the PMU resource
>>>>> is available. But unfortunately, I don't think we have such a
>>>>> notification mechanism in KVM. The guest has the wrong impression that
>>>>> the guest can have full control of the PMU.
>>>>
>>>> That is the only impression that the architectural specification
>>>> allows the guest to have. On Intel, we can mask off individual fixed
>>>> counters, and we can reduce the number of GP counters, but AMD offers
>>>> us no such freedom. Whatever resources we advertise to the guest must
The future may look a little better, with more and more server
hardware being designed with virtualization requirements in mind.
>>>> be available for its use whenever it wants. Otherwise, PMU
>>>> virtualization is simply broken.
YES for "simply broken" but no for "available whenever it wants"
If there is no host (core) pmu user, the guest pmu is fully and architecturally
available.
If there is no perf agent on host (like watchdog),
current guest pmu is working fine except for some emulated instructions.
>>>>
>>>>> In my opinion, we should add the notification mechanism in KVM. When the
>>>>> PMU resource is limited, the guest can know whether it's multiplexing or
>>>>> can choose to reschedule the event.
Eventually, we moved the topic to an open discussion and I am relieved.
The total_time_enabled and total_time_running of the perf_events
created by KVM are quite unreliable and invisible to the guest, and
we may need to clearly define what they really mean, for example
when profiling SGX applications.
The elephant in the vPMU room at the moment is that the guest has
no way of knowing if the physical PMC on the back end of the vPMC
is being multiplexed, even though KVM is able to know.
One way to mitigate this is to allow perf to not apply a multiplexing
policy (via a sys knob), for example with a first-come, first-served policy.
In this case, each PMC user of the same priority is treated fairly; KVM
requests hardware first when the guest uses a vPMC, or requests a
reschedule to another pCPU, and only fails in the worst case.
>>>>
>>>> That sounds like a paravirtual perf mechanism, rather than PMU
>>>> virtualization. Are you suggesting that we not try to virtualize the
>>>> PMU? Unfortunately, PMU virtualization is what we have customers
>>>> clamoring for. No one is interested in a paravirtual perf mechanism.
>>>> For example, when will VTune in the guest know how to use your
>>>> proposed paravirtual interface?
>>>
>>> OK. If KVM cannot notify the guest, maybe the guest can query the usage of
>>> counters before using a counter. There is an IA32_PERF_GLOBAL_INUSE MSR
>>> introduced with Arch perfmon v4. The MSR provides an "InUse" bit for
>>> each counter. But it cannot guarantee that the counter can always be
>>> owned by the guest unless the host treats the guest as a super-user and
>>> agrees to not touch its counter. This only works on Intel
>>> platforms.
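For reference, a guest-side query could look like the sketch below, assuming
the MSR is virtualized. The address is 0x392 per the SDM; it is defined
locally here since I am not sure msr-index.h carries it:

/*
 * Sketch: query IA32_PERF_GLOBAL_INUSE (arch perfmon v4+).  Bit i of
 * the low word is set while IA32_PERFEVTSELi is in use.
 */
#include <asm/msr.h>
#include <linux/bits.h>

#define MSR_PERF_GLOBAL_INUSE    0x392    /* per SDM; may not be in msr-index.h */

static bool gp_counter_in_use(int idx)
{
    u64 inuse;

    rdmsrl(MSR_PERF_GLOBAL_INUSE, inuse);
    return inuse & BIT_ULL(idx);
}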
>>
>> Simple question: Do all existing guests (Windows and Linux are my
>> primary interest) query that MSR today? If not, then this proposal is
>> DOA.
>>
>
> No, we don't, at least for Linux. Because the host owns everything. It doesn't
> need the MSR to tell which one is in use. We track it in an SW way.
Indeed, "the host own everything", which is also the
starting point for the host perf when it received the changes.
>
> For the new request from the guest to own a counter, I guess maybe it is worth
> implementing it. But yes, the existing/legacy guest never checks the MSR.
We probably need an x86-generic notification solution for the worst case.
>
>
>>>>
>>>>> But it seems the notification mechanism may not work for the TDX case?
Shared memory can be used for communication between the host and
the guest, if it's allowed by the TDX guest.
>>>>>>
>>>>>> On the other hand, if we flip it around the semantics are more clear.
>>>>>> A guest will be told it has no PMU (which is fine) or that it has full
>>>>>> control of the PMU. If the guest is told that it has full control of
>>>>>> the PMU, it does. And the host (which is the thing that granted the
>>>>>> full PMU to the guest) knows that events inside the guest are not
>>>>>> being measured. This results in all entities seeing something that
>>>>>> can be reasoned about from their perspective.
>>>>>>
>>>>>
>>>>> I assume that this is for the TDX case (where the notification mechanism
>>>>> doesn't work). The host still controls all the PMU resources. The TDX
>>>>> guest is treated as a super-user who can 'own' a PMU. The admin in the
>>>>> host can configure/change the owned PMUs of the TDX. Personally, I think
>>>>> it makes sense. But please keep in mind that the counters are not
>>>>> identical. There are some special events that can only run on a specific
>>>>> counter. If the special counter is assigned to TDX, other entities can
>>>>> never run some events. We should let other entities know if it happens.
>>>>> Or we should never let non-host entities own the special counter.
>>>>
>>>> Right; the counters are not fungible. Ideally, when the guest requests
>>>> a particular counter, that is the counter it gets. If it is given a
>>>> different counter, the counter it is given must provide the same
>>>> behavior as the requested counter for the event in question.
>>>
>>> Ideally, yes, but sometimes KVM/host may not know whether they can use
>>> another counter to replace the requested counter, because KVM/host
>>> cannot retrieve the event constraint information from the guest.
>>
>> In that case, don't do it. When the guest asks for a specific counter,
>> give the guest that counter. This isn't rocket science.
>>
>
> Sounds like the guest can own everything if it wants. Maybe it makes sense from
> the virtualization perspective. But it sounds too aggressive to me. :)
Until PeterZ changes his mind, upstream may not see this kind of change.
(I actually used to like this design too.)
>
> Thanks,
> Kan
>
>
>>> For example, we have the Precise Distribution (PDist) feature enabled only
>>> for GP counter 0 on SPR. Perf uses precise_level 3 (a SW
>>> variable) to indicate the feature. The KVM/host never knows
>>> whether the guest applies the PDist feature.
Yes, just check what we did on PEBS, which is Acked-by PeterZ.
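For reference, the "precise_level 3" above is the precise_ip field of
perf_event_attr; a request like the sketch below can only be satisfied by a
PDist-capable counter (GP counter 0 on SPR, per the above), and KVM never
sees this field for guest-created events:

/*
 * Sketch: precise_ip = 3 ("ppp") asks for zero-skid sampling, which
 * on SPR constrains the event to the PDist-capable GP counter 0.
 */
struct perf_event_attr attr = {
    .type          = PERF_TYPE_HARDWARE,
    .size          = sizeof(attr),
    .config        = PERF_COUNT_HW_CPU_CYCLES,
    .precise_ip    = 3,         /* maximum precision: skid must be 0 */
    .sample_period = 100003,
};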
>>>
>>> I have a patch that forces the perf scheduler to start from the regular
>>> counters, which may mitigate the issue, but cannot fix it. (I will post
>>> the patch separately.)
>>>
>>> Or we should never let the guest own the special counters. Although the
>>> guest has to lose some special events, I guess the host is more likely to
>>> be willing to let the guest own a regular counter.
AMD seems to do this, but it's just another disable-pmu compromise.
>>>
>>>
>>> Thanks,
>>> Kan
>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Kan
>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> Dave Dunn
>>>>>>
>>>>>> On Wed, Feb 9, 2022 at 10:57 AM Dave Hansen <[email protected]> wrote:
>>>>>>
>>>>>>>> I was referring to gaps in the collection of data that the host perf
>>>>>>>> subsystem doesn't know about if ATTRIBUTES.PERFMON is set for a TDX
>>>>>>>> guest. This can potentially be a problem if someone is trying to
>>>>>>>> measure events per unit of time.
>>>>>>>
>>>>>>> Ahh, that makes sense.
>>>>>>>
>>>>>>> Does SGX cause problems for these people? It can create some of the same
>>>>>>> collection gaps:
>>>>>>>
>>>>>>> performance monitoring activities are suppressed when entering
>>>>>>> an opt-out (of performance monitoring) enclave.
On 15/2/2022 6:55 am, Jim Mattson wrote:
> On Mon, Feb 14, 2022 at 1:55 PM Liang, Kan <[email protected]> wrote:
>>
>>
>>
>> On 2/12/2022 6:31 PM, Jim Mattson wrote:
>>> On Fri, Feb 11, 2022 at 1:47 PM Liang, Kan <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>> On 2/11/2022 1:08 PM, Jim Mattson wrote:
>>>>> On Fri, Feb 11, 2022 at 6:11 AM Liang, Kan <[email protected]> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2/10/2022 2:55 PM, David Dunn wrote:
>>>>>>> Kan,
>>>>>>>
>>>>>>> On Thu, Feb 10, 2022 at 11:46 AM Liang, Kan <[email protected]> wrote:
>>>>>>>
>>>>>>>> No, we don't, at least for Linux. Because the host owns everything. It
>>>>>>>> doesn't need the MSR to tell which one is in use. We track it in an SW way.
>>>>>>>>
>>>>>>>> For the new request from the guest to own a counter, I guess maybe it is
>>>>>>>> worth implementing it. But yes, the existing/legacy guest never checks
>>>>>>>> the MSR.
>>>>>>>
>>>>>>> This is the expectation of all software that uses the PMU in every
>>>>>>> guest. It isn't just the Linux perf system.
Hardware resources will always be limited (the good news is that
the number of counters will increase in the future), and when we have
a common need for the same hardware, we should prioritize overall
fairness over a GUEST-first strategy.
>>>>>>>
>>>>>>> The KVM vPMU model we have today results in PMU-utilizing software
>>>>>>> simply not working properly in a guest. The only case that can
>>>>>>> consistently "work" today is not giving the guest a PMU at all.
Actually, I'd be more interested in checking the exact "not working properly" usage.
>>>>>>>
>>>>>>> And that's why you are hearing requests to gift the entire PMU to the
>>>>>>> guest while it is running. All existing PMU software knows about the
>>>>>>> various constraints on exactly how each MSR must be used to get sane
>>>>>>> data. And by gifting the entire PMU it allows that software to work
>>>>>>> properly. But that has to be controlled by policy at host level such
>>>>>>> that the owner of the host knows that they are not going to have PMU
>>>>>>> visibility into guests that have control of PMU.
Don't forget that the perf subsystem manages uncore PMUs and
other profiling resources as well. Making host perf visible to some
resources and invisible to others would require a big effort, and it
would be very unmaintainable as the PMU hardware resources
become more and more numerous.
IMO, there is no future in having host perf make a special case for KVM.
>>>>>>>
>>>>>>
>>>>>> I think here is how a guest event works today with KVM and perf subsystem.
>>>>>> - Guest creates an event A
>>>>>> - The guest kernel assigns a guest counter M to event A, and configures
>>>>>> the related MSRs of the guest counter M.
>>>>>> - KVM intercepts the MSR access and creates a host event B. (The
>>>>>> host event B is based on the settings of the guest counter M. As I said,
>>>>>> at least for Linux, some SW config impacts the counter assignment. KVM
>>>>>> never knows it. Event B can only be a similar event to A.)
>>>>>> - Linux perf subsystem assigns a physical counter N to a host event
>>>>>> B according to event B's constraint. (N may not be the same as M,
>>>>>> because A and B may have different event constraints)
Counter availability is also shaped by the priorities of previously scheduled events.
This cross-mapping is relatively common and normal in the real world.
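For readers tracking the event A -> event B step above, a much-simplified
sketch of how KVM turns a guest eventsel into a host perf_event (the real
code is pmc_reprogram_counter() in arch/x86/kvm/pmu.c; the helper name
below is illustrative):

/*
 * Simplified sketch of the guest-event-A -> host-event-B step; the
 * real logic lives in pmc_reprogram_counter().
 */
#include <linux/perf_event.h>
#include <linux/sched.h>

static struct perf_event *sketch_create_host_event(u64 guest_eventsel)
{
    struct perf_event_attr attr = {
        .type         = PERF_TYPE_RAW,
        .size         = sizeof(attr),
        .pinned       = true,          /* fail rather than multiplex */
        .exclude_host = 1,             /* count in guest mode only */
        .config       = guest_eventsel & AMD64_RAW_EVENT_MASK,
    };

    /* Host perf, not KVM, picks the physical counter N here. */
    return perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
}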
>>>>>>
>>>>>> As you can see, even if the entire PMU is given to the guest, we still
>>>>>> cannot guarantee that the physical counter M can be assigned to the
>>>>>> guest event A.
>>>>>
>>>>> All we know about the guest is that it has programmed virtual counter
>>>>> M. It seems obvious to me that we can satisfy that request by giving
>>>>> it physical counter M. If, for whatever reason, we give it physical
>>>>> counter N instead, and M and N are not completely fungible, then we
>>>>> have failed.
Only host perf has the global perspective to derive interchangeability,
which is a reasonable and flexible design.
>>>>>
>>>>>> How to fix it? The only thing I can imagine is "passthrough". Let KVM
>>>>>> directly assign the counter M to the guest. So, to me, this policy sounds
>>>>>> like letting KVM replace perf to control the whole PMU resources, and we
>>>>>> will then hand them over to our guest. Is that what we want?
I would prefer to see more code on this idea, as I am very unsure
of my code practice based on the current hardware interface, and
the TDX PMU enablement work might be a good starting point.
>>>>>
>>>>> We want PMU virtualization to work. There are at least two ways of doing that:
>>>>> 1) Cede the entire PMU to the guest while it's running.
>>>>
>>>> So the guest will take over control of the entire PMU while it's
>>>> running. I know someone wants to do system-wide monitoring. This case
>>>> will fail.
I think from the vm-entry/exit level of granularity, we can do it
even if someone wants to do system-wide monitoring, and the
hard part is the sharing, synchronization and filtering of perf data.
>>>
>>> We have system-wide monitoring for fleet efficiency, but since there's
>>> nothing we can do about the efficiency of the guest (and those cycles
>>> are paid for by the customer, anyway), I don't think our efficiency
>>> experts lose any important insights if guest cycles are a blind spot.
Visibility of (non-TDX/SEV) guest performance data helps with globally optimal
solutions, such as memory migration and malicious attack detection, and
there are too many other PMU use cases that ignore the vm-entry boundary.
>>
>> Other users, e.g., the NMI watchdog, also occupy a performance counter. I think
>> the NMI watchdog is enabled by default at least for the current Linux
>> kernel. You have to disable all such cases in the host when the guest is
>> running.
>
> It doesn't actually make any sense to run the NMI watchdog while in
> the guest, does it?
I wouldn't think so, but AFAIK a lot of guest vendor images run the NMI watchdog.
>
>>>
>>>> I'm not sure whether you can fully trust the guest. If malware runs in
>>>> the guest, I don't know whether it will harm the entire system. I'm not
>>>> a security expert, but it sounds dangerous.
>>>> Hope the administrators know what they are doing when choosing this policy.
We always consider the security attack surface when choosing which
PMU features can be virtualized. If someone gives us a negative answer,
we quietly and urgently fix it. We already have the module parameter
for the vPMU, and per-VM parameters are in the works.
>>>
>>> Virtual machines are inherently dangerous. :-)
Without a specific context, any hardware or software stack is inherently dangerous.
>>>
>>> Despite our concerns about PMU side-channels, Intel is constantly
>>> reminding us that no such attacks are yet known. We would probably
>>> restrict some events to guests that occupy an entire socket, just to
>>> be safe.
Socket-level performance data should not be available to current guests.
>>>
>>> Note that on the flip side, TDX and SEV are all about catering to
>>> guests that don't trust the host. Those customers probably don't want
>>> the host to be able to use the PMU to snoop on guest activity.
Of course, for non-TDX/SEV guests, we (upstream) should not prevent the PMU
from being used to snoop on guest activity. How it is used is a matter for
the administrators, but usability or feasibility is a technical consideration.
>>>
>>>>> 2) Introduce a new "ultimate" priority level in the host perf
>>>>> subsystem. Only KVM can request events at the ultimate priority, and
>>>>> these requests supersede any other requests.
>>>>
>>>> The "ultimate" priority level doesn't help in the above case. The
>>>> counter M may not deliver the precision which the guest requests. I remember you
>>>> called it "broken".
>>>
>>> Ideally, ultimate priority also comes with the ability to request
>>> specific counters.
We need to relay the vcpu's specific needs for counters of different
capabilities to the host perf so that it can make the right decisions.
>>>
>>>> KVM can fail the case, but KVM cannot notify the guest. The guest still
>>>> sees wrong results.
Indeed, the pain point is that "KVM cannot notify the guest"; otherwise, we need
to remove all the "reasons for notification" one by one.
>>>>
>>>>>
>>>>> Other solutions are welcome.
>>>>
>>>> I don't have a perfect solution to achieve all your requirements. Based
>>>> on my understanding, the guest has to be compromised by either
>>>> tolerating some errors or dropping some features (e.g., some special
>>>> events). With that, we may consider the above "ultimate" priority level
>>>> policy. The default policy would be the same as the current
>>>> implementation, where the host perf treats all the users, including the
>>>> guest, equally. The administrators can set the "ultimate" priority level
>>>> policy, which may let the KVM/guest pin/own some regular counters via
>>>> perf subsystem. That's just my personal opinion for your reference.
>>>
>>> I disagree. The guest does not have to be compromised. For a proof of
>>> concept, see VMware ESXi. Probably Microsoft Hyper-V as well, though I
>>> haven't checked.
>>
>> As far as I know, VMware ESXi has its own VMkernel, which can own the
>> entire HW PMU. KVM is part of the Linux kernel, and the HW PMU should
>> be shared among components/users. I think the cases are different.
>
> Architecturally, ESXi is not very different from Linux. The VMkernel
> is a POSIX-compliant kernel, and VMware's "vmm" is comparable to KVM.
>
>> Also, from what I searched on the VMware website, they still encounter
>> the case that a guest VM may not get a performance monitoring counter.
>> It looks like their solution is to let the guest OS check the availability
>> of the counter, which is similar to the solution I mentioned (Use
>> GLOBAL_INUSE MSR).
>>
>> "If an ESXi host's BIOS uses a performance counter or if Fault Tolerance
>> is enabled, some virtual performance counters might not be available for
>> the virtual machine to use."
>
> I'm perfectly happy to give up PMCs on Linux under those same conditions.
>
>> https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.vm_admin.doc/GUID-F920A3C7-3B42-4E78-8EA7-961E49AF479D.html
>>
>> "In general, if a physical CPU PMC is in use, the corresponding virtual
>> CPU PMC is not functional and is unavailable for use by the guest. Guest
>> OS software detects unavailable general purpose PMCs by checking for a
>> non-zero event select MSR value when a virtual machine powers on."
>>
>> https://kb.vmware.com/s/article/2030221
>>
> Linux, at least, doesn't do that. Maybe Windows does.
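For reference, a guest-side probe in the style of the VMware KB above might
look roughly like this (hypothetical sketch; as noted, Linux does not
implement such a check):

/*
 * Hypothetical guest-side probe in the VMware style: a non-zero event
 * select at power-on marks that PMC as unavailable.
 */
#include <asm/msr.h>
#include <linux/types.h>

static void probe_gp_pmcs(int nr_gp_counters, bool *usable)
{
    u64 val;
    int i;

    for (i = 0; i < nr_gp_counters; i++) {
        rdmsrl(MSR_P6_EVNTSEL0 + i, val);    /* IA32_PERFEVTSELx, 0x186 + i */
        usable[i] = (val == 0);
    }
}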
>
>> Thanks,
>> Kan
>>
On Tue, Feb 15, 2022 at 7:33 PM Like Xu <[email protected]> wrote:
> AFAIK, most cloud providers don't want to lose this flexibility, as it would
> leave hundreds of "profile KVM guests" use cases with nowhere to land.
I can only speak for Google Cloud. We'd like to be able to profile
host code with system-wide pinned counters on a periodic basis, but we
would like to do so without breaking PMU virtualization in the guest
(and that includes not only the guest counters that run in guest mode,
but also the guest counters that have to run in both guest mode and
host mode, such as reference cycles unhalted).
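In perf_event_attr terms, that last distinction is the
exclude_guest/exclude_host pair; a sketch of an event that must keep counting
across the VM-Entry boundary:

/*
 * Sketch: leaving both exclusion bits clear keeps the event counting
 * in both guest mode and host mode; host-only profiling would set
 * exclude_guest = 1 instead.
 */
struct perf_event_attr attr = {
    .type          = PERF_TYPE_HARDWARE,
    .size          = sizeof(attr),
    .config        = PERF_COUNT_HW_REF_CPU_CYCLES,
    .exclude_guest = 0,    /* count while the vCPU runs guest code */
    .exclude_host  = 0,    /* and while it runs host code */
};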
The one thing that is clear to me from these discussions is that the
perf subsystem behavior needs to be more configurable than it is
today.
One possibility would be to separate priority from usage. Instead of
having implicit priorities based on whether the event is system-wide
pinned, system-wide multiplexed, thread pinned, or thread multiplexed,
we could offer numeric priorities independent of the four usage modes.
That should offer enough flexibility to satisfy everyone.
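To make that concrete, imagine something like the following, which is purely
hypothetical (no such field exists in the uAPI today):

/*
 * Purely hypothetical uAPI sketch of the numeric-priority idea; the
 * real struct perf_event_attr has no such field.
 */
struct perf_event_attr_prio {
    struct perf_event_attr attr;
    __u32                  sched_prio;    /* higher value wins a counter */
};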
On Tue, Feb 15, 2022 at 11:36 PM Like Xu <[email protected]> wrote:
> Hardware resources will always be limited (the good news is that
> the number of counters will increase in the future), and when we have
> a common need for the same hardware, we should prioritize overall
> fairness over a GUEST-first strategy.
The bad news about the new counters on Intel is that they are less
capable than the existing counters. Since there is no bitmask for GP
counters in CPUID.0AH, if you give 4 counters to the guest, then the
host is stuck using the less capable counters.
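The limitation is visible directly in the leaf layout; a userspace sketch:

/*
 * Sketch: CPUID.0AH advertises GP counters only as a count in
 * EAX[15:8], not as a bitmask, so a hypervisor can expose only the
 * lowest N counters, never an arbitrary subset.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    __get_cpuid_count(0x0a, 0, &eax, &ebx, &ecx, &edx);
    printf("arch perfmon v%u, %u GP counters, %u bits wide\n",
           eax & 0xff, (eax >> 8) & 0xff, (eax >> 16) & 0xff);
    return 0;
}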
Overall fairness is definitely not what we want. See the other thread,
which has a proposal for a more configurable perf subsystem that
should meet your needs as well as ours.
Can we move all of this discussion to the other thread, BTW?