Changes:
v3 -> v4:
* Fix the newly found issue that pmu_counter_t.config crosses cache
lines (patch 04); an illustrative sketch of the idea is included
below the change log
* Fix the cycles event false positive by introducing a warm-up phase
instead of switching the cycles events order as was done in v3
(patch 07)
* Use macros to replace the hard-coded event indexes (Mingwei)
(patches 08~10)
* Simplify the asm code that enables/disables the GLOBAL_CTRL MSR in
the whole asm blob (patch 11)
* Handle legacy CPUs without clflush/clflushopt/IBPB support
(patches 14, 16)
* Optimize emulated instruction validation (patch 17)
All changes pass validation on Intel Sapphire Rapids and Emerald Rapids
platforms against the latest kvm-x86/next code (2d181d84af38). No tests
were run on AMD platforms since no AMD platform is on hand; any tests
on AMD platforms are appreciated.
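For reference, a minimal sketch of the general idea behind the
pmu_counter_t.config cache-line fix (patch 04) is shown below. This is
only an illustration assuming a 64-byte cache line, not the code from
the patch itself: aligning the structure to the cache-line size
guarantees that a naturally aligned 64-bit member such as 'config'
never straddles two cache lines.

/* Illustration only, not the code from patch 04. */
typedef struct {
	uint32_t ctr;
	uint32_t idx;
	uint64_t config;
	uint64_t count;
	/* remaining members elided */
} __attribute__((aligned(64))) pmu_counter_t;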
History:
v3: https://lore.kernel.org/lkml/[email protected]/
v2: https://lore.kernel.org/lkml/[email protected]/
v1: https://lore.kernel.org/lkml/[email protected]/
Dapeng Mi (16):
x86: pmu: Remove blank line and redundant space
x86: pmu: Refine fixed_events[] names
x86: pmu: Fix the issue that pmu_counter_t.config crosses cache line
x86: pmu: Enlarge cnt[] length to 48 in check_counters_many()
x86: pmu: Add asserts to warn inconsistent fixed events and counters
x86: pmu: Fix cycles event validation failure
x86: pmu: Use macro to replace hard-coded branches event index
x86: pmu: Use macro to replace hard-coded ref-cycles event index
x86: pmu: Use macro to replace hard-coded instructions event index
x86: pmu: Enable and disable PMCs in loop() asm blob
x86: pmu: Improve instruction and branches events verification
x86: pmu: Improve LLC misses event verification
x86: pmu: Adjust lower boundary of llc-misses event to 0 for legacy
CPUs
x86: pmu: Add IBPB indirect jump asm blob
x86: pmu: Adjust lower boundary of branch-misses event
x86: pmu: Optimize emulated instruction validation
Xiong Zhang (1):
x86: pmu: Remove duplicate code in pmu_init()
lib/x86/pmu.c | 5 -
x86/pmu.c | 386 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 308 insertions(+), 83 deletions(-)
base-commit: e96011b32944b1ecca5967674ae243067588d1e7
--
2.34.1
Code style changes only; no functional change intended.
Reviewed-by: Mingwei Zhang <[email protected]>
Signed-off-by: Dapeng Mi <[email protected]>
---
x86/pmu.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/x86/pmu.c b/x86/pmu.c
index 47a1a602ade2..4847a424f572 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -203,8 +203,7 @@ static noinline void __measure(pmu_counter_t *evt, uint64_t count)
static bool verify_event(uint64_t count, struct pmu_event *e)
{
// printf("%d <= %ld <= %d\n", e->min, count, e->max);
- return count >= e->min && count <= e->max;
-
+ return count >= e->min && count <= e->max;
}
static bool verify_counter(pmu_counter_t *cnt)
--
2.34.1
In the SDM the fixed counters are numbered from 0, but the
fixed_events[] names are currently numbered from 1, which can confuse
users. So change the fixed_events[] names to be numbered from 0 as
well, keeping them identical to the SDM.
Reviewed-by: Mingwei Zhang <[email protected]>
Signed-off-by: Dapeng Mi <[email protected]>
---
x86/pmu.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/x86/pmu.c b/x86/pmu.c
index 4847a424f572..c971386db4e6 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -44,9 +44,9 @@ struct pmu_event {
{"branches", 0x00c2, 1*N, 1.1*N},
{"branch misses", 0x00c3, 0, 0.1*N},
}, fixed_events[] = {
- {"fixed 1", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
- {"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
- {"fixed 3", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
+ {"fixed 0", MSR_CORE_PERF_FIXED_CTR0, 10*N, 10.2*N},
+ {"fixed 1", MSR_CORE_PERF_FIXED_CTR0 + 1, 1*N, 30*N},
+ {"fixed 2", MSR_CORE_PERF_FIXED_CTR0 + 2, 0.1*N, 30*N}
};
char *buf;
--
2.34.1
The current PMU code doesn't check whether the number of PMU fixed
counters is larger than the number of pre-defined fixed events. If it
is, memory accesses would go out of bounds.
So add asserts to warn about this invalid case.
Reviewed-by: Mingwei Zhang <[email protected]>
Signed-off-by: Dapeng Mi <[email protected]>
---
x86/pmu.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/x86/pmu.c b/x86/pmu.c
index 494af4012e84..461a4090d475 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -111,8 +111,12 @@ static struct pmu_event* get_counter_event(pmu_counter_t *cnt)
for (i = 0; i < gp_events_size; i++)
if (gp_events[i].unit_sel == (cnt->config & 0xffff))
return &gp_events[i];
- } else
- return &fixed_events[cnt->ctr - MSR_CORE_PERF_FIXED_CTR0];
+ } else {
+ unsigned int idx = cnt->ctr - MSR_CORE_PERF_FIXED_CTR0;
+
+ assert(idx < ARRAY_SIZE(fixed_events));
+ return &fixed_events[idx];
+ }
return (void*)0;
}
@@ -738,6 +742,8 @@ int main(int ac, char **av)
printf("Fixed counters: %d\n", pmu.nr_fixed_counters);
printf("Fixed counter width: %d\n", pmu.fixed_counter_width);
+ assert(pmu.nr_fixed_counters <= ARRAY_SIZE(fixed_events));
+
apic_write(APIC_LVTPC, PMI_VECTOR);
check_counters();
--
2.34.1
Currently, enabling the PMCs, executing loop() and disabling the PMCs
are split across three separate functions, so other instructions can
be executed between enabling the PMCs and running loop(), or between
running loop() and disabling the PMCs. For example, if multiple
counters are enabled in measure_many(), the instructions that enable
the 2nd and subsequent counters are counted by the 1st counter.
So the current implementation can only verify the correctness of the
count against a rough range rather than a precise count, even for the
instructions and branches events. Strictly speaking, this verification
is meaningless, as the test could still pass even though the KVM vPMU
is broken and reports an incorrect instructions or branches count that
happens to fall within the rough range.
Thus, move the PMC enabling and disabling into the loop() asm blob and
ensure that only the loop asm instructions are counted. The
instructions and branches events can then be verified against a
precise count instead of a rough range.
Signed-off-by: Dapeng Mi <[email protected]>
---
x86/pmu.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 65 insertions(+), 15 deletions(-)
diff --git a/x86/pmu.c b/x86/pmu.c
index 20bc6de9c936..d97309d7b8a3 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -18,6 +18,15 @@
#define EXPECTED_INSTR 17
#define EXPECTED_BRNCH 5
+#define LOOP_ASM(_wrmsr) \
+ _wrmsr "\n\t" \
+ "mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t" \
+ "1: mov (%1), %2; add $64, %1;\n\t" \
+ "nop; nop; nop; nop; nop; nop; nop;\n\t" \
+ "loop 1b;\n\t" \
+ "mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t" \
+ _wrmsr "\n\t"
+
typedef struct {
uint32_t ctr;
uint32_t idx;
@@ -73,13 +82,43 @@ char *buf;
static struct pmu_event *gp_events;
static unsigned int gp_events_size;
-static inline void loop(void)
+
+static inline void __loop(void)
+{
+ unsigned long tmp, tmp2, tmp3;
+
+ asm volatile(LOOP_ASM("nop")
+ : "=c"(tmp), "=r"(tmp2), "=r"(tmp3)
+ : "0"(N), "1"(buf));
+}
+
+/*
+ * Enable and disable the counters in a single asm blob to ensure
+ * that no other instructions are counted in the window between
+ * enabling the counters and actually executing the LOOP_ASM code.
+ * Thus the counters can verify the instructions and branches events
+ * against precise counts instead of a rough valid count range.
+ */
+static inline void __precise_loop(u64 cntrs)
{
unsigned long tmp, tmp2, tmp3;
+ unsigned int global_ctl = pmu.msr_global_ctl;
+ u32 eax = cntrs & (BIT_ULL(32) - 1);
+ u32 edx = cntrs >> 32;
- asm volatile("1: mov (%1), %2; add $64, %1; nop; nop; nop; nop; nop; nop; nop; loop 1b"
- : "=c"(tmp), "=r"(tmp2), "=r"(tmp3): "0"(N), "1"(buf));
+ asm volatile(LOOP_ASM("wrmsr")
+ : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)
+ : "a"(eax), "d"(edx), "c"(global_ctl),
+ "0"(N), "1"(buf)
+ : "edi");
+}
+static inline void loop(u64 cntrs)
+{
+ if (!this_cpu_has_perf_global_ctrl())
+ __loop();
+ else
+ __precise_loop(cntrs);
}
volatile uint64_t irq_received;
@@ -178,18 +217,17 @@ static void __start_event(pmu_counter_t *evt, uint64_t count)
ctrl = (ctrl & ~(0xf << shift)) | (usrospmi << shift);
wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, ctrl);
}
- global_enable(evt);
apic_write(APIC_LVTPC, PMI_VECTOR);
}
static void start_event(pmu_counter_t *evt)
{
__start_event(evt, 0);
+ global_enable(evt);
}
-static void stop_event(pmu_counter_t *evt)
+static void __stop_event(pmu_counter_t *evt)
{
- global_disable(evt);
if (is_gp(evt)) {
wrmsr(MSR_GP_EVENT_SELECTx(event_to_global_idx(evt)),
evt->config & ~EVNTSEL_EN);
@@ -201,14 +239,24 @@ static void stop_event(pmu_counter_t *evt)
evt->count = rdmsr(evt->ctr);
}
+static void stop_event(pmu_counter_t *evt)
+{
+ global_disable(evt);
+ __stop_event(evt);
+}
+
static noinline void measure_many(pmu_counter_t *evt, int count)
{
int i;
+ u64 cntrs = 0;
+
+ for (i = 0; i < count; i++) {
+ __start_event(&evt[i], 0);
+ cntrs |= BIT_ULL(event_to_global_idx(&evt[i]));
+ }
+ loop(cntrs);
for (i = 0; i < count; i++)
- start_event(&evt[i]);
- loop();
- for (i = 0; i < count; i++)
- stop_event(&evt[i]);
+ __stop_event(&evt[i]);
}
static void measure_one(pmu_counter_t *evt)
@@ -218,9 +266,11 @@ static void measure_one(pmu_counter_t *evt)
static noinline void __measure(pmu_counter_t *evt, uint64_t count)
{
+ u64 cntrs = BIT_ULL(event_to_global_idx(evt));
+
__start_event(evt, count);
- loop();
- stop_event(evt);
+ loop(cntrs);
+ __stop_event(evt);
}
static bool verify_event(uint64_t count, struct pmu_event *e)
@@ -483,7 +533,7 @@ static void check_running_counter_wrmsr(void)
report_prefix_push("running counter wrmsr");
start_event(&evt);
- loop();
+ __loop();
wrmsr(MSR_GP_COUNTERx(0), 0);
stop_event(&evt);
report(evt.count < gp_events[instruction_idx].min, "cntr");
@@ -500,7 +550,7 @@ static void check_running_counter_wrmsr(void)
wrmsr(MSR_GP_COUNTERx(0), count);
- loop();
+ __loop();
stop_event(&evt);
if (this_cpu_has_perf_global_status()) {
@@ -641,7 +691,7 @@ static void warm_up(void)
* the real verification.
*/
while (i--)
- loop();
+ loop(0);
}
static void check_counters(void)
--
2.34.1