2024-01-26 12:06:24

by Zhang, Xiong Y

Subject: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

From: Dapeng Mi <[email protected]>

Implement the save/restore of PMU state for the passthrough PMU on Intel. In
passthrough mode, KVM exclusively owns the PMU HW while control flow is in
the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
and gain full HW PMU ownership. Conversely, the host regains ownership of
the PMU HW from KVM when control flow leaves the scope of the passthrough
PMU.

Implement PMU context switches for Intel CPUs and opportunistically use
rdpmcl() instead of rdmsrl() when reading counters, since the former has
lower latency on Intel CPUs.

Co-developed-by: Mingwei Zhang <[email protected]>
Signed-off-by: Mingwei Zhang <[email protected]>
Signed-off-by: Dapeng Mi <[email protected]>
---
arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
1 file changed, 73 insertions(+)

diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 0d58fe7d243e..f79bebe7093d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)

static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
{
+        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+        struct kvm_pmc *pmc;
+        u32 i;
+
+        if (pmu->version != 2) {
+                pr_warn("only PerfMon v2 is supported for passthrough PMU\n");
+                return;
+        }
+
+        /* Global ctrl register is already saved at VM-exit. */
+        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
+        /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
+        if (pmu->global_status)
+                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
+
+        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+                pmc = &pmu->gp_counters[i];
+                rdpmcl(i, pmc->counter);
+                rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
+                /*
+                 * Clear the hardware PERFMON_EVENTSELx MSR and its counter to
+                 * avoid leakage and to keep this guest GP counter from being
+                 * accidentally enabled when the host later enables global ctrl.
+                 */
+                if (pmc->eventsel)
+                        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
+                if (pmc->counter)
+                        wrmsrl(MSR_IA32_PMC0 + i, 0);
+        }
+
+        rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+        /*
+         * Clear the hardware FIXED_CTR_CTRL MSR to avoid information leakage
+         * and to keep these guest fixed counters from being accidentally
+         * enabled when the host later enables global ctrl.
+         */
+        if (pmu->fixed_ctr_ctrl)
+                wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+        for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+                pmc = &pmu->fixed_counters[i];
+                rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
+                if (pmc->counter)
+                        wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
+        }
}

static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
{
+        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+        struct kvm_pmc *pmc;
+        u64 global_status;
+        int i;
+
+        if (pmu->version != 2) {
+                pr_warn("only PerfMon v2 is supported for passthrough PMU\n");
+                return;
+        }
+
+        /* Clear host global_ctrl and global_status MSR if non-zero. */
+        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+        if (global_status)
+                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
+
+        wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
+
+        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+                pmc = &pmu->gp_counters[i];
+                wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
+                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
+        }
+
+        wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
+        for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+                pmc = &pmu->fixed_counters[i];
+                wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
+        }
}

struct kvm_pmu_ops intel_pmu_ops __initdata = {
--
2.34.1



2024-04-11 21:27:00

by Sean Christopherson

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Fri, Jan 26, 2024, Xiong Zhang wrote:
> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
> {
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_pmc *pmc;
> + u32 i;
> +
> + if (pmu->version != 2) {
> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> + return;
> + }
> +
> + /* Global ctrl register is already saved at VM-exit. */
> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> + if (pmu->global_status)
> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> +
> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> + pmc = &pmu->gp_counters[i];
> + rdpmcl(i, pmc->counter);
> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> + /*
> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> + * leakage and also avoid this guest GP counter get accidentally
> + * enabled during host running when host enable global ctrl.
> + */
> + if (pmc->eventsel)
> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> + if (pmc->counter)
> + wrmsrl(MSR_IA32_PMC0 + i, 0);
> + }
> +
> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> + /*
> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> + * also avoid these guest fixed counters get accidentially enabled
> + * during host running when host enable global ctrl.
> + */
> + if (pmu->fixed_ctr_ctrl)
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> + pmc = &pmu->fixed_counters[i];
> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> + if (pmc->counter)
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> + }

For the next RFC, please make sure that it includes AMD support. Mostly because
I'm pretty sure all of this code can be in common x86. The fixed counters are
ugly, but pmu->nr_arch_fixed_counters is guaranteed to be '0' on AMD, so it's
_just_ ugly, i.e. not functionally problematic.
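
[For illustration, a sketch of the common-x86 version this suggests; the
helper name is hypothetical, not posted code, and the loops simply mirror
the per-counter reads from the patch above:

        static void kvm_pmu_save_guest_counters(struct kvm_pmu *pmu)
        {
                int i;

                for (i = 0; i < pmu->nr_arch_gp_counters; i++)
                        rdpmcl(i, pmu->gp_counters[i].counter);

                /* nr_arch_fixed_counters is 0 on AMD, so this loop is a nop there. */
                for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
                        rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
                               pmu->fixed_counters[i].counter);
        }
]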

2024-04-11 21:44:44

by Sean Christopherson

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Fri, Jan 26, 2024, Xiong Zhang wrote:
> From: Dapeng Mi <[email protected]>
>
> Implement the save/restore of PMU state for the passthrough PMU on Intel. In
> passthrough mode, KVM exclusively owns the PMU HW while control flow is in
> the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
> and gain full HW PMU ownership. Conversely, the host regains ownership of
> the PMU HW from KVM when control flow leaves the scope of the passthrough
> PMU.
>
> Implement PMU context switches for Intel CPUs and opportunistically use
> rdpmcl() instead of rdmsrl() when reading counters, since the former has
> lower latency on Intel CPUs.
>
> Co-developed-by: Mingwei Zhang <[email protected]>
> Signed-off-by: Mingwei Zhang <[email protected]>
> Signed-off-by: Dapeng Mi <[email protected]>
> ---
> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 73 insertions(+)
>
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 0d58fe7d243e..f79bebe7093d 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>
> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)

I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().

> {
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_pmc *pmc;
> + u32 i;
> +
> + if (pmu->version != 2) {
> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> + return;
> + }
> +
> + /* Global ctrl register is already saved at VM-exit. */
> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> + if (pmu->global_status)
> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> +
> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> + pmc = &pmu->gp_counters[i];
> + rdpmcl(i, pmc->counter);
> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> + /*
> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> + * leakage and also avoid this guest GP counter get accidentally
> + * enabled during host running when host enable global ctrl.
> + */
> + if (pmc->eventsel)
> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> + if (pmc->counter)
> + wrmsrl(MSR_IA32_PMC0 + i, 0);

This doesn't make much sense. The kernel already has full access to the guest,
I don't see what is gained by zeroing out the MSRs just to hide them from perf.

Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
the event selector, we gots problems.

Same thing for the fixed counters below. Can't this just be?

        for (i = 0; i < pmu->nr_arch_gp_counters; i++)
                rdpmcl(i, pmu->gp_counters[i].counter);

        for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
                rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
                       pmu->fixed_counters[i].counter);

> + }
> +
> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> + /*
> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> + * also avoid these guest fixed counters get accidentially enabled
> + * during host running when host enable global ctrl.
> + */
> + if (pmu->fixed_ctr_ctrl)
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> + pmc = &pmu->fixed_counters[i];
> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> + if (pmc->counter)
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> + }
> }
>
> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
> {
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_pmc *pmc;
> + u64 global_status;
> + int i;
> +
> + if (pmu->version != 2) {
> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> + return;
> + }
> +
> + /* Clear host global_ctrl and global_status MSR if non-zero. */
> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?

> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> + if (global_status)
> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);

This seems especially silly, isn't the full MSR being written below? Or am I
misunderstanding how these things work?

> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> +
> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> + pmc = &pmu->gp_counters[i];
> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
> + }
> +
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> + pmc = &pmu->fixed_counters[i];
> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
> + }
> }
>
> struct kvm_pmu_ops intel_pmu_ops __initdata = {
> --
> 2.34.1
>

2024-04-11 22:20:10

by Jim Mattson

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <[email protected]> wrote:
>
> On Fri, Jan 26, 2024, Xiong Zhang wrote:
> > From: Dapeng Mi <[email protected]>
> >
> > Implement the save/restore of PMU state for the passthrough PMU on Intel. In
> > passthrough mode, KVM exclusively owns the PMU HW while control flow is in
> > the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
> > and gain full HW PMU ownership. Conversely, the host regains ownership of
> > the PMU HW from KVM when control flow leaves the scope of the passthrough
> > PMU.
> >
> > Implement PMU context switches for Intel CPUs and opportunistically use
> > rdpmcl() instead of rdmsrl() when reading counters, since the former has
> > lower latency on Intel CPUs.
> >
> > Co-developed-by: Mingwei Zhang <[email protected]>
> > Signed-off-by: Mingwei Zhang <[email protected]>
> > Signed-off-by: Dapeng Mi <[email protected]>
> > ---
> > arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
> > 1 file changed, 73 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> > index 0d58fe7d243e..f79bebe7093d 100644
> > --- a/arch/x86/kvm/vmx/pmu_intel.c
> > +++ b/arch/x86/kvm/vmx/pmu_intel.c
> > @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
> >
> > static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
>
> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
>
> > {
> > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > + struct kvm_pmc *pmc;
> > + u32 i;
> > +
> > + if (pmu->version != 2) {
> > + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> > + return;
> > + }
> > +
> > + /* Global ctrl register is already saved at VM-exit. */
> > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> > + if (pmu->global_status)
> > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> > +
> > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> > + pmc = &pmu->gp_counters[i];
> > + rdpmcl(i, pmc->counter);
> > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> > + /*
> > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> > + * leakage and also avoid this guest GP counter get accidentally
> > + * enabled during host running when host enable global ctrl.
> > + */
> > + if (pmc->eventsel)
> > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> > + if (pmc->counter)
> > + wrmsrl(MSR_IA32_PMC0 + i, 0);
>
> This doesn't make much sense. The kernel already has full access to the guest,
> I don't see what is gained by zeroing out the MSRs just to hide them from perf.
>
> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
> the event selector, we gots problems.
>
> Same thing for the fixed counters below. Can't this just be?
>
> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
> rdpmcl(i, pmu->gp_counters[i].counter);
>
> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
> pmu->fixed_counters[i].counter);
>
> > + }
> > +
> > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> > + /*
> > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> > + * also avoid these guest fixed counters get accidentially enabled
> > + * during host running when host enable global ctrl.
> > + */
> > + if (pmu->fixed_ctr_ctrl)
> > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> > + pmc = &pmu->fixed_counters[i];
> > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> > + if (pmc->counter)
> > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> > + }
> > }
> >
> > static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
> > {
> > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > + struct kvm_pmc *pmc;
> > + u64 global_status;
> > + int i;
> > +
> > + if (pmu->version != 2) {
> > + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> > + return;
> > + }
> > +
> > + /* Clear host global_ctrl and global_status MSR if non-zero. */
> > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>
> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
>
> > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> > + if (global_status)
> > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
>
> This seems especially silly, isn't the full MSR being written below? Or am I
> misunderstanding how these things work?

LOL! You expect CPU design to follow basic logic?!?

Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the
corresponding bit in IA32_PERF_GLOBAL_STATUS to 1.

Writing a 0 to a bit in IA32_PERF_GLOBAL_STATUS_SET is a nop.

To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to
the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka
IA32_PERF_GLOBAL_OVF_CTRL).
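
[For illustration, a minimal sketch of the asymmetry described above, using the
kernel's rdmsrl()/wrmsrl() helpers; illustrative only, not posted code:

        u64 hw_status;

        /* Read the live overflow/status bits. */
        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, hw_status);
        /* Writing 1s to OVF_CTRL (aka STATUS_RESET) clears those STATUS bits. */
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, hw_status);
        /* Writing 1s to STATUS_SET sets the corresponding bits; 0 bits are nops. */
        wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
]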

> > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> > +
> > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> > + pmc = &pmu->gp_counters[i];
> > + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
> > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
> > + }
> > +
> > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> > + pmc = &pmu->fixed_counters[i];
> > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
> > + }
> > }
> >
> > struct kvm_pmu_ops intel_pmu_ops __initdata = {
> > --
> > 2.34.1
> >

2024-04-11 23:31:50

by Sean Christopherson

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 11, 2024, Jim Mattson wrote:
> On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <[email protected]> wrote:
> > > + /* Clear host global_ctrl and global_status MSR if non-zero. */
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> >
> > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
> >
> > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> > > + if (global_status)
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
> >
> > This seems especially silly, isn't the full MSR being written below? Or am I
> > misunderstanding how these things work?
>
> LOL! You expect CPU design to follow basic logic?!?
>
> Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the
> corresponding bit in IA32_PERF_GLOBAL_STATUS to 1.
>
> Writing a 0 to a bit in IA32_PERF_GLOBAL_STATUS_SET is a nop.
>
> To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to
> the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka
> IA32_PERF_GLOBAL_OVF_CTRL).

If only C had a way to annotate what the code is doing. :-)

> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);

IIUC, that means this should be:

        if (pmu->global_status)
                wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);

or even better:

        toggle = pmu->global_status ^ global_status;
        if (global_status & toggle)
                wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
        if (pmu->global_status & toggle)
                wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
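
[A hypothetical walk-through of the sketch above: if the hardware currently has
global_status = 0x1 and the saved guest value pmu->global_status = 0x2, then
toggle = 0x3; the OVF_CTRL write clears bit 0 (0x1 & 0x3) and the STATUS_SET
write sets bit 1 (0x2 & 0x3). Bits that already match are masked out, and when
the two values are equal both wrmsrl() calls are skipped entirely.]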

2024-04-13 03:34:59

by Mingwei Zhang

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Sat, Apr 13, 2024, Mi, Dapeng wrote:
>
> On 4/12/2024 5:44 AM, Sean Christopherson wrote:
> > On Fri, Jan 26, 2024, Xiong Zhang wrote:
> > > From: Dapeng Mi <[email protected]>
> > >
> > > Implement the save/restore of PMU state for the passthrough PMU on Intel. In
> > > passthrough mode, KVM exclusively owns the PMU HW while control flow is in
> > > the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
> > > and gain full HW PMU ownership. Conversely, the host regains ownership of
> > > the PMU HW from KVM when control flow leaves the scope of the passthrough
> > > PMU.
> > >
> > > Implement PMU context switches for Intel CPUs and opportunistically use
> > > rdpmcl() instead of rdmsrl() when reading counters, since the former has
> > > lower latency on Intel CPUs.
> > >
> > > Co-developed-by: Mingwei Zhang <[email protected]>
> > > Signed-off-by: Mingwei Zhang <[email protected]>
> > > Signed-off-by: Dapeng Mi <[email protected]>
> > > ---
> > > arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 73 insertions(+)
> > >
> > > diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> > > index 0d58fe7d243e..f79bebe7093d 100644
> > > --- a/arch/x86/kvm/vmx/pmu_intel.c
> > > +++ b/arch/x86/kvm/vmx/pmu_intel.c
> > > @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
> > > static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
> > I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
> Yeah. It looks clearer.
> >
> > > {
> > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > > + struct kvm_pmc *pmc;
> > > + u32 i;
> > > +
> > > + if (pmu->version != 2) {
> > > + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> > > + return;
> > > + }
> > > +
> > > + /* Global ctrl register is already saved at VM-exit. */
> > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> > > + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> > > + if (pmu->global_status)
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> > > +
> > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> > > + pmc = &pmu->gp_counters[i];
> > > + rdpmcl(i, pmc->counter);
> > > + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> > > + /*
> > > + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> > > + * leakage and also avoid this guest GP counter get accidentally
> > > + * enabled during host running when host enable global ctrl.
> > > + */
> > > + if (pmc->eventsel)
> > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> > > + if (pmc->counter)
> > > + wrmsrl(MSR_IA32_PMC0 + i, 0);
> > This doesn't make much sense. The kernel already has full access to the guest,
> > I don't see what is gained by zeroing out the MSRs just to hide them from perf.
>
> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> Consider this case: the guest uses GP counter 2, but the host doesn't use it.
> If the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
> unexpectedly on the host later, since host perf always enables all valid bits
> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
>
> Yeah, the clearing of the PMCx MSRs should be unnecessary.
>

Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
values to the host? NO. Not in cloud usage.

Please make changes to this patch with **extreme** caution.

According to our past experience, if there is a bug somewhere,
there is a catch here (normally).

Thanks.
-Mingwei
>
> >
> > Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
> > the event selector, we gots problems.
> >
> > Same thing for the fixed counters below. Can't this just be?
> >
> > for (i = 0; i < pmu->nr_arch_gp_counters; i++)
> > rdpmcl(i, pmu->gp_counters[i].counter);
> >
> > for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> > rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
> > pmu->fixed_counters[i].counter);
> >
> > > + }
> > > +
> > > + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> > > + /*
> > > + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> > > + * also avoid these guest fixed counters get accidentially enabled
> > > + * during host running when host enable global ctrl.
> > > + */
> > > + if (pmu->fixed_ctr_ctrl)
> > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> > > + pmc = &pmu->fixed_counters[i];
> > > + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> > > + if (pmc->counter)
> > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> > > + }
> > > }
> > > static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
> > > {
> > > + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> > > + struct kvm_pmc *pmc;
> > > + u64 global_status;
> > > + int i;
> > > +
> > > + if (pmu->version != 2) {
> > > + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> > > + return;
> > > + }
> > > +
> > > + /* Clear host global_ctrl and global_status MSR if non-zero. */
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> > Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
>
> As previous comments say, host perf always enables all counters in
> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to
> ensure all counters are in a disabled state, so that the later counter
> manipulation (writing MSRs) won't cause any race condition or unexpected
> behavior on HW.
>
>
> >
> > > + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> > > + if (global_status)
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
> > This seems especially silly, isn't the full MSR being written below? Or am I
> > misunderstanding how these things work?
>
> I think Jim's comment has already explained why we need to do this.
>
> >
> > > + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> > > +
> > > + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> > > + pmc = &pmu->gp_counters[i];
> > > + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
> > > + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
> > > + }
> > > +
> > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> > > + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> > > + pmc = &pmu->fixed_counters[i];
> > > + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
> > > + }
> > > }
> > > struct kvm_pmu_ops intel_pmu_ops __initdata = {
> > > --
> > > 2.34.1
> > >

2024-04-13 04:36:22

by Dapeng Mi

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/12/2024 7:31 AM, Sean Christopherson wrote:
> On Thu, Apr 11, 2024, Jim Mattson wrote:
>> On Thu, Apr 11, 2024 at 2:44 PM Sean Christopherson <[email protected]> wrote:
>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
>>>
>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
>>>> + if (global_status)
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
>>> This seems especially silly, isn't the full MSR being written below? Or am I
>>> misunderstanding how these things work?
>> LOL! You expect CPU design to follow basic logic?!?
>>
>> Writing a 1 to a bit in IA32_PERF_GLOBAL_STATUS_SET sets the
>> corresponding bit in IA32_PERF_GLOBAL_STATUS to 1.
>>
>> Writing a 0 to a bit in IA32_PERF_GLOBAL_STATUS_SET is a nop.
>>
>> To clear a bit in IA32_PERF_GLOBAL_STATUS, you need to write a 1 to
>> the corresponding bit in IA32_PERF_GLOBAL_STATUS_RESET (aka
>> IA32_PERF_GLOBAL_OVF_CTRL).
> If only C had a way to annotate what the code is doing. :-)
>
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> IIUC, that means this should be:
>
> if (pmu->global_status)
> wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
>
> or even better:
>
> toggle = pmu->global_status ^ global_status;
> if (global_status & toggle)
> wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
> if (pmu->global_status & toggle)
> wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
Thanks, it looks better. Since PMU v4, the MSR
CORE_PERF_GLOBAL_OVF_CTRL has been renamed to CORE_PERF_GLOBAL_STATUS_RESET
and supports more bits. CORE_PERF_GLOBAL_STATUS_RESET is easier to
understand just from the name than CORE_PERF_GLOBAL_OVF_CTRL, so I would
prefer to use this name in the next version.

2024-04-13 06:51:44

by Dapeng Mi

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/13/2024 11:34 AM, Mingwei Zhang wrote:
> On Sat, Apr 13, 2024, Mi, Dapeng wrote:
>> On 4/12/2024 5:44 AM, Sean Christopherson wrote:
>>> On Fri, Jan 26, 2024, Xiong Zhang wrote:
>>>> From: Dapeng Mi <[email protected]>
>>>>
>>>> Implement the save/restore of PMU state for the passthrough PMU on Intel. In
>>>> passthrough mode, KVM exclusively owns the PMU HW while control flow is in
>>>> the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
>>>> and gain full HW PMU ownership. Conversely, the host regains ownership of
>>>> the PMU HW from KVM when control flow leaves the scope of the passthrough
>>>> PMU.
>>>>
>>>> Implement PMU context switches for Intel CPUs and opportunistically use
>>>> rdpmcl() instead of rdmsrl() when reading counters, since the former has
>>>> lower latency on Intel CPUs.
>>>>
>>>> Co-developed-by: Mingwei Zhang <[email protected]>
>>>> Signed-off-by: Mingwei Zhang <[email protected]>
>>>> Signed-off-by: Dapeng Mi <[email protected]>
>>>> ---
>>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 73 insertions(+)
>>>>
>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>>> index 0d58fe7d243e..f79bebe7093d 100644
>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
>>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
>> Yeah. It looks clearer.
>>>> {
>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>> + struct kvm_pmc *pmc;
>>>> + u32 i;
>>>> +
>>>> + if (pmu->version != 2) {
>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>>>> + return;
>>>> + }
>>>> +
>>>> + /* Global ctrl register is already saved at VM-exit. */
>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
>>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
>>>> + if (pmu->global_status)
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
>>>> +
>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>>>> + pmc = &pmu->gp_counters[i];
>>>> + rdpmcl(i, pmc->counter);
>>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
>>>> + /*
>>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
>>>> + * leakage and also avoid this guest GP counter get accidentally
>>>> + * enabled during host running when host enable global ctrl.
>>>> + */
>>>> + if (pmc->eventsel)
>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
>>>> + if (pmc->counter)
>>>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
>>> This doesn't make much sense. The kernel already has full access to the guest,
>>> I don't see what is gained by zeroing out the MSRs just to hide them from perf.
>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
>> Consider this case: the guest uses GP counter 2, but the host doesn't use it.
>> If the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
>> unexpectedly on the host later, since host perf always enables all valid bits
>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
>>
>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
>>
> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
> values to the host? NO. Not in cloud usage.

No, this place is clearing the guest counter value, not the host
counter value. The host always has a way to see the guest's values in a
normal VM if it wants to. I don't see the necessity; it's just overkill
and introduces extra overhead from writing MSRs.


>
> Please make changes to this patch with **extreme** caution.
>
> According to our past experience, if there is a bug somewhere,
> there is a catch here (normally).
>
> Thanks.
> -Mingwei
>>> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
>>> the event selector, we gots problems.
>>>
>>> Same thing for the fixed counters below. Can't this just be?
>>>
>>> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
>>> rdpmcl(i, pmu->gp_counters[i].counter);
>>>
>>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
>>> pmu->fixed_counters[i].counter);
>>>
>>>> + }
>>>> +
>>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>>>> + /*
>>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
>>>> + * also avoid these guest fixed counters get accidentially enabled
>>>> + * during host running when host enable global ctrl.
>>>> + */
>>>> + if (pmu->fixed_ctr_ctrl)
>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>> + pmc = &pmu->fixed_counters[i];
>>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
>>>> + if (pmc->counter)
>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
>>>> + }
>>>> }
>>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
>>>> {
>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>> + struct kvm_pmc *pmc;
>>>> + u64 global_status;
>>>> + int i;
>>>> +
>>>> + if (pmu->version != 2) {
>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>>>> + return;
>>>> + }
>>>> +
>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
>> As previous comments say, host perf always enables all counters in
>> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to
>> ensure all counters are in a disabled state, so that the later counter
>> manipulation (writing MSRs) won't cause any race condition or unexpected
>> behavior on HW.
>>
>>
>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
>>>> + if (global_status)
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
>>> This seems especially silly, isn't the full MSR being written below? Or am I
>>> misunderstanding how these things work?
>> I think Jim's comment has already explained why we need to do this.
>>
>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
>>>> +
>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>>>> + pmc = &pmu->gp_counters[i];
>>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
>>>> + }
>>>> +
>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>> + pmc = &pmu->fixed_counters[i];
>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
>>>> + }
>>>> }
>>>> struct kvm_pmu_ops intel_pmu_ops __initdata = {
>>>> --
>>>> 2.34.1
>>>>

2024-04-13 07:16:54

by Dapeng Mi

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/12/2024 5:26 AM, Sean Christopherson wrote:
> On Fri, Jan 26, 2024, Xiong Zhang wrote:
>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
>> {
>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> + struct kvm_pmc *pmc;
>> + u32 i;
>> +
>> + if (pmu->version != 2) {
>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>> + return;
>> + }
>> +
>> + /* Global ctrl register is already saved at VM-exit. */
>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
>> + if (pmu->global_status)
>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
>> +
>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>> + pmc = &pmu->gp_counters[i];
>> + rdpmcl(i, pmc->counter);
>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
>> + /*
>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
>> + * leakage and also avoid this guest GP counter get accidentally
>> + * enabled during host running when host enable global ctrl.
>> + */
>> + if (pmc->eventsel)
>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
>> + if (pmc->counter)
>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
>> + }
>> +
>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>> + /*
>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
>> + * also avoid these guest fixed counters get accidentially enabled
>> + * during host running when host enable global ctrl.
>> + */
>> + if (pmu->fixed_ctr_ctrl)
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>> + pmc = &pmu->fixed_counters[i];
>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
>> + if (pmc->counter)
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
>> + }
> For the next RFC, please make sure that it includes AMD support. Mostly because
> I'm pretty sure all of this code can be in common x86. The fixed counters are
> ugly, but pmu->nr_arch_fixed_counters is guaranteed to be '0' on AMD, so it's
> _just_ ugly, i.e. not functionally problematic.

Sure. I believe Mingwei will integrate the AMD support patches in the
next version. Yeah, I agree part of this code can be put into common
x86/pmu, but there is still some vendor-specific code, so we would still
keep a vendor-specific callback.



2024-04-13 09:01:52

by Dapeng Mi

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/12/2024 5:44 AM, Sean Christopherson wrote:
> On Fri, Jan 26, 2024, Xiong Zhang wrote:
>> From: Dapeng Mi <[email protected]>
>>
>> Implement the save/restore of PMU state for the passthrough PMU on Intel. In
>> passthrough mode, KVM exclusively owns the PMU HW while control flow is in
>> the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
>> and gain full HW PMU ownership. Conversely, the host regains ownership of
>> the PMU HW from KVM when control flow leaves the scope of the passthrough
>> PMU.
>>
>> Implement PMU context switches for Intel CPUs and opportunistically use
>> rdpmcl() instead of rdmsrl() when reading counters, since the former has
>> lower latency on Intel CPUs.
>>
>> Co-developed-by: Mingwei Zhang <[email protected]>
>> Signed-off-by: Mingwei Zhang <[email protected]>
>> Signed-off-by: Dapeng Mi <[email protected]>
>> ---
>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
>> 1 file changed, 73 insertions(+)
>>
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index 0d58fe7d243e..f79bebe7093d 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>>
>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
Yeah. It looks clearer.
>
>> {
>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> + struct kvm_pmc *pmc;
>> + u32 i;
>> +
>> + if (pmu->version != 2) {
>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>> + return;
>> + }
>> +
>> + /* Global ctrl register is already saved at VM-exit. */
>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
>> + if (pmu->global_status)
>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
>> +
>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>> + pmc = &pmu->gp_counters[i];
>> + rdpmcl(i, pmc->counter);
>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
>> + /*
>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
>> + * leakage and also avoid this guest GP counter get accidentally
>> + * enabled during host running when host enable global ctrl.
>> + */
>> + if (pmc->eventsel)
>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
>> + if (pmc->counter)
>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
> This doesn't make much sense. The kernel already has full access to the guest,
> I don't see what is gained by zeroing out the MSRs just to hide them from perf.

It's necessary to clear the EVENTSELx MSRs for both GP and fixed
counters. Consider this case: the guest uses GP counter 2, but the host
doesn't use it. If the EVENTSEL2 MSR is not cleared here, GP counter 2
would be enabled unexpectedly on the host later, since host perf always
enables all valid bits in the PERF_GLOBAL_CTRL MSR. That would cause
issues.

Yeah, the clearing of the PMCx MSRs should be unnecessary.
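
[For concreteness, a sketch of the GP save loop under that reasoning, clearing
only the event selectors and leaving the counter values alone; illustrative,
not the posted patch:

        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
                pmc = &pmu->gp_counters[i];
                rdpmcl(i, pmc->counter);
                rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
                /*
                 * Zero the selector so that a later host write of "all valid
                 * bits" to PERF_GLOBAL_CTRL cannot start counter i with a
                 * stale guest config; the PMCx value itself is left as is.
                 */
                if (pmc->eventsel)
                        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
        }
]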


>
> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
> the event selector, we gots problems.
>
> Same thing for the fixed counters below. Can't this just be?
>
> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
> rdpmcl(i, pmu->gp_counters[i].counter);
>
> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
> pmu->fixed_counters[i].counter);
>
>> + }
>> +
>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>> + /*
>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
>> + * also avoid these guest fixed counters get accidentially enabled
>> + * during host running when host enable global ctrl.
>> + */
>> + if (pmu->fixed_ctr_ctrl)
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>> + pmc = &pmu->fixed_counters[i];
>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
>> + if (pmc->counter)
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
>> + }
>> }
>>
>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
>> {
>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> + struct kvm_pmc *pmc;
>> + u64 global_status;
>> + int i;
>> +
>> + if (pmu->version != 2) {
>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>> + return;
>> + }
>> +
>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?

As previous comments say, host perf always enables all counters in
PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here
is to ensure all counters are in a disabled state, so that the later
counter manipulation (writing MSRs) won't cause any race condition or
unexpected behavior on HW.
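
[In other words, the intended ordering is roughly the following; a sketch,
noting that the guest's own PERF_GLOBAL_CTRL is restored by the VM-Enter MSR
auto-load rather than here:

        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);   /* stop all counters first */
        /* ... now safe to rewrite PMCx, EVENTSELx and FIXED_CTR_CTRL ... */
]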


>
>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
>> + if (global_status)
>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
> This seems especially silly, isn't the full MSR being written below? Or am I
> misunderstanding how these things work?

I think Jim's comment has already explained why we need to do this.

>
>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
>> +
>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>> + pmc = &pmu->gp_counters[i];
>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
>> + }
>> +
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>> + pmc = &pmu->fixed_counters[i];
>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
>> + }
>> }
>>
>> struct kvm_pmu_ops intel_pmu_ops __initdata = {
>> --
>> 2.34.1
>>

2024-04-15 06:07:10

by Mingwei Zhang

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
>
>
> On 4/13/2024 11:34 AM, Mingwei Zhang wrote:
> > On Sat, Apr 13, 2024, Mi, Dapeng wrote:
> >> On 4/12/2024 5:44 AM, Sean Christopherson wrote:
> >>> On Fri, Jan 26, 2024, Xiong Zhang wrote:
> >>>> From: Dapeng Mi <[email protected]>
> >>>>
> >>>> Implement the save/restore of PMU state for the passthrough PMU on Intel. In
> >>>> passthrough mode, KVM exclusively owns the PMU HW while control flow is in
> >>>> the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
> >>>> and gain full HW PMU ownership. Conversely, the host regains ownership of
> >>>> the PMU HW from KVM when control flow leaves the scope of the passthrough
> >>>> PMU.
> >>>>
> >>>> Implement PMU context switches for Intel CPUs and opportunistically use
> >>>> rdpmcl() instead of rdmsrl() when reading counters, since the former has
> >>>> lower latency on Intel CPUs.
> >>>>
> >>>> Co-developed-by: Mingwei Zhang <[email protected]>
> >>>> Signed-off-by: Mingwei Zhang <[email protected]>
> >>>> Signed-off-by: Dapeng Mi <[email protected]>
> >>>> ---
> >>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
> >>>> 1 file changed, 73 insertions(+)
> >>>>
> >>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> >>>> index 0d58fe7d243e..f79bebe7093d 100644
> >>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
> >>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> >>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
> >>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
> >>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
> >> Yeah. It looks clearer.
> >>>> {
> >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> >>>> + struct kvm_pmc *pmc;
> >>>> + u32 i;
> >>>> +
> >>>> + if (pmu->version != 2) {
> >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> >>>> + return;
> >>>> + }
> >>>> +
> >>>> + /* Global ctrl register is already saved at VM-exit. */
> >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> >>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> >>>> + if (pmu->global_status)
> >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> >>>> +
> >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> >>>> + pmc = &pmu->gp_counters[i];
> >>>> + rdpmcl(i, pmc->counter);
> >>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> >>>> + /*
> >>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> >>>> + * leakage and also avoid this guest GP counter get accidentally
> >>>> + * enabled during host running when host enable global ctrl.
> >>>> + */
> >>>> + if (pmc->eventsel)
> >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> >>>> + if (pmc->counter)
> >>>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
> >>> This doesn't make much sense. The kernel already has full access to the guest,
> >>> I don't see what is gained by zeroing out the MSRs just to hide them from perf.
> >> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> >> Consider this case: the guest uses GP counter 2, but the host doesn't use it.
> >> If the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
> >> unexpectedly on the host later, since host perf always enables all valid bits
> >> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
> >>
> >> Yeah, the clearing of the PMCx MSRs should be unnecessary.
> >>
> > Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
> > values to the host? NO. Not in cloud usage.
>
> No, this place is clearing the guest counter value, not the host
> counter value. The host always has a way to see the guest's values in a
> normal VM if it wants to. I don't see the necessity; it's just overkill
> and introduces extra overhead from writing MSRs.
>

I am curious how the perf subsystem solves this problem. Does the perf
subsystem in the host scrub only the selector, but not the counter
value, when doing a context switch?

>
> >
> > Please make changes to this patch with **extreme** caution.
> >
> > According to our past experience, if there is a bug somewhere,
> > there is a catch here (normally).
> >
> > Thanks.
> > -Mingwei
> >>> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
> >>> the event selector, we gots problems.
> >>>
> >>> Same thing for the fixed counters below. Can't this just be?
> >>>
> >>> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
> >>> rdpmcl(i, pmu->gp_counters[i].counter);
> >>>
> >>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> >>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
> >>> pmu->fixed_counters[i].counter);
> >>>
> >>>> + }
> >>>> +
> >>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> >>>> + /*
> >>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> >>>> + * also avoid these guest fixed counters get accidentially enabled
> >>>> + * during host running when host enable global ctrl.
> >>>> + */
> >>>> + if (pmu->fixed_ctr_ctrl)
> >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >>>> + pmc = &pmu->fixed_counters[i];
> >>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> >>>> + if (pmc->counter)
> >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> >>>> + }
> >>>> }
> >>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
> >>>> {
> >>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> >>>> + struct kvm_pmc *pmc;
> >>>> + u64 global_status;
> >>>> + int i;
> >>>> +
> >>>> + if (pmu->version != 2) {
> >>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> >>>> + return;
> >>>> + }
> >>>> +
> >>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
> >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> >>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
> >> As previous comments say, host perf always enables all counters in
> >> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to
> >> ensure all counters are in a disabled state, so that the later counter
> >> manipulation (writing MSRs) won't cause any race condition or unexpected
> >> behavior on HW.
> >>
> >>
> >>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> >>>> + if (global_status)
> >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
> >>> This seems especially silly, isn't the full MSR being written below? Or am I
> >>> misunderstanding how these things work?
> >> I think Jim's comment has already explained why we need to do this.
> >>
> >>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> >>>> +
> >>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> >>>> + pmc = &pmu->gp_counters[i];
> >>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
> >>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
> >>>> + }
> >>>> +
> >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> >>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >>>> + pmc = &pmu->fixed_counters[i];
> >>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
> >>>> + }
> >>>> }
> >>>> struct kvm_pmu_ops intel_pmu_ops __initdata = {
> >>>> --
> >>>> 2.34.1
> >>>>

2024-04-15 14:16:46

by Dapeng Mi

Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
> On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
>>
>> On 4/13/2024 11:34 AM, Mingwei Zhang wrote:
>>> On Sat, Apr 13, 2024, Mi, Dapeng wrote:
>>>> On 4/12/2024 5:44 AM, Sean Christopherson wrote:
>>>>> On Fri, Jan 26, 2024, Xiong Zhang wrote:
>>>>>> From: Dapeng Mi <[email protected]>
>>>>>>
>>>>>> Implement the save/restore of PMU state for the passthrough PMU on Intel. In
>>>>>> passthrough mode, KVM exclusively owns the PMU HW while control flow is in
>>>>>> the scope of the passthrough PMU. Thus, KVM needs to save the host PMU state
>>>>>> and gain full HW PMU ownership. Conversely, the host regains ownership of
>>>>>> the PMU HW from KVM when control flow leaves the scope of the passthrough
>>>>>> PMU.
>>>>>>
>>>>>> Implement PMU context switches for Intel CPUs and opportunistically use
>>>>>> rdpmcl() instead of rdmsrl() when reading counters, since the former has
>>>>>> lower latency on Intel CPUs.
>>>>>>
>>>>>> Co-developed-by: Mingwei Zhang <[email protected]>
>>>>>> Signed-off-by: Mingwei Zhang <[email protected]>
>>>>>> Signed-off-by: Dapeng Mi <[email protected]>
>>>>>> ---
>>>>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
>>>>>> 1 file changed, 73 insertions(+)
>>>>>>
>>>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>>>>> index 0d58fe7d243e..f79bebe7093d 100644
>>>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>>>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
>>>>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
>>>>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
>>>> Yeah. It looks clearer.
>>>>>> {
>>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>>>> + struct kvm_pmc *pmc;
>>>>>> + u32 i;
>>>>>> +
>>>>>> + if (pmu->version != 2) {
>>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + /* Global ctrl register is already saved at VM-exit. */
>>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
>>>>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
>>>>>> + if (pmu->global_status)
>>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
>>>>>> +
>>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>>>>>> + pmc = &pmu->gp_counters[i];
>>>>>> + rdpmcl(i, pmc->counter);
>>>>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
>>>>>> + /*
>>>>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
>>>>>> + * leakage and also avoid this guest GP counter get accidentally
>>>>>> + * enabled during host running when host enable global ctrl.
>>>>>> + */
>>>>>> + if (pmc->eventsel)
>>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
>>>>>> + if (pmc->counter)
>>>>>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
>>>>> This doesn't make much sense. The kernel already has full access to the guest,
>>>>> I don't see what is gained by zeroing out the MSRs just to hide them from perf.
>>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
>>>> Consider this case: the guest uses GP counter 2, but the host doesn't use it.
>>>> If the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
>>>> unexpectedly on the host later, since host perf always enables all valid bits
>>>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
>>>>
>>>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
>>>>
>>> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
>>> values to the host? NO. Not in cloud usage.
>> No, this place is clearing the guest counter value, not the host
>> counter value. The host always has a way to see the guest's values in a
>> normal VM if it wants to. I don't see the necessity; it's just overkill
>> and introduces extra overhead from writing MSRs.
>>
> I am curious how the perf subsystem solves this problem. Does the perf
> subsystem in the host scrub only the selector, but not the counter
> value, when doing a context switch?

When a context switch happens, the perf code schedules out the old
events and schedules in the new events. When scheduling out, the ENABLE
bit of the EVENTSELx MSR is cleared, and when scheduling in, the
EVENTSELx and PMCx MSRs are overwritten with the new event's attr.config
and sample_period respectively. Of course, this is only for the case
where there are new events to be programmed on the PMC. If there are no
new events, the PMCx MSR keeps its stale value and isn't cleared.

Anyway, I don't see any reason that the PMCx MSRs must be cleared.
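
[A simplified sketch of the pattern described above; the helper names are
illustrative, not the actual perf implementation:

        /* Scheduling out: only the enable bit is cleared. */
        static void sched_out_counter(int idx, u64 config)
        {
                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                       config & ~ARCH_PERFMON_EVENTSEL_ENABLE);
        }

        /* Scheduling in: both the selector and the counter are rewritten. */
        static void sched_in_counter(int idx, u64 config, u64 period)
        {
                /* Program -period so the counter counts up to overflow. */
                wrmsrl(MSR_IA32_PMC0 + idx, (u64)-period);
                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                       config | ARCH_PERFMON_EVENTSEL_ENABLE);
        }
]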



>
>>> Please make changes to this patch with **extreme** caution.
>>>
>>> According to our past experience, if there is a bug somewhere,
>>> there is a catch here (normally).
>>>
>>> Thanks.
>>> -Mingwei
>>>>> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
>>>>> the event selector, we gots problems.
>>>>>
>>>>> Same thing for the fixed counters below. Can't this just be?
>>>>>
>>>>> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
>>>>> rdpmcl(i, pmu->gp_counters[i].counter);
>>>>>
>>>>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
>>>>> rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
>>>>> pmu->fixed_counters[i].counter);
>>>>>
>>>>>> + }
>>>>>> +
>>>>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>>>>>> + /*
>>>>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
>>>>>> + * also avoid these guest fixed counters get accidentially enabled
>>>>>> + * during host running when host enable global ctrl.
>>>>>> + */
>>>>>> + if (pmu->fixed_ctr_ctrl)
>>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>>>> + pmc = &pmu->fixed_counters[i];
>>>>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
>>>>>> + if (pmc->counter)
>>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
>>>>>> + }
>>>>>> }
>>>>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
>>>>>> {
>>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>>>> + struct kvm_pmc *pmc;
>>>>>> + u64 global_status;
>>>>>> + int i;
>>>>>> +
>>>>>> + if (pmu->version != 2) {
>>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
>>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>>>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
>>>> As previous comments say, host perf always enables all counters in
>>>> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to
>>>> ensure all counters are in a disabled state, so that the later counter
>>>> manipulation (writing MSRs) won't cause any race condition or unexpected
>>>> behavior on HW.
>>>>
>>>>
>>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
>>>>>> + if (global_status)
>>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
>>>>> This seems especially silly, isn't the full MSR being written below? Or am I
>>>>> misunderstanding how these things work?
>>>> I think Jim's comment has already explained why we need to do this.
>>>>
>>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
>>>>>> +
>>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
>>>>>> + pmc = &pmu->gp_counters[i];
>>>>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
>>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
>>>>>> + }
>>>>>> +
>>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
>>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
>>>>>> + pmc = &pmu->fixed_counters[i];
>>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
>>>>>> + }
>>>>>> }
>>>>>> struct kvm_pmu_ops intel_pmu_ops __initdata = {
>>>>>> --
>>>>>> 2.34.1
>>>>>>

2024-04-15 16:45:40

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <[email protected]> wrote:
>
>
> On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
> > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
> >>
> >> On 4/13/2024 11:34 AM, Mingwei Zhang wrote:
> >>> On Sat, Apr 13, 2024, Mi, Dapeng wrote:
> >>>> On 4/12/2024 5:44 AM, Sean Christopherson wrote:
> >>>>> On Fri, Jan 26, 2024, Xiong Zhang wrote:
> >>>>>> From: Dapeng Mi <[email protected]>
> >>>>>>
> >>>>>> Implement the save/restore of PMU state for passthrough PMU in Intel. In
> >>>>>> passthrough mode, KVM owns exclusively the PMU HW when control flow goes to
> >>>>>> the scope of passthrough PMU. Thus, KVM needs to save the host PMU state
> >>>>>> and gains the full HW PMU ownership. On the contrary, host regains the
> >>>>>> ownership of PMU HW from KVM when control flow leaves the scope of
> >>>>>> passthrough PMU.
> >>>>>>
> >>>>>> Implement PMU context switches for Intel CPUs and opportunistically use
> >>>>>> rdpmcl() instead of rdmsrl() when reading counters since the former has
> >>>>>> lower latency in Intel CPUs.
> >>>>>>
> >>>>>> Co-developed-by: Mingwei Zhang <[email protected]>
> >>>>>> Signed-off-by: Mingwei Zhang <[email protected]>
> >>>>>> Signed-off-by: Dapeng Mi <[email protected]>
> >>>>>> ---
> >>>>>> arch/x86/kvm/vmx/pmu_intel.c | 73 ++++++++++++++++++++++++++++++++++++
> >>>>>> 1 file changed, 73 insertions(+)
> >>>>>>
> >>>>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> >>>>>> index 0d58fe7d243e..f79bebe7093d 100644
> >>>>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
> >>>>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> >>>>>> @@ -823,10 +823,83 @@ void intel_passthrough_pmu_msrs(struct kvm_vcpu *vcpu)
> >>>>>> static void intel_save_pmu_context(struct kvm_vcpu *vcpu)
> >>>>> I would prefer there be a "guest" in there somewhere, e.g. intel_save_guest_pmu_context().
> >>>> Yeah. It looks clearer.
> >>>>>> {
> >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> >>>>>> + struct kvm_pmc *pmc;
> >>>>>> + u32 i;
> >>>>>> +
> >>>>>> + if (pmu->version != 2) {
> >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> >>>>>> + return;
> >>>>>> + }
> >>>>>> +
> >>>>>> + /* Global ctrl register is already saved at VM-exit. */
> >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> >>>>>> + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> >>>>>> + if (pmu->global_status)
> >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> >>>>>> +
> >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> >>>>>> + pmc = &pmu->gp_counters[i];
> >>>>>> + rdpmcl(i, pmc->counter);
> >>>>>> + rdmsrl(i + MSR_ARCH_PERFMON_EVENTSEL0, pmc->eventsel);
> >>>>>> + /*
> >>>>>> + * Clear hardware PERFMON_EVENTSELx and its counter to avoid
> >>>>>> + * leakage and also avoid this guest GP counter getting accidentally
> >>>>>> + * enabled during host running when the host enables global ctrl.
> >>>>>> + */
> >>>>>> + if (pmc->eventsel)
> >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
> >>>>>> + if (pmc->counter)
> >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, 0);
> >>>>> This doesn't make much sense. The kernel already has full access to the guest,
> >>>>> I don't see what is gained by zeroing out the MSRs just to hide them from perf.
> >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> >>>> Consider this case: the Guest uses GP counter 2, but the Host doesn't use it. If
> >>>> the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
> >>>> unexpectedly on the host later, since host perf always enables all valid bits
> >>>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
> >>>>
> >>>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
> >>>>
> >>> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
> >>> values to the host? NO. Not in cloud usage.
> >> No, this place is clearing the guest counter value instead of the host
> >> counter value. Host always has a method to see the guest value in a normal
> >> VM if it wants. I don't see its necessity; it's just overkill and
> >> introduces extra overhead to write MSRs.
> >>
> > I am curious how the perf subsystem solves the problem. Does the perf
> > subsystem in the host only scrub the selector but not the counter
> > value when doing the context switch?
>
> When a context switch happens, the perf code would schedule out the old
> events and schedule in the new events. When scheduling out, the ENABLE bit
> of the EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx
> and PMCx MSRs would be overwritten with the new event's attr.config and
> sample_period respectively. Of course, this is only for the case when
> there are new events to be programmed on the PMC. If there are no new
> events, the PMCx MSR would keep its stale value and won't be cleared.
>
> Anyway, I don't see any reason that PMCx MSR must be cleared.
>

I don't have a strong opinion on the upstream version. But since both
the mediated vPMU and perf are clients of the PMU HW, leaving PMC values
uncleared when transitioning out of the vPMU boundary is technically
leaking info.

Alternatively, doing the clearing at the vcpu loop boundary should be
sufficient if the performance overhead is a concern.

Thanks
-Mingwei
>
>
> >
> >>> Please make changes to this patch with **extreme** caution.
> >>>
> >>> According to our past experience, if there is a bug somewhere,
> >>> there is a catch here (normally).
> >>>
> >>> Thanks.
> >>> -Mingwei
> >>>>> Similarly, if perf enables a counter via PERF_GLOBAL_CTRL without first restoring
> >>>>> the event selector, we've got problems.
> >>>>>
> >>>>> Same thing for the fixed counters below. Can't this just be?
> >>>>>
> >>>>> for (i = 0; i < pmu->nr_arch_gp_counters; i++)
> >>>>> 	rdpmcl(i, pmu->gp_counters[i].counter);
> >>>>>
> >>>>> for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
> >>>>> 	rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i,
> >>>>> 	       pmu->fixed_counters[i].counter);
> >>>>>
> >>>>>> + }
> >>>>>> +
> >>>>>> + rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> >>>>>> + /*
> >>>>>> + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> >>>>>> + * also avoid these guest fixed counters getting accidentally enabled
> >>>>>> + * during host running when the host enables global ctrl.
> >>>>>> + */
> >>>>>> + if (pmu->fixed_ctr_ctrl)
> >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >>>>>> + pmc = &pmu->fixed_counters[i];
> >>>>>> + rdpmcl(INTEL_PMC_FIXED_RDPMC_BASE | i, pmc->counter);
> >>>>>> + if (pmc->counter)
> >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
> >>>>>> + }
> >>>>>> }
> >>>>>> static void intel_restore_pmu_context(struct kvm_vcpu *vcpu)
> >>>>>> {
> >>>>>> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> >>>>>> + struct kvm_pmc *pmc;
> >>>>>> + u64 global_status;
> >>>>>> + int i;
> >>>>>> +
> >>>>>> + if (pmu->version != 2) {
> >>>>>> + pr_warn("only PerfMon v2 is supported for passthrough PMU");
> >>>>>> + return;
> >>>>>> + }
> >>>>>> +
> >>>>>> + /* Clear host global_ctrl and global_status MSR if non-zero. */
> >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> >>>>> Why? PERF_GLOBAL_CTRL will be auto-loaded at VM-Enter, why do it now?
> >>>> As previous comments say, host perf always enables all counters in
> >>>> PERF_GLOBAL_CTRL by default. The reason to clear PERF_GLOBAL_CTRL here is to
> >>>> ensure all counters are in a disabled state, so the later counter manipulation
> >>>> (writing MSRs) won't cause any race condition or unexpected behavior on the HW.
> >>>>
> >>>>
> >>>>>> + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> >>>>>> + if (global_status)
> >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status);
> >>>>> This seems especially silly, isn't the full MSR being written below? Or am I
> >>>>> misunderstanding how these things work?
> >>>> I think Jim's comment has already explained why we need to do this.
> >>>>
> >>>>>> + wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
> >>>>>> +
> >>>>>> + for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
> >>>>>> + pmc = &pmu->gp_counters[i];
> >>>>>> + wrmsrl(MSR_IA32_PMC0 + i, pmc->counter);
> >>>>>> + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
> >>>>>> + }
> >>>>>> +
> >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl);
> >>>>>> + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
> >>>>>> + pmc = &pmu->fixed_counters[i];
> >>>>>> + wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, pmc->counter);
> >>>>>> + }
> >>>>>> }
> >>>>>> struct kvm_pmu_ops intel_pmu_ops __initdata = {
> >>>>>> --
> >>>>>> 2.34.1
> >>>>>>

2024-04-15 17:39:08

by Sean Christopherson

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Mon, Apr 15, 2024, Mingwei Zhang wrote:
> On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <[email protected]> wrote:
> > On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
> > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
> > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> > >>>> Consider this case: the Guest uses GP counter 2, but the Host doesn't use it. If
> > >>>> the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
> > >>>> unexpectedly on the host later, since host perf always enables all valid bits
> > >>>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
> > >>>>
> > >>>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
> > >>>>
> > >>> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
> > >>> values to the host? NO. Not in cloud usage.
> > >> No, this place is clearing the guest counter value instead of the host
> > >> counter value. Host always has a method to see the guest value in a normal
> > >> VM if it wants. I don't see its necessity; it's just overkill and
> > >> introduces extra overhead to write MSRs.
> > >>
> > > I am curious how the perf subsystem solves the problem. Does the perf
> > > subsystem in the host only scrub the selector but not the counter
> > > value when doing the context switch?
> >
> > When a context switch happens, the perf code would schedule out the old
> > events and schedule in the new events. When scheduling out, the ENABLE bit
> > of the EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx
> > and PMCx MSRs would be overwritten with the new event's attr.config and
> > sample_period respectively. Of course, this is only for the case when
> > there are new events to be programmed on the PMC. If there are no new
> > events, the PMCx MSR would keep its stale value and won't be cleared.
> >
> > Anyway, I don't see any reason that PMCx MSR must be cleared.
> >
>
> > I don't have a strong opinion on the upstream version. But since both
> > the mediated vPMU and perf are clients of the PMU HW, leaving PMC values
> > uncleared when transitioning out of the vPMU boundary is technically
> > leaking info.

I'm not objecting to ensuring guest PMCs can't be read by any entity that's not
in the guest's TCB, which is what I would consider a true leak. I'm objecting
to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing
PMCs when saving guest state without coordinating with perf in any way.

I am ok if we start with (or default to) a "safe" implementation that zeroes all
PMCs when switching to host context, but I want KVM and perf to work together to
do the context switches, e.g. so that we don't end up with code where KVM writes
to all PMC MSRs and perf also immediately writes to all PMC MSRs.

One of my biggest complaints with the current vPMU code is that the roles and
responsibilities between KVM and perf are poorly defined, which leads to suboptimal
and hard to maintain code.

Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
state to userspace processes that have RDPMC permissions, as the PMCs might not
be dirty from perf's perspective (see perf_clear_dirty_counters()).

Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
or if KVM is just being paranoid.

2024-04-18 21:21:37

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Mon, Apr 15, 2024, Sean Christopherson wrote:
> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
> > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <[email protected]> wrote:
> > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
> > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
> > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> > > >>>> Consider this case: the Guest uses GP counter 2, but the Host doesn't use it. If
> > > >>>> the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
> > > >>>> unexpectedly on the host later, since host perf always enables all valid bits
> > > >>>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
> > > >>>>
> > > >>>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
> > > >>>>
> > > >>> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
> > > >>> values to the host? NO. Not in cloud usage.
> > > >> No, this place is clearing the guest counter value instead of the host
> > > >> counter value. Host always has a method to see the guest value in a normal
> > > >> VM if it wants. I don't see its necessity; it's just overkill and
> > > >> introduces extra overhead to write MSRs.
> > > >>
> > > > I am curious how the perf subsystem solves the problem. Does the perf
> > > > subsystem in the host only scrub the selector but not the counter
> > > > value when doing the context switch?
> > >
> > > When a context switch happens, the perf code would schedule out the old
> > > events and schedule in the new events. When scheduling out, the ENABLE bit
> > > of the EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx
> > > and PMCx MSRs would be overwritten with the new event's attr.config and
> > > sample_period respectively. Of course, this is only for the case when
> > > there are new events to be programmed on the PMC. If there are no new
> > > events, the PMCx MSR would keep its stale value and won't be cleared.
> > >
> > > Anyway, I don't see any reason that PMCx MSR must be cleared.
> > >
> >
> > > I don't have a strong opinion on the upstream version. But since both
> > > the mediated vPMU and perf are clients of the PMU HW, leaving PMC values
> > > uncleared when transitioning out of the vPMU boundary is technically
> > > leaking info.
>
> I'm not objecting to ensuring guest PMCs can't be read by any entity that's not
> in the guest's TCB, which is what I would consider a true leak. I'm objecting
> to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing
> PMCs when saving guest state without coordinating with perf in any way.

Agree, blindly clearing PMCs is the basic implementation. I am thinking
about what coordination between perf and KVM would look like as well.

>
> I am ok if we start with (or default to) a "safe" implementation that zeroes all
> PMCs when switching to host context, but I want KVM and perf to work together to
> do the context switches, e.g. so that we don't end up with code where KVM writes
> to all PMC MSRs and that perf also immediately writes to all PMC MSRs.

Sure. Point taken.
>
> > One of my biggest complaints with the current vPMU code is that the roles and
> responsibilities between KVM and perf are poorly defined, which leads to suboptimal
> and hard to maintain code.

Right.
>
> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
> state to userspace processes that have RDPMC permissions, as the PMCs might not
> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>

ah. This is a good point.

switch_mm_irqs_off() =>
  cr4_update_pce_mm() =>
    /*
     * Clear the existing dirty counters to
     * prevent the leak for an RDPMC task.
     */
    perf_clear_dirty_counters()

So perf does clear dirty counter values on process context switch. This
is nice to know.

perf_clear_dirty_counters() clears the counter values according to
cpuc->dirty, except for the assigned counters.
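For reference, a trimmed-down sketch of that function (based on
arch/x86/events/core.c; the fixed-counter validity checks are omitted
here):

	void perf_clear_dirty_counters(void)
	{
		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
		int i;

		/* Counters currently assigned to events are not dirty. */
		for (i = 0; i < cpuc->n_events; i++)
			__clear_bit(cpuc->assign[i], cpuc->dirty);

		for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
			if (i >= INTEL_PMC_IDX_FIXED)
				wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 +
				       (i - INTEL_PMC_IDX_FIXED), 0);
			else
				wrmsrl(x86_pmu_event_addr(i), 0);
		}

		bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
	}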

> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
> or if KVM is just being paranoid.

There is a difference between KVM and the perf subsystem on PMU context
switch. The latter has the notion of "perf_events", while the former
currently does not. It is quite hard for KVM to know which counters are
really "in use".

Another point I want to raise is that the KVM PMU context switch and the
perf PMU context switch happen at different times:

- The former is a context switch between guest/host state of the same
process, happening at the VM-enter/exit boundary.
- The latter is a context switch between two host-level processes.
- The former happens before the latter.
- Current design has no PMC partitioning between host/guest due to
arch limitation.

From the above, I feel that it might be impossible to combine them or to
add coordination, unless we do the KVM PMU context switch at the vcpu loop
boundary...

Thanks.
-Mingwei

2024-04-18 21:41:18

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 18, 2024, Mingwei Zhang wrote:
> On Mon, Apr 15, 2024, Sean Christopherson wrote:
> > On Mon, Apr 15, 2024, Mingwei Zhang wrote:
> > > On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <[email protected]> wrote:
> > > > On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
> > > > > On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
> > > > >>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
> > > > >>>> Considering this case, Guest uses GP counter 2, but Host doesn't use it. So
> > > > >>>> if the EVENTSEL2 MSR is not cleared here, the GP counter 2 would be enabled
> > > > >>>> unexpectedly on host later since Host perf always enable all validate bits
> > > > >>>> in PERF_GLOBAL_CTRL MSR. That would cause issues.
> > > > >>>>
> > > > >>>> Yeah, the clearing for PMCx MSR should be unnecessary .
> > > > >>>>
> > > > >>> Why is clearing for PMCx MSR unnecessary? Do we want to leaking counter
> > > > >>> values to the host? NO. Not in cloud usage.
> > > > >> No, this place is clearing the guest counter value instead of host
> > > > >> counter value. Host always has method to see guest value in a normal VM
> > > > >> if he want. I don't see its necessity, it's just a overkill and
> > > > >> introduce extra overhead to write MSRs.
> > > > >>
> > > > > I am curious how the perf subsystem solves the problem? Does perf
> > > > > subsystem in the host only scrubbing the selector but not the counter
> > > > > value when doing the context switch?
> > > >
> > > > When context switch happens, perf code would schedule out the old events
> > > > and schedule in the new events. When scheduling out, the ENABLE bit of
> > > > EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx
> > > > and PMCx MSRs would be overwritten with new event's attr.config and
> > > > sample_period separately. Of course, these is only for the case when
> > > > there are new events to be programmed on the PMC. If no new events, the
> > > > PMCx MSR would keep stall value and won't be cleared.
> > > >
> > > > Anyway, I don't see any reason that PMCx MSR must be cleared.
> > > >
> > >
> > > I don't have a strong opinion on the upstream version. But since both
> > > the mediated vPMU and perf are clients of PMU HW, leaving PMC values
> > > uncleared when transition out of the vPMU boundary is leaking info
> > > technically.
> >
> > I'm not objecting to ensuring guest PMCs can't be read by any entity that's not
> > in the guest's TCB, which is what I would consider a true leak. I'm objecting
> > to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing
> > PMCs when saving guest state without coordinating with perf in any way.
>
> Agree, blindly clearing PMCs is the basic implementation. I am thinking
> about what coordination between perf and KVM would look like as well.
>
> >
> > I am ok if we start with (or default to) a "safe" implementation that zeroes all
> > PMCs when switching to host context, but I want KVM and perf to work together to
> > do the context switches, e.g. so that we don't end up with code where KVM writes
> > to all PMC MSRs and that perf also immediately writes to all PMC MSRs.
>
> Sure. Point taken.
> >
> > One of my biggest complaints with the current vPMU code is that the roles and
> > responsibilities between KVM and perf are poorly defined, which leads to suboptimal
> > and hard to maintain code.
>
> Right.
> >
> > Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
> > state to userspace processes that have RDPMC permissions, as the PMCs might not
> > be dirty from perf's perspective (see perf_clear_dirty_counters()).
> >
>
> ah. This is a good point.
>
> switch_mm_irqs_off() =>
>   cr4_update_pce_mm() =>
>     /*
>      * Clear the existing dirty counters to
>      * prevent the leak for an RDPMC task.
>      */

FYI, for the elaboration of "an RDPMC task".

When you echo 2 > /sys/devices/cpu/rdpmc, the kernel sets CR4.PCE to 1.

Once that is done, the rdpmc instruction is no longer a privileged
instruction; all tasks are allowed to execute it in userspace.
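For illustration, a minimal userspace helper (the counter index is a
hypothetical example, GP counter 0; this only works when CR4.PCE permits
it, otherwise rdpmc faults):

	static inline unsigned long long rdpmc_read(unsigned int idx)
	{
		unsigned int lo, hi;

		/* Executes from userspace once CR4.PCE permits it. */
		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
		return ((unsigned long long)hi << 32) | lo;
	}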

Thanks.
-Mingwei
> perf_clear_dirty_counters()
>
> So perf does clear dirty counter values on process context switch. This
> is nice to know.
>
> perf_clear_dirty_counters() clears the counter values according to
> cpuc->dirty, except for the assigned counters.
>
> > Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
> > overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
> > or if KVM is just being paranoid.
>
> There is a difference between KVM and the perf subsystem on PMU context
> switch. The latter has the notion of "perf_events", while the former
> currently does not. It is quite hard for KVM to know which counters are
> really "in use".
>
> Another point I want to raise is that the KVM PMU context switch and the
> perf PMU context switch happen at different times:
>
> - The former is a context switch between guest/host state of the same
> process, happening at the VM-enter/exit boundary.
> - The latter is a context switch between two host-level processes.
> - The former happens before the latter.
> - Current design has no PMC partitioning between host/guest due to
> arch limitation.
>
> From the above, I feel that it might be impossible to combine them or to
> add coordination, unless we do the KVM PMU context switch at the vcpu loop
> boundary...
>
> Thanks.
> -Mingwei

2024-04-19 02:21:40

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/19/2024 5:21 AM, Mingwei Zhang wrote:
> On Mon, Apr 15, 2024, Sean Christopherson wrote:
>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>> On Mon, Apr 15, 2024 at 3:04 AM Mi, Dapeng <[email protected]> wrote:
>>>> On 4/15/2024 2:06 PM, Mingwei Zhang wrote:
>>>>> On Fri, Apr 12, 2024 at 9:25 PM Mi, Dapeng <[email protected]> wrote:
>>>>>>>> It's necessary to clear the EVENTSELx MSRs for both GP and fixed counters.
>>>>>>>> Consider this case: the Guest uses GP counter 2, but the Host doesn't use it. If
>>>>>>>> the EVENTSEL2 MSR is not cleared here, GP counter 2 would be enabled
>>>>>>>> unexpectedly on the host later, since host perf always enables all valid bits
>>>>>>>> in the PERF_GLOBAL_CTRL MSR. That would cause issues.
>>>>>>>>
>>>>>>>> Yeah, the clearing of the PMCx MSRs should be unnecessary.
>>>>>>>>
>>>>>>> Why is clearing the PMCx MSRs unnecessary? Do we want to leak counter
>>>>>>> values to the host? NO. Not in cloud usage.
>>>>>> No, this place is clearing the guest counter value instead of the host
>>>>>> counter value. Host always has a method to see the guest value in a normal
>>>>>> VM if it wants. I don't see its necessity; it's just overkill and
>>>>>> introduces extra overhead to write MSRs.
>>>>>>
>>>>> I am curious how the perf subsystem solves the problem. Does the perf
>>>>> subsystem in the host only scrub the selector but not the counter
>>>>> value when doing the context switch?
>>>> When a context switch happens, the perf code would schedule out the old
>>>> events and schedule in the new events. When scheduling out, the ENABLE bit
>>>> of the EVENTSELx MSR would be cleared, and when scheduling in, the EVENTSELx
>>>> and PMCx MSRs would be overwritten with the new event's attr.config and
>>>> sample_period respectively. Of course, this is only for the case when
>>>> there are new events to be programmed on the PMC. If there are no new
>>>> events, the PMCx MSR would keep its stale value and won't be cleared.
>>>>
>>>> Anyway, I don't see any reason that PMCx MSR must be cleared.
>>>>
>>> I don't have a strong opinion on the upstream version. But since both
>>> the mediated vPMU and perf are clients of the PMU HW, leaving PMC values
>>> uncleared when transitioning out of the vPMU boundary is technically
>>> leaking info.
>> I'm not objecting to ensuring guest PMCs can't be read by any entity that's not
>> in the guest's TCB, which is what I would consider a true leak. I'm objecting
>> to blindly clearing all PMCs, and more specifically objecting to *KVM* clearing
>> PMCs when saving guest state without coordinating with perf in any way.
> Agree, blindly clearing PMCs is the basic implementation. I am thinking
> about what coordination between perf and KVM would look like as well.
>
>> I am ok if we start with (or default to) a "safe" implementation that zeroes all
>> PMCs when switching to host context, but I want KVM and perf to work together to
>> do the context switches, e.g. so that we don't end up with code where KVM writes
>> to all PMC MSRs and that perf also immediately writes to all PMC MSRs.
> Sure. Point taken.
>> One of my biggest complaints with the current vPMU code is that the roles and
>> responsibilities between KVM and perf are poorly defined, which leads to suboptimal
>> and hard to maintain code.
> Right.
>> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
>> state to userspace processes that have RDPMC permissions, as the PMCs might not
>> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>>
> ah. This is a good point.
>
> switch_mm_irqs_off() =>
>   cr4_update_pce_mm() =>
>     /*
>      * Clear the existing dirty counters to
>      * prevent the leak for an RDPMC task.
>      */
>     perf_clear_dirty_counters()
>
> So perf does clear dirty counter values on process context switch. This
> is nice to know.
>
> perf_clear_dirty_counters() clears the counter values according to
> cpuc->dirty, except for the assigned counters.
>
>> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
>> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
>> or if KVM is just being paranoid.
> There is a difference between KVM and the perf subsystem on PMU context
> switch. The latter has the notion of "perf_events", while the former
> currently does not. It is quite hard for KVM to know which counters are
> really "in use".
>
> Another point I want to raise is that the KVM PMU context switch and the
> perf PMU context switch happen at different times:
>
> - The former is a context switch between guest/host state of the same
> process, happening at the VM-enter/exit boundary.
> - The latter is a context switch between two host-level processes.
> - The former happens before the latter.
> - Current design has no PMC partitioning between host/guest due to
> arch limitation.
>
> From the above, I feel that it might be impossible to combine them or to
> add coordination, unless we do the KVM PMU context switch at the vcpu loop
> boundary...

It seems there are two ways to clear the PMCx MSRs.

a) KVM clears the guest-only used counters' PMCx MSRs at VM exit when
saving the guest MSRs. This can be seen as a portion of the next-step
optimization in which only guest-used MSRs are saved and restored. It
would avoid saving/restoring unnecessary MSRs and decrease the performance
impact.

b) The perf subsystem clears these guest-used MSRs, but perf doesn't know
which MSRs are touched by the guest, so KVM has to provide an API to tell
the perf subsystem. Another issue is that perf does the clearing at task
context switch, which may be too late: the user can get the guest counter
value via the rdpmc instruction as long as the vCPU process is allowed to
use rdpmc from user space.

We had a rough internal talk on this. It seems option a) is the better
one, as it looks simpler and has a clearer boundary. We would implement it
together with the optimization mentioned in option a).
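A rough sketch of what option a) could look like on the save path, with a
hypothetical pmc_is_guest_used() helper standing in for the bookkeeping
that would track which counters the guest actually touched:

	static void intel_save_guest_pmu_context(struct kvm_vcpu *vcpu)
	{
		struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
		struct kvm_pmc *pmc;
		u32 i;

		for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
			pmc = &pmu->gp_counters[i];
			/* Hypothetical helper: skip counters the guest never used. */
			if (!pmc_is_guest_used(pmc))
				continue;
			rdpmcl(i, pmc->counter);
			rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, pmc->eventsel);
			/* Scrub only what the guest touched. */
			wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
			wrmsrl(MSR_IA32_PMC0 + i, 0);
		}
	}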


>
> Thanks.
> -Mingwei
>

2024-04-22 02:14:44

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/16 上午6:45, Sean Christopherson wrote:
> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <[email protected]> wrote:
>>> One of my biggest complaints with the current vPMU code is that the roles and
>>> responsibilities between KVM and perf are poorly defined, which leads to suboptimal
>>> and hard to maintain code.
>>>
>>> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
>>> state to userspace processes that have RDPMC permissions, as the PMCs might not
>>> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>>>
>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
>>> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
>>> or if KVM is just being paranoid.
>>
>> So once this rolls out, perf and vPMU are clients directly to PMU HW.
>
> I don't think this is a statement we want to make, as it opens a discussion
> that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need
> to be on equal footing with perf in terms of owning/managing PMU hardware, KVM
> just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU.
>
>> Faithful cleaning (blind cleaning) has to be the baseline
>> implementation, until both clients agree to a "deal" between them.
>> Currently, there is no such deal, but I believe we could have one via
>> future discussion.
>
> What I am saying is that there needs to be a "deal" in place before this code
> is merged. It doesn't need to be anything fancy, e.g. perf can still pave over
> PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily
> do the clearing. But perf and KVM need to work together from the get go, ie. I
> don't want KVM doing something without regard to what perf does, and vice versa.
>
There is a similar issue on LoongArch vPMU, where the VM can directly
access PMU hardware and the PMU HW is shared between guest and host.
Besides context switch, there are other places where the perf core will
access the PMU HW, such as the tick timer/hrtimer/IPI function call, and
KVM can only intercept the context switch.

Can we add a callback handler in structure kvm_guest_cbs? Just like this:
@@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
	.state			= kvm_guest_state,
	.get_ip			= kvm_guest_get_ip,
	.handle_intel_pt_intr	= NULL,
+	.lose_pmu		= kvm_guest_lose_pmu,
 };

By the way, I do not know whether the callback handler should be triggered
in the perf core or in the specific PMU HW driver. In the ARM PMU HW
driver, it is triggered in the PMU HW driver, in the function
kvm_vcpu_pmu_resync_el0, but I think it would be better if it were done in
the perf core.
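To make the idea concrete, a hypothetical sketch of the KVM side (neither
the .lose_pmu hook nor these helpers exist today; all names besides
kvm_get_running_vcpu() are made up):

	/*
	 * Hypothetical: perf would call this when the host needs the PMU
	 * back while a vCPU currently owns the hardware.
	 */
	static void kvm_guest_lose_pmu(void)
	{
		struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

		if (!vcpu)
			return;

		/* Hypothetical ownership flag and save helper. */
		if (vcpu->arch.pmu_hw_owned)
			kvm_pmu_save_guest_context(vcpu);
	}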

Regards
Bibo Mao


2024-04-23 01:02:11

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 上午1:01, Sean Christopherson wrote:
> On Mon, Apr 22, 2024, maobibo wrote:
>> On 2024/4/16 上午6:45, Sean Christopherson wrote:
>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson <[email protected]> wrote:
>>>>> One of my biggest complaints with the current vPMU code is that the roles and
>>>>> responsibilities between KVM and perf are poorly defined, which leads to suboptimal
>>>>> and hard to maintain code.
>>>>>
>>>>> Case in point, I'm pretty sure leaving guest values in PMCs _would_ leak guest
>>>>> state to userspace processes that have RDPMC permissions, as the PMCs might not
>>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>>>>>
>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing so makes the
>>>>> overall code brittle because it's not clear whether KVM _needs_ to clear PMCs,
>>>>> or if KVM is just being paranoid.
>>>>
>>>> So once this rolls out, perf and vPMU are clients directly to PMU HW.
>>>
>>> I don't think this is a statement we want to make, as it opens a discussion
>>> that we won't win. Nor do I think it's one we *need* to make. KVM doesn't need
>>> to be on equal footing with perf in terms of owning/managing PMU hardware, KVM
>>> just needs a few APIs to allow faithfully and accurately virtualizing a guest PMU.
>>>
>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>> implementation, until both clients agree to a "deal" between them.
>>>> Currently, there is no such deal, but I believe we could have one via
>>>> future discussion.
>>>
>>> What I am saying is that there needs to be a "deal" in place before this code
>>> is merged. It doesn't need to be anything fancy, e.g. perf can still pave over
>>> PMCs it doesn't immediately load, as opposed to using cpu_hw_events.dirty to lazily
>>> do the clearing. But perf and KVM need to work together from the get go, ie. I
>>> don't want KVM doing something without regard to what perf does, and vice versa.
>>>
>> There is a similar issue on LoongArch vPMU, where the VM can directly access
>> PMU hardware and the PMU HW is shared between guest and host. Besides context
>> switch, there are other places where the perf core will access the PMU HW,
>> such as the tick timer/hrtimer/IPI function call, and KVM can only intercept
>> the context switch.
>
> Two questions:
>
> 1) Can KVM prevent the guest from accessing the PMU?
>
> 2) If so, KVM can grant partial access to the PMU, or is it all or nothing?
>
> If the answer to both questions is "yes", then it sounds like LoongArch *requires*
> mediated/passthrough support in order to virtualize its PMU.

Hi Sean,

Thank for your quick response.

Yes, KVM can prevent the guest from accessing the PMU and can grant
partial or full access to the PMU. The catch is that if one PMU event is
granted to the VM, the host cannot access this PMU event again. There must
be a PMU event switch if the host wants to.

>
>> Can we add callback handler in structure kvm_guest_cbs? just like this:
>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs
>> = {
>> .state = kvm_guest_state,
>> .get_ip = kvm_guest_get_ip,
>> .handle_intel_pt_intr = NULL,
>> + .lose_pmu = kvm_guest_lose_pmu,
>> };
>>
>> By the way, I do not know whether the callback handler should be triggered in
>> the perf core or in the specific PMU HW driver. In the ARM PMU HW driver, it
>> is triggered in the PMU HW driver, in the function kvm_vcpu_pmu_resync_el0,
>> but I think it would be better if it were done in the perf core.
>
I don't think we want to take the approach of perf and KVM guests "fighting" over
the PMU. That's effectively what we have today, and it's a mess for KVM because
it's impossible to provide consistent, deterministic behavior for the guest. And
it's just as messy for perf, which ends up having weird, cumbersome flows that
exist purely to try to play nice with KVM.
With the existing PMU core code, the PMU HW may be accessed by the host in
a tick timer interrupt or an IPI function call interrupt while the VM is
running and the PMU is already granted to the guest. KVM cannot intercept
host IPI/timer interrupts, there is no PMU context switch, so there will be
a problem.

Regards
Bibo Mao


2024-04-23 02:55:32

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 上午10:44, Mi, Dapeng wrote:
>
> On 4/23/2024 9:01 AM, maobibo wrote:
>>
>>
>> On 2024/4/23 上午1:01, Sean Christopherson wrote:
>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote:
>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>> <[email protected]> wrote:
>>>>>>>>> One of my biggest complaints with the current vPMU code is that
>>>>>>> roles and
>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>> leads to suboptimal
>>>>>>> and hard to maintain code.
>>>>>>>
>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>> _would_ leak guest
>>>>>>> state to userspace processes that have RDPMC permissions, as the
>>>>>>> PMCs might not
>>>>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>>>>>>>
>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing
>>>>>>> so makes the
>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>> to clear PMCs,
>>>>>>> or if KVM is just being paranoid.
>>>>>>
>>>>>> So once this rolls out, perf and vPMU are clients directly to PMU HW.
>>>>>
>>>>> I don't think this is a statement we want to make, as it opens a
>>>>> discussion
>>>>> that we won't win.  Nor do I think it's one we *need* to make.  KVM
>>>>> doesn't need
>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>> hardware, KVM
>>>>> just needs a few APIs to allow faithfully and accurately
>>>>> virtualizing a guest PMU.
>>>>>
>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>> Currently, there is no such deal, but I believe we could have one via
>>>>>> future discussion.
>>>>>
>>>>> What I am saying is that there needs to be a "deal" in place before
>>>>> this code
>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>> still pave over
>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>> cpu_hw_events.dirty to lazily
>>>>> do the clearing.  But perf and KVM need to work together from the
>>>>> get go, ie. I
>>>>> don't want KVM doing something without regard to what perf does,
>>>>> and vice versa.
>>>>>
>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>> hardware
>>>> and pmu hw is shard with guest and host. Besides context switch
>>>> there are
>>>> other places where perf core will access pmu hw, such as tick
>>>> timer/hrtimer/ipi function call, and KVM can only intercept context
>>>> switch.
>>>
>>> Two questions:
>>>
>>>   1) Can KVM prevent the guest from accessing the PMU?
>>>
>>>   2) If so, KVM can grant partial access to the PMU, or is it all or
>>> nothing?
>>>
>>> If the answer to both questions is "yes", then it sounds like
>>> LoongArch *requires*
>>> mediated/passthrough support in order to virtualize its PMU.
>>
>> Hi Sean,
>>
>> Thank for your quick response.
>>
>> yes, kvm can prevent guest from accessing the PMU and grant partial or
>> all to access to the PMU. Only that if one pmu event is granted to VM,
>> host can not access this pmu event again. There must be pmu event
>> switch if host want to.
>
> PMU event is a software entity which won't be shared. You mean that if a
> PMU HW counter is granted to a VM, then the Host can't access that PMU HW
> counter, right?
Yes, if the PMU HW counter/control is granted to the VM. The value comes
from the guest and is not meaningful for the host. The host PMU core does
not know that it is granted to the VM; the host still thinks that it owns
the PMU.

Just like the FPU registers, it is shared by the VM and host at different
times and is lazily switched. But if an IPI or timer interrupt uses the FPU
registers on the host, there will be the same issue.

Regards
Bibo Mao
>
>
>>
>>>
>>>> Can we add callback handler in structure kvm_guest_cbs?  just like
>>>> this:
>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>> kvm_guest_cbs
>>>> = {
>>>>          .state                  = kvm_guest_state,
>>>>          .get_ip                 = kvm_guest_get_ip,
>>>>          .handle_intel_pt_intr   = NULL,
>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>   };
>>>>
>>>> By the way, I do not know should the callback handler be triggered
>>>> in perf
>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>> triggered in
>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>> but I think it will be better if it is done in perf core.
>>>
>>> I don't think we want to take the approach of perf and KVM guests
>>> "fighting" over
>>> the PMU.  That's effectively what we have today, and it's a mess for
>>> KVM because
>>> it's impossible to provide consistent, deterministic behavior for the
>>> guest.  And
>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>> flows that
>>> exists purely to try to play nice with KVM.
>> With existing pmu core code, in tick timer interrupt or IPI function
>> call interrupt pmu hw may be accessed by host when VM is running and
>> pmu is already granted to guest. KVM can not intercept host IPI/timer
>> interrupt, there is no pmu context switch, there will be problem.
>>
>> Regards
>> Bibo Mao
>>


2024-04-23 03:26:40

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 上午11:13, Mi, Dapeng wrote:
>
> On 4/23/2024 10:53 AM, maobibo wrote:
>>
>>
>> On 2024/4/23 上午10:44, Mi, Dapeng wrote:
>>>
>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>
>>>>
>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote:
>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote:
>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>> <[email protected]> wrote:
>>>>>>>>> One of my biggest complaints with the current vPMU code is that
>>>>>>>>> the roles and
>>>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>>>> leads to suboptimal
>>>>>>>>> and hard to maintain code.
>>>>>>>>>
>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>> _would_ leak guest
>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>> the PMCs might not
>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>
>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>> doing so makes the
>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>>>> to clear PMCs,
>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>
>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>> PMU HW.
>>>>>>>
>>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>>> discussion
>>>>>>> that we won't win.  Nor do I think it's one we *need* to make.
>>>>>>> KVM doesn't need
>>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>>> hardware, KVM
>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>> virtualizing a guest PMU.
>>>>>>>
>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>> one via
>>>>>>>> future discussion.
>>>>>>>
>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>> before this code
>>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>>>> still pave over
>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>> do the clearing.  But perf and KVM need to work together from the
>>>>>>> get go, ie. I
>>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>>> and vice versa.
>>>>>>>
>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>>>> hardware
>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>> there are
>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>> context switch.
>>>>>
>>>>> Two questions:
>>>>>
>>>>>   1) Can KVM prevent the guest from accessing the PMU?
>>>>>
>>>>>   2) If so, KVM can grant partial access to the PMU, or is it all
>>>>> or nothing?
>>>>>
>>>>> If the answer to both questions is "yes", then it sounds like
>>>>> LoongArch *requires*
>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>
>>>> Hi Sean,
>>>>
>>>> Thank for your quick response.
>>>>
>>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>>> or all to access to the PMU. Only that if one pmu event is granted
>>>> to VM, host can not access this pmu event again. There must be pmu
>>>> event switch if host want to.
>>>
>>> PMU event is a software entity which won't be shared. did you mean if
>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>>> counter, right?
>> yes, if PMU HW counter/control is granted to VM. The value comes from
>> guest, and is not meaningful for host.  Host pmu core does not know
>> that it is granted to VM, host still think that it owns pmu.
>
> That's one issue this patchset tries to solve. The current new mediated x86
> vPMU framework doesn't allow the Host and Guest to own the PMU HW resource
> simultaneously. Only when there is no !exclude_guest event on the host is
> the guest allowed to exclusively own the PMU HW resource.
>
>
>>
>> Just like FPU register, it is shared by VM and host during different
>> time and it is lately switched. But if IPI or timer interrupt uses FPU
>> register on host, there will be the same issue.
>
> I didn't fully get your point. When an IPI or timer interrupt arrives, a
> VM-exit is triggered to make the CPU trap into the host first, and then the
> host interrupt handler is called. Or are you complaining about the executing
> sequence of switching the guest PMU MSRs and these interrupt handlers?
It is not necessary to save/restore the PMU HW at every VM exit; it had
better be lazily saved/restored, such as only when the vCPU thread is
sched-out/sched-in, else the cost will be a little expensive.

I know little about the perf core. However, there is PMU HW access in
interrupt context. That means PMU HW access should be done with IRQs
disabled in normal context, else there may be nested PMU HW access. Is that
true?

>
>
>>
>> Regards
>> Bibo Mao
>>>
>>>
>>>>
>>>>>
>>>>>> Can we add callback handler in structure kvm_guest_cbs?  just like
>>>>>> this:
>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>> kvm_guest_cbs
>>>>>> = {
>>>>>>          .state                  = kvm_guest_state,
>>>>>>          .get_ip                 = kvm_guest_get_ip,
>>>>>>          .handle_intel_pt_intr   = NULL,
>>>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>>>   };
>>>>>>
>>>>>> By the way, I do not know should the callback handler be triggered
>>>>>> in perf
>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>> triggered in
>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>> but I think it will be better if it is done in perf core.
>>>>>
>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>> "fighting" over
>>>>> the PMU.  That's effectively what we have today, and it's a mess
>>>>> for KVM because
>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>> the guest.  And
>>>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>>>> flows that
>>>>> exists purely to try to play nice with KVM.
>>>> With existing pmu core code, in tick timer interrupt or IPI function
>>>> call interrupt pmu hw may be accessed by host when VM is running and
>>>> pmu is already granted to guest. KVM can not intercept host
>>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>>> problem.
>>>>
>>>> Regards
>>>> Bibo Mao
>>>>
>>


2024-04-23 03:55:54

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 上午11:13, Mi, Dapeng wrote:
>
> On 4/23/2024 10:53 AM, maobibo wrote:
>>
>>
>> On 2024/4/23 上午10:44, Mi, Dapeng wrote:
>>>
>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>
>>>>
>>>> On 2024/4/23 上午1:01, Sean Christopherson wrote:
>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote:
>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>> <[email protected]> wrote:
>>>>>>>>> One of my biggest complaints with the current vPMU code is that
>>>>>>>>> the roles and
>>>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>>>> leads to suboptimal
>>>>>>>>> and hard to maintain code.
>>>>>>>>>
>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>> _would_ leak guest
>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>> the PMCs might not
>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>
>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>> doing so makes the
>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>>>> to clear PMCs,
>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>
>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>> PMU HW.
>>>>>>>
>>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>>> discussion
>>>>>>> that we won't win.  Nor do I think it's one we *need* to make.
>>>>>>> KVM doesn't need
>>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>>> hardware, KVM
>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>> virtualizing a guest PMU.
>>>>>>>
>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>> one via
>>>>>>>> future discussion.
>>>>>>>
>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>> before this code
>>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>>>> still pave over
>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>> do the clearing.  But perf and KVM need to work together from the
>>>>>>> get go, ie. I
>>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>>> and vice versa.
>>>>>>>
>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>>>> hardware
>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>> there are
>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>> context switch.
>>>>>
>>>>> Two questions:
>>>>>
>>>>>   1) Can KVM prevent the guest from accessing the PMU?
>>>>>
>>>>>   2) If so, KVM can grant partial access to the PMU, or is it all
>>>>> or nothing?
>>>>>
>>>>> If the answer to both questions is "yes", then it sounds like
>>>>> LoongArch *requires*
>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>
>>>> Hi Sean,
>>>>
>>>> Thank for your quick response.
>>>>
>>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>>> or all to access to the PMU. Only that if one pmu event is granted
>>>> to VM, host can not access this pmu event again. There must be pmu
>>>> event switch if host want to.
>>>
>>> PMU event is a software entity which won't be shared. did you mean if
>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>>> counter, right?
>> yes, if PMU HW counter/control is granted to VM. The value comes from
>> guest, and is not meaningful for host.  Host pmu core does not know
>> that it is granted to VM, host still think that it owns pmu.
>
> That's one issue this patchset tries to solve. The current new mediated x86
> vPMU framework doesn't allow the Host and Guest to own the PMU HW resource
> simultaneously. Only when there is no !exclude_guest event on the host is
> the guest allowed to exclusively own the PMU HW resource.
>
>
>>
>> Just like FPU register, it is shared by VM and host during different
>> time and it is lately switched. But if IPI or timer interrupt uses FPU
>> register on host, there will be the same issue.
>
> I didn't fully get your point. When an IPI or timer interrupt arrives, a
> VM-exit is triggered to make the CPU trap into the host first, and then the
Yes, it is.

> host interrupt handler is called. Or are you complaining about the executing
> sequence of switching the guest PMU MSRs and these interrupt handlers?
In our vPMU implementation, it is OK if the vPMU is switched in the VM-exit
path; however, there is a problem if the vPMU is switched during the vCPU
thread sched-out/sched-in path, since IPI/timer IRQ interrupts access PMU
registers in host mode.

In general, it will be better if the switch is done at vCPU thread
sched-out/sched-in, else there is a requirement to profile the KVM
hypervisor. Even if there is such a requirement, it is only one option. In
most conditions, it will be better if the time of VM context exit is small.

>
>
>>
>> Regards
>> Bibo Mao
>>>
>>>
>>>>
>>>>>
>>>>>> Can we add callback handler in structure kvm_guest_cbs?  just like
>>>>>> this:
>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>> kvm_guest_cbs
>>>>>> = {
>>>>>>          .state                  = kvm_guest_state,
>>>>>>          .get_ip                 = kvm_guest_get_ip,
>>>>>>          .handle_intel_pt_intr   = NULL,
>>>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>>>   };
>>>>>>
>>>>>> By the way, I do not know should the callback handler be triggered
>>>>>> in perf
>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>> triggered in
>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>> but I think it will be better if it is done in perf core.
>>>>>
>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>> "fighting" over
>>>>> the PMU.  That's effectively what we have today, and it's a mess
>>>>> for KVM because
>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>> the guest.  And
>>>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>>>> flows that
>>>>> exists purely to try to play nice with KVM.
>>>> With existing pmu core code, in tick timer interrupt or IPI function
>>>> call interrupt pmu hw may be accessed by host when VM is running and
>>>> pmu is already granted to guest. KVM can not intercept host
>>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>>> problem.
>>>>
>>>> Regards
>>>> Bibo Mao
>>>>
>>


2024-04-23 04:24:30

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Mon, Apr 22, 2024 at 8:55 PM maobibo <[email protected]> wrote:
>
>
>
> On 2024/4/23 上午11:13, Mi, Dapeng wrote:
> >
> > On 4/23/2024 10:53 AM, maobibo wrote:
> >>
> >>
> >> On 2024/4/23 上午10:44, Mi, Dapeng wrote:
> >>>
> >>> On 4/23/2024 9:01 AM, maobibo wrote:
> >>>>
> >>>>
> >>>> On 2024/4/23 上午1:01, Sean Christopherson wrote:
> >>>>> On Mon, Apr 22, 2024, maobibo wrote:
> >>>>>> On 2024/4/16 上午6:45, Sean Christopherson wrote:
> >>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
> >>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
> >>>>>>>> <[email protected]> wrote:
> >>>>>>>>> One of my biggest complaints with the current vPMU code is that
> >>>>>>>>> the roles and
> >>>>>>>>> responsibilities between KVM and perf are poorly defined, which
> >>>>>>>>> leads to suboptimal
> >>>>>>>>> and hard to maintain code.
> >>>>>>>>>
> >>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
> >>>>>>>>> _would_ leak guest
> >>>>>>>>> state to userspace processes that have RDPMC permissions, as
> >>>>>>>>> the PMCs might not
> >>>>>>>>> be dirty from perf's perspective (see
> >>>>>>>>> perf_clear_dirty_counters()).
> >>>>>>>>>
> >>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
> >>>>>>>>> doing so makes the
> >>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
> >>>>>>>>> to clear PMCs,
> >>>>>>>>> or if KVM is just being paranoid.
> >>>>>>>>
> >>>>>>>> So once this rolls out, perf and vPMU are clients directly to
> >>>>>>>> PMU HW.
> >>>>>>>
> >>>>>>> I don't think this is a statement we want to make, as it opens a
> >>>>>>> discussion
> >>>>>>> that we won't win. Nor do I think it's one we *need* to make.
> >>>>>>> KVM doesn't need
> >>>>>>> to be on equal footing with perf in terms of owning/managing PMU
> >>>>>>> hardware, KVM
> >>>>>>> just needs a few APIs to allow faithfully and accurately
> >>>>>>> virtualizing a guest PMU.
> >>>>>>>
> >>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
> >>>>>>>> implementation, until both clients agree to a "deal" between them.
> >>>>>>>> Currently, there is no such deal, but I believe we could have
> >>>>>>>> one via
> >>>>>>>> future discussion.
> >>>>>>>
> >>>>>>> What I am saying is that there needs to be a "deal" in place
> >>>>>>> before this code
> >>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can
> >>>>>>> still pave over
> >>>>>>> PMCs it doesn't immediately load, as opposed to using
> >>>>>>> cpu_hw_events.dirty to lazily
> >>>>>>> do the clearing. But perf and KVM need to work together from the
> >>>>>>> get go, ie. I
> >>>>>>> don't want KVM doing something without regard to what perf does,
> >>>>>>> and vice versa.
> >>>>>>>
> >>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
> >>>>>> hardware
> >>>>>> and pmu hw is shard with guest and host. Besides context switch
> >>>>>> there are
> >>>>>> other places where perf core will access pmu hw, such as tick
> >>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
> >>>>>> context switch.
> >>>>>
> >>>>> Two questions:
> >>>>>
> >>>>> 1) Can KVM prevent the guest from accessing the PMU?
> >>>>>
> >>>>> 2) If so, KVM can grant partial access to the PMU, or is it all
> >>>>> or nothing?
> >>>>>
> >>>>> If the answer to both questions is "yes", then it sounds like
> >>>>> LoongArch *requires*
> >>>>> mediated/passthrough support in order to virtualize its PMU.
> >>>>
> >>>> Hi Sean,
> >>>>
> >>>> Thank for your quick response.
> >>>>
> >>>> yes, kvm can prevent guest from accessing the PMU and grant partial
> >>>> or all to access to the PMU. Only that if one pmu event is granted
> >>>> to VM, host can not access this pmu event again. There must be pmu
> >>>> event switch if host want to.
> >>>
> >>> PMU event is a software entity which won't be shared. did you mean if
> >>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
> >>> counter, right?
> >> yes, if PMU HW counter/control is granted to VM. The value comes from
> >> guest, and is not meaningful for host. Host pmu core does not know
> >> that it is granted to VM, host still think that it owns pmu.
> >
> > That's one issue this patchset tries to solve. Current new mediated x86
> > vPMU framework doesn't allow Host or Guest own the PMU HW resource
> > simultaneously. Only when there is no !exclude_guest event on host,
> > guest is allowed to exclusively own the PMU HW resource.
> >
> >
> >>
> >> Just like FPU register, it is shared by VM and host during different
> >> time and it is lately switched. But if IPI or timer interrupt uses FPU
> >> register on host, there will be the same issue.
> >
> > I didn't fully get your point. When IPI or timer interrupt reach, a
> > VM-exit is triggered to make CPU traps into host first and then the host
> yes, it is.

This is correct. And this is one of the points we debated internally:
whether we should do the PMU context switch at the vcpu loop boundary or at
the VM Enter/Exit boundary. A (host-level) timer interrupt can force a VM
Exit, which I think happens every 4 ms or 1 ms, depending on configuration.

One of the key reasons we currently propose this is that it is the same
boundary as the legacy PMU, i.e., it is simple to propose from the perf
subsystem perspective.

Performance-wise, doing the PMU context switch at the vcpu boundary would
be way better in general. But the downside is that the perf subsystem loses
the capability to profile the majority of the KVM code (functions) when the
guest PMU is enabled.

>
> > interrupt handler is called. Or are you complaining the executing
> > sequence of switching guest PMU MSRs and these interrupt handler?
> In our vPMU implementation, it is OK if the vPMU is switched in the vm exit
> path; however, there is a problem if the vPMU is switched during the vcpu
> thread sched-out/sched-in path, since the IPI/timer irq handlers access pmu
> registers in host mode.

Oh, the IPI/timer irq handlers will access PMU registers? I thought
only the host-level NMI handler would access the PMU MSRs, since the
PMI is registered as an NMI.

In that case, you should disable IRQs during the vcpu context switch. For
NMIs, we prevent the handler from accessing the PMU registers. In
particular, we use a per-cpu variable to guard that, so the host-level
PMI handler for the perf subsystem checks the variable before proceeding.
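
A minimal sketch of that scheme, assuming hypothetical helper and variable
names (only the per-cpu guard idea itself is from this discussion):

#include <linux/percpu.h>
#include <linux/nmi.h>

static DEFINE_PER_CPU(bool, guest_owns_pmu);

/* Run with IRQs disabled so no IPI/timer handler can race the switch. */
static void vcpu_pmu_switch(struct kvm_vcpu *vcpu, bool to_guest)
{
	unsigned long flags;

	local_irq_save(flags);
	__this_cpu_write(guest_owns_pmu, to_guest);
	if (to_guest)
		load_guest_pmu_state(vcpu);	/* hypothetical helper */
	else
		save_guest_pmu_state(vcpu);	/* hypothetical helper */
	local_irq_restore(flags);
}

/* Host PMI (NMI) handler checks the guard before touching PMU MSRs. */
static int host_pmi_handler(unsigned int type, struct pt_regs *regs)
{
	if (__this_cpu_read(guest_owns_pmu))
		return NMI_DONE;		/* PMU belongs to the guest */
	/* ... normal perf PMI processing ... */
	return NMI_HANDLED;
}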

>
> In general it will be better if the switch is done in the vcpu thread
> sched-out/sched-in path, unless there is a requirement to profile the kvm
> hypervisor. Even if there is such a requirement, it is only one option. In
> most conditions, it will be better if the VM context exit time is small.
>
Performance-wise, agreed, but there will be debate about the loss of perf
functionality at the host level.

Maybe (just maybe) it is possible to do the PMU context switch at the vcpu
boundary normally, but do it at the VM Enter/Exit boundary when the host is
profiling the KVM kernel module. So, dynamically adjusting the PMU context
switch location could be an option.
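
A rough illustration of that dynamic-boundary idea (purely a sketch; none
of these names exist in the posted patches):

enum pmu_switch_boundary {
	PMU_SWITCH_AT_SCHED,		/* vcpu thread sched-in/sched-out */
	PMU_SWITCH_AT_VM_ENTER_EXIT,	/* every VM Enter/Exit */
};

static enum pmu_switch_boundary
pick_pmu_switch_boundary(bool host_profiling_kvm)
{
	/* Pay the per-exit switching cost only while perf profiles KVM. */
	return host_profiling_kvm ? PMU_SWITCH_AT_VM_ENTER_EXIT :
				    PMU_SWITCH_AT_SCHED;
}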

> >
> >
> >>
> >> Regards
> >> Bibo Mao
> >>>
> >>>
> >>>>
> >>>>>
> >>>>>> Can we add callback handler in structure kvm_guest_cbs? just like
> >>>>>> this:
> >>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
> >>>>>> kvm_guest_cbs
> >>>>>> = {
> >>>>>> .state = kvm_guest_state,
> >>>>>> .get_ip = kvm_guest_get_ip,
> >>>>>> .handle_intel_pt_intr = NULL,
> >>>>>> + .lose_pmu = kvm_guest_lose_pmu,
> >>>>>> };
> >>>>>>
> >>>>>> By the way, I do not know should the callback handler be triggered
> >>>>>> in perf
> >>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
> >>>>>> triggered in
> >>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
> >>>>>> but I think it will be better if it is done in perf core.
> >>>>>
> >>>>> I don't think we want to take the approach of perf and KVM guests
> >>>>> "fighting" over
> >>>>> the PMU. That's effectively what we have today, and it's a mess
> >>>>> for KVM because
> >>>>> it's impossible to provide consistent, deterministic behavior for
> >>>>> the guest. And
> >>>>> it's just as messy for perf, which ends up having wierd, cumbersome
> >>>>> flows that
> >>>>> exists purely to try to play nice with KVM.
> >>>> With existing pmu core code, in tick timer interrupt or IPI function
> >>>> call interrupt pmu hw may be accessed by host when VM is running and
> >>>> pmu is already granted to guest. KVM can not intercept host
> >>>> IPI/timer interrupt, there is no pmu context switch, there will be
> >>>> problem.
> >>>>
> >>>> Regards
> >>>> Bibo Mao
> >>>>
> >>
>

2024-04-23 05:11:47

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/23/2024 9:01 AM, maobibo wrote:
>
>
> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>> On Mon, Apr 22, 2024, maobibo wrote:
>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>> <[email protected]> wrote:
>>>>>> One my biggest complaints with the current vPMU code is that the
>>>>>> roles and
>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>> leads to suboptimal
>>>>>> and hard to maintain code.
>>>>>>
>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>> _would_ leak guest
>>>>>> state to userspace processes that have RDPMC permissions, as the
>>>>>> PMCs might not
>>>>>> be dirty from perf's perspective (see perf_clear_dirty_counters()).
>>>>>>
>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in doing
>>>>>> so makes the
>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>> to clear PMCs,
>>>>>> or if KVM is just being paranoid.
>>>>>
>>>>> So once this rolls out, perf and vPMU are clients directly to PMU HW.
>>>>
>>>> I don't think this is a statement we want to make, as it opens a
>>>> discussion
>>>> that we won't win.  Nor do I think it's one we *need* to make.  KVM
>>>> doesn't need
>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>> hardware, KVM
>>>> just needs a few APIs to allow faithfully and accurately
>>>> virtualizing a guest PMU.
>>>>
>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>> implementation, until both clients agree to a "deal" between them.
>>>>> Currently, there is no such deal, but I believe we could have one via
>>>>> future discussion.
>>>>
>>>> What I am saying is that there needs to be a "deal" in place before
>>>> this code
>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>> still pave over
>>>> PMCs it doesn't immediately load, as opposed to using
>>>> cpu_hw_events.dirty to lazily
>>>> do the clearing.  But perf and KVM need to work together from the
>>>> get go, ie. I
>>>> don't want KVM doing something without regard to what perf does,
>>>> and vice versa.
>>>>
>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>> hardware
>>> and pmu hw is shard with guest and host. Besides context switch
>>> there are
>>> other places where perf core will access pmu hw, such as tick
>>> timer/hrtimer/ipi function call, and KVM can only intercept context
>>> switch.
>>
>> Two questions:
>>
>>   1) Can KVM prevent the guest from accessing the PMU?
>>
>>   2) If so, KVM can grant partial access to the PMU, or is it all or
>> nothing?
>>
>> If the answer to both questions is "yes", then it sounds like
>> LoongArch *requires*
>> mediated/passthrough support in order to virtualize its PMU.
>
> Hi Sean,
>
> Thank for your quick response.
>
> yes, kvm can prevent the guest from accessing the PMU and can grant
> partial or full access to the PMU. Only, if one pmu event is granted to a
> VM, the host can not access this pmu event again. There must be a pmu
> event switch if the host wants to.

A PMU event is a software entity which won't be shared. You mean that if a
PMU HW counter is granted to a VM, then the host can't access that PMU HW
counter, right?


>
>>
>>> Can we add callback handler in structure kvm_guest_cbs?  just like
>>> this:
>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>> kvm_guest_cbs
>>> = {
>>>          .state                  = kvm_guest_state,
>>>          .get_ip                 = kvm_guest_get_ip,
>>>          .handle_intel_pt_intr   = NULL,
>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>   };
>>>
>>> By the way, I do not know should the callback handler be triggered
>>> in perf
>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>> triggered in
>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>> but I think it will be better if it is done in perf core.
>>
>> I don't think we want to take the approach of perf and KVM guests
>> "fighting" over
>> the PMU.  That's effectively what we have today, and it's a mess for
>> KVM because
>> it's impossible to provide consistent, deterministic behavior for the
>> guest.  And
>> it's just as messy for perf, which ends up having wierd, cumbersome
>> flows that
>> exists purely to try to play nice with KVM.
> With existing pmu core code, in tick timer interrupt or IPI function
> call interrupt pmu hw may be accessed by host when VM is running and
> pmu is already granted to guest. KVM can not intercept host IPI/timer
> interrupt, there is no pmu context switch, there will be problem.
>
> Regards
> Bibo Mao
>

2024-04-23 06:09:22

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 12:23 PM, Mingwei Zhang wrote:
> On Mon, Apr 22, 2024 at 8:55 PM maobibo <[email protected]> wrote:
>>
>>
>>
>> On 2024/4/23 11:13 AM, Mi, Dapeng wrote:
>>>
>>> On 4/23/2024 10:53 AM, maobibo wrote:
>>>>
>>>>
>>>> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>>>>
>>>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>>>>> the roles and
>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>>>>>> leads to suboptimal
>>>>>>>>>>> and hard to maintain code.
>>>>>>>>>>>
>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>>>> _would_ leak guest
>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>>>> the PMCs might not
>>>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>>>
>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>>>> doing so makes the
>>>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>>>>>> to clear PMCs,
>>>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>>>
>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>>>> PMU HW.
>>>>>>>>>
>>>>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>>>>> discussion
>>>>>>>>> that we won't win. Nor do I think it's one we *need* to make.
>>>>>>>>> KVM doesn't need
>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>>>>> hardware, KVM
>>>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>>>> virtualizing a guest PMU.
>>>>>>>>>
>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>>>> one via
>>>>>>>>>> future discussion.
>>>>>>>>>
>>>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>>>> before this code
>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can
>>>>>>>>> still pave over
>>>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>>>> do the clearing. But perf and KVM need to work together from the
>>>>>>>>> get go, ie. I
>>>>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>>>>> and vice versa.
>>>>>>>>>
>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>>>>>> hardware
>>>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>>>> there are
>>>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>>>> context switch.
>>>>>>>
>>>>>>> Two questions:
>>>>>>>
>>>>>>> 1) Can KVM prevent the guest from accessing the PMU?
>>>>>>>
>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all
>>>>>>> or nothing?
>>>>>>>
>>>>>>> If the answer to both questions is "yes", then it sounds like
>>>>>>> LoongArch *requires*
>>>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>>>
>>>>>> Hi Sean,
>>>>>>
>>>>>> Thank for your quick response.
>>>>>>
>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>>>>> or all to access to the PMU. Only that if one pmu event is granted
>>>>>> to VM, host can not access this pmu event again. There must be pmu
>>>>>> event switch if host want to.
>>>>>
>>>>> PMU event is a software entity which won't be shared. did you mean if
>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>>>>> counter, right?
>>>> yes, if PMU HW counter/control is granted to VM. The value comes from
>>>> guest, and is not meaningful for host. Host pmu core does not know
>>>> that it is granted to VM, host still think that it owns pmu.
>>>
>>> That's one issue this patchset tries to solve. Current new mediated x86
>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource
>>> simultaneously. Only when there is no !exclude_guest event on host,
>>> guest is allowed to exclusively own the PMU HW resource.
>>>
>>>
>>>>
>>>> Just like FPU register, it is shared by VM and host during different
>>>> time and it is lately switched. But if IPI or timer interrupt uses FPU
>>>> register on host, there will be the same issue.
>>>
>>> I didn't fully get your point. When IPI or timer interrupt reach, a
>>> VM-exit is triggered to make CPU traps into host first and then the host
>> yes, it is.
>
> This is correct. And this is one of the points that we had debated
> internally whether we should do PMU context switch at vcpu loop
> boundary or VM Enter/exit boundary. (host-level) timer interrupt can
> force VM Exit, which I think happens every 4ms or 1ms, depending on
> configuration.
>
> One of the key reasons we currently propose this is because it is the
> same boundary as the legacy PMU, i.e., it would be simple to propose
> from the perf subsystem perspective.
>
> Performance wise, doing PMU context switch at vcpu boundary would be
> way better in general. But the downside is that perf sub-system lose
> the capability to profile majority of the KVM code (functions) when
> guest PMU is enabled.
>
>>
>>> interrupt handler is called. Or are you complaining the executing
>>> sequence of switching guest PMU MSRs and these interrupt handler?
>> In our vPMU implementation, it is ok if vPMU is switched in vm exit
>> path, however there is problem if vPMU is switched during vcpu thread
>> sched-out/sched-in path since IPI/timer irq interrupt access pmu
>> register in host mode.
>
> Oh, the IPI/timer irq handlers will access PMU registers? I thought
> only the host-level NMI handler would access the PMU MSRs, since the
> PMI is registered as an NMI.
>
> In that case, you should disable IRQs during the vcpu context switch. For
> NMIs, we prevent the handler from accessing the PMU registers. In
> particular, we use a per-cpu variable to guard that, so the host-level
> PMI handler for the perf subsystem checks the variable before
> proceeding.

The perf core accesses the PMU HW in the tick timer/hrtimer/IPI function
call paths. For example, perf_event_task_tick() is called from the tick
timer, and there are event_function_call(event, __perf_event_xxx, &value)
call sites in kernel/events/core.c.

https://lore.kernel.org/lkml/[email protected]/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4
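
A simplified sketch of the concern (not the actual kernel code): the
scheduler tick runs in host IRQ context and may reprogram counters for any
event it still believes is active:

/* Illustrative stand-ins for perf-core types; not kernel source. */
enum evt_state { EVT_INACTIVE, EVT_ACTIVE };
struct evt { enum evt_state state; };

static void reprogram_counter(struct evt *e)
{
	/* would write PMU control/counter MSRs on this CPU */
}

/* Invoked from the periodic scheduler tick, i.e. in host IRQ context. */
static void tick_handler(struct evt *e)
{
	if (e->state == EVT_ACTIVE)
		reprogram_counter(e);	/* touches PMU HW behind KVM's back */
}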


>
>>
>> In general it will be better if the switch is done in vcpu thread
>> sched-out/sched-in, else there is requirement to profile kvm
>> hypervisor.Even there is such requirement, it is only one option. In
>> most conditions, it will better if time of VM context exit is small.
>>
> Performance wise, agree, but there will be debate on perf
> functionality loss at the host level.
>
> Maybe, (just maybe), it is possible to do PMU context switch at vcpu
> boundary normally, but doing it at VM Enter/Exit boundary when host is
> profiling KVM kernel module. So, dynamically adjusting PMU context
> switch location could be an option.
>
>>>
>>>
>>>>
>>>> Regards
>>>> Bibo Mao
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>> Can we add callback handler in structure kvm_guest_cbs? just like
>>>>>>>> this:
>>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>>>> kvm_guest_cbs
>>>>>>>> = {
>>>>>>>> .state = kvm_guest_state,
>>>>>>>> .get_ip = kvm_guest_get_ip,
>>>>>>>> .handle_intel_pt_intr = NULL,
>>>>>>>> + .lose_pmu = kvm_guest_lose_pmu,
>>>>>>>> };
>>>>>>>>
>>>>>>>> By the way, I do not know should the callback handler be triggered
>>>>>>>> in perf
>>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>>>> triggered in
>>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>>>> but I think it will be better if it is done in perf core.
>>>>>>>
>>>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>>>> "fighting" over
>>>>>>> the PMU. That's effectively what we have today, and it's a mess
>>>>>>> for KVM because
>>>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>>>> the guest. And
>>>>>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>>>>>> flows that
>>>>>>> exists purely to try to play nice with KVM.
>>>>>> With existing pmu core code, in tick timer interrupt or IPI function
>>>>>> call interrupt pmu hw may be accessed by host when VM is running and
>>>>>> pmu is already granted to guest. KVM can not intercept host
>>>>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>>>>> problem.
>>>>>>
>>>>>> Regards
>>>>>> Bibo Mao
>>>>>>
>>>>
>>


2024-04-23 07:17:40

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/23/2024 10:53 AM, maobibo wrote:
>
>
> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>
>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>
>>>
>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>> <[email protected]> wrote:
>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>> the roles and
>>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>>> leads to suboptimal
>>>>>>>> and hard to maintain code.
>>>>>>>>
>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>> _would_ leak guest
>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>> the PMCs might not
>>>>>>>> be dirty from perf's perspective (see
>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>
>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>> doing so makes the
>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>>> to clear PMCs,
>>>>>>>> or if KVM is just being paranoid.
>>>>>>>
>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>> PMU HW.
>>>>>>
>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>> discussion
>>>>>> that we won't win.  Nor do I think it's one we *need* to make. 
>>>>>> KVM doesn't need
>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>> hardware, KVM
>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>> virtualizing a guest PMU.
>>>>>>
>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>> one via
>>>>>>> future discussion.
>>>>>>
>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>> before this code
>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>>> still pave over
>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>> cpu_hw_events.dirty to lazily
>>>>>> do the clearing.  But perf and KVM need to work together from the
>>>>>> get go, ie. I
>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>> and vice versa.
>>>>>>
>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>>> hardware
>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>> there are
>>>>> other places where perf core will access pmu hw, such as tick
>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>> context switch.
>>>>
>>>> Two questions:
>>>>
>>>>   1) Can KVM prevent the guest from accessing the PMU?
>>>>
>>>>   2) If so, KVM can grant partial access to the PMU, or is it all
>>>> or nothing?
>>>>
>>>> If the answer to both questions is "yes", then it sounds like
>>>> LoongArch *requires*
>>>> mediated/passthrough support in order to virtualize its PMU.
>>>
>>> Hi Sean,
>>>
>>> Thank for your quick response.
>>>
>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>> or all to access to the PMU. Only that if one pmu event is granted
>>> to VM, host can not access this pmu event again. There must be pmu
>>> event switch if host want to.
>>
>> PMU event is a software entity which won't be shared. did you mean if
>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>> counter, right?
> yes, if a PMU HW counter/control is granted to the VM. The value comes
> from the guest and is not meaningful for the host. The host pmu core does
> not know that it is granted to the VM; the host still thinks that it owns
> the pmu.

That's one issue this patchset tries to solve. The new mediated x86 vPMU
framework doesn't allow the host and guest to own the PMU HW resource
simultaneously. Only when there is no !exclude_guest event on the host is
the guest allowed to exclusively own the PMU HW resource.
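
A hedged sketch of that gating rule (perf_event_attr.exclude_guest is the
real perf attribute; the helper itself is made up for illustration):

#include <linux/perf_event.h>

/* Guest may take the PMU only if every host event yields in guest mode. */
static bool pmu_available_for_guest(struct perf_event **host_events, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!host_events[i]->attr.exclude_guest)
			return false;	/* a !exclude_guest event blocks it */
	}
	return true;
}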


>
> Just like the FPU registers, the PMU is shared by the VM and host at
> different times and is lazily switched. But if an IPI or timer interrupt
> uses the FPU registers on the host, there will be the same issue.

I didn't fully get your point. When an IPI or timer interrupt arrives, a
VM-exit is triggered so that the CPU traps into the host first and then
the host interrupt handler is called. Or are you concerned about the
ordering between switching the guest PMU MSRs and these interrupt handlers?


>
> Regards
> Bibo Mao
>>
>>
>>>
>>>>
>>>>> Can we add callback handler in structure kvm_guest_cbs?  just like
>>>>> this:
>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>> kvm_guest_cbs
>>>>> = {
>>>>>          .state                  = kvm_guest_state,
>>>>>          .get_ip                 = kvm_guest_get_ip,
>>>>>          .handle_intel_pt_intr   = NULL,
>>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>>   };
>>>>>
>>>>> By the way, I do not know should the callback handler be triggered
>>>>> in perf
>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>> triggered in
>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>> but I think it will be better if it is done in perf core.
>>>>
>>>> I don't think we want to take the approach of perf and KVM guests
>>>> "fighting" over
>>>> the PMU.  That's effectively what we have today, and it's a mess
>>>> for KVM because
>>>> it's impossible to provide consistent, deterministic behavior for
>>>> the guest.  And
>>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>>> flows that
>>>> exists purely to try to play nice with KVM.
>>> With existing pmu core code, in tick timer interrupt or IPI function
>>> call interrupt pmu hw may be accessed by host when VM is running and
>>> pmu is already granted to guest. KVM can not intercept host
>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>> problem.
>>>
>>> Regards
>>> Bibo Mao
>>>
>

2024-04-23 08:47:01

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/23/2024 11:26 AM, maobibo wrote:
>
>
> On 2024/4/23 11:13 AM, Mi, Dapeng wrote:
>>
>> On 4/23/2024 10:53 AM, maobibo wrote:
>>>
>>>
>>> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>>>
>>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>>
>>>>>
>>>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>>>> the roles and
>>>>>>>>>> responsibilities between KVM and perf are poorly defined,
>>>>>>>>>> which leads to suboptimal
>>>>>>>>>> and hard to maintain code.
>>>>>>>>>>
>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>>> _would_ leak guest
>>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>>> the PMCs might not
>>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>>
>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>>> doing so makes the
>>>>>>>>>> overall code brittle because it's not clear whether KVM
>>>>>>>>>> _needs_ to clear PMCs,
>>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>>
>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>>> PMU HW.
>>>>>>>>
>>>>>>>> I don't think this is a statement we want to make, as it opens
>>>>>>>> a discussion
>>>>>>>> that we won't win.  Nor do I think it's one we *need* to make.
>>>>>>>> KVM doesn't need
>>>>>>>> to be on equal footing with perf in terms of owning/managing
>>>>>>>> PMU hardware, KVM
>>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>>> virtualizing a guest PMU.
>>>>>>>>
>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>>> implementation, until both clients agree to a "deal" between
>>>>>>>>> them.
>>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>>> one via
>>>>>>>>> future discussion.
>>>>>>>>
>>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>>> before this code
>>>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>>>>> still pave over
>>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>>> do the clearing.  But perf and KVM need to work together from
>>>>>>>> the get go, ie. I
>>>>>>>> don't want KVM doing something without regard to what perf
>>>>>>>> does, and vice versa.
>>>>>>>>
>>>>>>> There is similar issue on LoongArch vPMU where vm can directly
>>>>>>> pmu hardware
>>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>>> there are
>>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>>> context switch.
>>>>>>
>>>>>> Two questions:
>>>>>>
>>>>>>   1) Can KVM prevent the guest from accessing the PMU?
>>>>>>
>>>>>>   2) If so, KVM can grant partial access to the PMU, or is it all
>>>>>> or nothing?
>>>>>>
>>>>>> If the answer to both questions is "yes", then it sounds like
>>>>>> LoongArch *requires*
>>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>>
>>>>> Hi Sean,
>>>>>
>>>>> Thank for your quick response.
>>>>>
>>>>> yes, kvm can prevent guest from accessing the PMU and grant
>>>>> partial or all to access to the PMU. Only that if one pmu event is
>>>>> granted to VM, host can not access this pmu event again. There
>>>>> must be pmu event switch if host want to.
>>>>
>>>> PMU event is a software entity which won't be shared. did you mean
>>>> if a PMU HW counter is granted to VM, then Host can't access the
>>>> PMU HW counter, right?
>>> yes, if PMU HW counter/control is granted to VM. The value comes
>>> from guest, and is not meaningful for host.  Host pmu core does not
>>> know that it is granted to VM, host still think that it owns pmu.
>>
>> That's one issue this patchset tries to solve. Current new mediated
>> x86 vPMU framework doesn't allow Host or Guest own the PMU HW
>> resource simultaneously. Only when there is no !exclude_guest event
>> on host, guest is allowed to exclusively own the PMU HW resource.
>>
>>
>>>
>>> Just like FPU register, it is shared by VM and host during different
>>> time and it is lately switched. But if IPI or timer interrupt uses
>>> FPU register on host, there will be the same issue.
>>
>> I didn't fully get your point. When IPI or timer interrupt reach, a
>> VM-exit is triggered to make CPU traps into host first and then the
>> host interrupt handler is called. Or are you complaining the
>> executing sequence of switching guest PMU MSRs and these interrupt
>> handler?
> It is not necessary to save/restore the PMU HW at every vm exit; it had
> better be lazily saved/restored, such as only when the vcpu thread is
> sched-out/sched-in, else the cost will be a bit expensive.

I doubt this optimization, deferring the guest PMU state save/restore to
the vCPU task switching boundary, would really be landed into KVM, since
it would make the host lose the capability to profile KVM, and it seems
Sean objects to this.


>
> I know little about the perf core. However, there is PMU HW access in
> interrupt context. That means PMU HW access in normal context should be
> done with irqs disabled, else there may be nested PMU HW access. Is that
> true?

I had no idea that the timer irq handler would access PMU MSRs before.
Could you please point me to the code? I would like to look at it first.
Thanks.


>
>>
>>
>>>
>>> Regards
>>> Bibo Mao
>>>>
>>>>
>>>>>
>>>>>>
>>>>>>> Can we add callback handler in structure kvm_guest_cbs?  just
>>>>>>> like this:
>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>>> kvm_guest_cbs
>>>>>>> = {
>>>>>>>          .state                  = kvm_guest_state,
>>>>>>>          .get_ip                 = kvm_guest_get_ip,
>>>>>>>          .handle_intel_pt_intr   = NULL,
>>>>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>>>>   };
>>>>>>>
>>>>>>> By the way, I do not know should the callback handler be
>>>>>>> triggered in perf
>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>>> triggered in
>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>>> but I think it will be better if it is done in perf core.
>>>>>>
>>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>>> "fighting" over
>>>>>> the PMU.  That's effectively what we have today, and it's a mess
>>>>>> for KVM because
>>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>>> the guest.  And
>>>>>> it's just as messy for perf, which ends up having wierd,
>>>>>> cumbersome flows that
>>>>>> exists purely to try to play nice with KVM.
>>>>> With existing pmu core code, in tick timer interrupt or IPI
>>>>> function call interrupt pmu hw may be accessed by host when VM is
>>>>> running and pmu is already granted to guest. KVM can not intercept
>>>>> host IPI/timer interrupt, there is no pmu context switch, there
>>>>> will be problem.
>>>>>
>>>>> Regards
>>>>> Bibo Mao
>>>>>
>>>
>

2024-04-23 08:52:11

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 4:24 PM, Mi, Dapeng wrote:
>
> On 4/23/2024 3:10 PM, Mingwei Zhang wrote:
>> On Mon, Apr 22, 2024 at 11:45 PM Mi, Dapeng
>> <[email protected]> wrote:
>>>
>>> On 4/23/2024 2:08 PM, maobibo wrote:
>>>>
>>>> On 2024/4/23 12:23 PM, Mingwei Zhang wrote:
>>>>> On Mon, Apr 22, 2024 at 8:55 PM maobibo <[email protected]> wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/4/23 11:13 AM, Mi, Dapeng wrote:
>>>>>>> On 4/23/2024 10:53 AM, maobibo wrote:
>>>>>>>>
>>>>>>>> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>>>>>>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>>>>>>>
>>>>>>>>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>>>>>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>>>>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>>>>>>>>> the roles and
>>>>>>>>>>>>>>> responsibilities between KVM and perf are poorly defined,
>>>>>>>>>>>>>>> which
>>>>>>>>>>>>>>> leads to suboptimal
>>>>>>>>>>>>>>> and hard to maintain code.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>>>>>>>> _would_ leak guest
>>>>>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>>>>>>>> the PMCs might not
>>>>>>>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>>>>>>>> doing so makes the
>>>>>>>>>>>>>>> overall code brittle because it's not clear whether KVM
>>>>>>>>>>>>>>> _needs_
>>>>>>>>>>>>>>> to clear PMCs,
>>>>>>>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>>>>>>>> PMU HW.
>>>>>>>>>>>>> I don't think this is a statement we want to make, as it
>>>>>>>>>>>>> opens a
>>>>>>>>>>>>> discussion
>>>>>>>>>>>>> that we won't win.  Nor do I think it's one we *need* to make.
>>>>>>>>>>>>> KVM doesn't need
>>>>>>>>>>>>> to be on equal footing with perf in terms of
>>>>>>>>>>>>> owning/managing PMU
>>>>>>>>>>>>> hardware, KVM
>>>>>>>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>>>>>>>> virtualizing a guest PMU.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>>>>>>>> implementation, until both clients agree to a "deal" between
>>>>>>>>>>>>>> them.
>>>>>>>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>>>>>>>> one via
>>>>>>>>>>>>>> future discussion.
>>>>>>>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>>>>>>>> before this code
>>>>>>>>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf
>>>>>>>>>>>>> can
>>>>>>>>>>>>> still pave over
>>>>>>>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>>>>>>>> do the clearing.  But perf and KVM need to work together from
>>>>>>>>>>>>> the
>>>>>>>>>>>>> get go, ie. I
>>>>>>>>>>>>> don't want KVM doing something without regard to what perf
>>>>>>>>>>>>> does,
>>>>>>>>>>>>> and vice versa.
>>>>>>>>>>>>>
>>>>>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly
>>>>>>>>>>>> pmu
>>>>>>>>>>>> hardware
>>>>>>>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>>>>>>>> there are
>>>>>>>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>>>>>>>> context switch.
>>>>>>>>>>> Two questions:
>>>>>>>>>>>
>>>>>>>>>>>     1) Can KVM prevent the guest from accessing the PMU?
>>>>>>>>>>>
>>>>>>>>>>>     2) If so, KVM can grant partial access to the PMU, or is
>>>>>>>>>>> it all
>>>>>>>>>>> or nothing?
>>>>>>>>>>>
>>>>>>>>>>> If the answer to both questions is "yes", then it sounds like
>>>>>>>>>>> LoongArch *requires*
>>>>>>>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>>>>>>> Hi Sean,
>>>>>>>>>>
>>>>>>>>>> Thank for your quick response.
>>>>>>>>>>
>>>>>>>>>> yes, kvm can prevent guest from accessing the PMU and grant
>>>>>>>>>> partial
>>>>>>>>>> or all to access to the PMU. Only that if one pmu event is
>>>>>>>>>> granted
>>>>>>>>>> to VM, host can not access this pmu event again. There must be
>>>>>>>>>> pmu
>>>>>>>>>> event switch if host want to.
>>>>>>>>> PMU event is a software entity which won't be shared. did you
>>>>>>>>> mean if
>>>>>>>>> a PMU HW counter is granted to VM, then Host can't access the
>>>>>>>>> PMU HW
>>>>>>>>> counter, right?
>>>>>>>> yes, if PMU HW counter/control is granted to VM. The value comes
>>>>>>>> from
>>>>>>>> guest, and is not meaningful for host.  Host pmu core does not know
>>>>>>>> that it is granted to VM, host still think that it owns pmu.
>>>>>>> That's one issue this patchset tries to solve. Current new mediated
>>>>>>> x86
>>>>>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource
>>>>>>> simultaneously. Only when there is no !exclude_guest event on host,
>>>>>>> guest is allowed to exclusively own the PMU HW resource.
>>>>>>>
>>>>>>>
>>>>>>>> Just like FPU register, it is shared by VM and host during
>>>>>>>> different
>>>>>>>> time and it is lately switched. But if IPI or timer interrupt uses
>>>>>>>> FPU
>>>>>>>> register on host, there will be the same issue.
>>>>>>> I didn't fully get your point. When IPI or timer interrupt reach, a
>>>>>>> VM-exit is triggered to make CPU traps into host first and then the
>>>>>>> host
>>>>>> yes, it is.
>>>>> This is correct. And this is one of the points that we had debated
>>>>> internally whether we should do PMU context switch at vcpu loop
>>>>> boundary or VM Enter/exit boundary. (host-level) timer interrupt can
>>>>> force VM Exit, which I think happens every 4ms or 1ms, depending on
>>>>> configuration.
>>>>>
>>>>> One of the key reasons we currently propose this is because it is the
>>>>> same boundary as the legacy PMU, i.e., it would be simple to propose
>>>>> from the perf subsystem perspective.
>>>>>
>>>>> Performance wise, doing PMU context switch at vcpu boundary would be
>>>>> way better in general. But the downside is that perf sub-system lose
>>>>> the capability to profile majority of the KVM code (functions) when
>>>>> guest PMU is enabled.
>>>>>
>>>>>>> interrupt handler is called. Or are you complaining the executing
>>>>>>> sequence of switching guest PMU MSRs and these interrupt handler?
>>>>>> In our vPMU implementation, it is ok if vPMU is switched in vm exit
>>>>>> path, however there is problem if vPMU is switched during vcpu thread
>>>>>> sched-out/sched-in path since IPI/timer irq interrupt access pmu
>>>>>> register in host mode.
>>>>> Oh, the IPI/timer irq handler will access PMU registers? I thought
>>>>> only the host-level NMI handler will access the PMU MSRs since PMI is
>>>>> registered under NMI.
>>>>>
>>>>> In that case, you should disable  IRQ during vcpu context switch. For
>>>>> NMI, we prevent its handler from accessing the PMU registers. In
>>>>> particular, we use a per-cpu variable to guard that. So, the
>>>>> host-level PMI handler for perf sub-system will check the variable
>>>>> before proceeding.
>>>> perf core will access pmu hw in tick timer/hrtimer/ipi function call,
>>>> such as function perf_event_task_tick() is called in tick timer, there
>>>> are  event_function_call(event, __perf_event_xxx, &value) in file
>>>> kernel/events/core.c.
>>>>
>>>> https://lore.kernel.org/lkml/[email protected]/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4
>>>>
>>>>
>>> I just went through the functions (not sure if all): whether it is
>>> perf_event_task_tick() or the callbacks of event_function_call(), they
>>> would check the event->state first; if the event is in
>>> PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not really be touched.
>>> In this new proposal, all host events with the exclude_guest attribute
>>> would be put into the PERF_EVENT_STATE_INACTIVE state if the guest owns
>>> the PMU HW resource. So I think it's fine.
>>>
>> Is there any event in the host still having PERF_EVENT_STATE_ACTIVE?
>> If so, hmm, it will reach perf_pmu_disable(event->pmu), which will
>> access the global ctrl MSR.
>
> I don't think there is any event in the PERF_EVENT_STATE_ACTIVE state on
> the host when the guest owns the PMU HW resource.
>
> In the current solution, VM creation would fail if there is any
> system-wide event without the exclude_guest attribute. If the VM is
> created successfully, then when vm-entry happens the helper
> perf_guest_enter() would put all host events with the exclude_guest
> attribute into the PERF_EVENT_STATE_INACTIVE state and block the host
> from creating system-wide events without the exclude_guest attribute.
I do not know the perf subsystem well. Can the perf event state be kept
unchanged? After VM entry, the hw perf counters are allocated to the VM,
and the HW counter function for the host should already be stopped. It
seems that the host perf core need not perceive VM enter/exit.
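
For reference, a hedged sketch of the perf_guest_enter() behavior described
in the quote above (perf_guest_enter() is named in the discussion; the body
here is illustrative and simplified, not the real implementation):

#include <linux/perf_event.h>

/* On vm-entry: park host exclude_guest events so the guest owns the PMU. */
static void perf_guest_enter_sketch(struct perf_event **host_events, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (host_events[i]->attr.exclude_guest)
			host_events[i]->state = PERF_EVENT_STATE_INACTIVE;
	}
	/*
	 * From here on the guest exclusively owns the PMU HW; creating
	 * system-wide !exclude_guest events is blocked until vm-exit.
	 */
}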


2024-04-23 12:12:42

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/23 12:23 PM, Mingwei Zhang wrote:
> On Mon, Apr 22, 2024 at 8:55 PM maobibo <[email protected]> wrote:
>>
>>
>>
>> On 2024/4/23 11:13 AM, Mi, Dapeng wrote:
>>>
>>> On 4/23/2024 10:53 AM, maobibo wrote:
>>>>
>>>>
>>>> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>>>>
>>>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>>>>> the roles and
>>>>>>>>>>> responsibilities between KVM and perf are poorly defined, which
>>>>>>>>>>> leads to suboptimal
>>>>>>>>>>> and hard to maintain code.
>>>>>>>>>>>
>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>>>> _would_ leak guest
>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>>>> the PMCs might not
>>>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>>>
>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>>>> doing so makes the
>>>>>>>>>>> overall code brittle because it's not clear whether KVM _needs_
>>>>>>>>>>> to clear PMCs,
>>>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>>>
>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>>>> PMU HW.
>>>>>>>>>
>>>>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>>>>> discussion
>>>>>>>>> that we won't win. Nor do I think it's one we *need* to make.
>>>>>>>>> KVM doesn't need
>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>>>>> hardware, KVM
>>>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>>>> virtualizing a guest PMU.
>>>>>>>>>
>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>>>> implementation, until both clients agree to a "deal" between them.
>>>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>>>> one via
>>>>>>>>>> future discussion.
>>>>>>>>>
>>>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>>>> before this code
>>>>>>>>> is merged. It doesn't need to be anything fancy, e.g. perf can
>>>>>>>>> still pave over
>>>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>>>> do the clearing. But perf and KVM need to work together from the
>>>>>>>>> get go, ie. I
>>>>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>>>>> and vice versa.
>>>>>>>>>
>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly pmu
>>>>>>>> hardware
>>>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>>>> there are
>>>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>>>> context switch.
>>>>>>>
>>>>>>> Two questions:
>>>>>>>
>>>>>>> 1) Can KVM prevent the guest from accessing the PMU?
>>>>>>>
>>>>>>> 2) If so, KVM can grant partial access to the PMU, or is it all
>>>>>>> or nothing?
>>>>>>>
>>>>>>> If the answer to both questions is "yes", then it sounds like
>>>>>>> LoongArch *requires*
>>>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>>>
>>>>>> Hi Sean,
>>>>>>
>>>>>> Thank for your quick response.
>>>>>>
>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>>>>> or all to access to the PMU. Only that if one pmu event is granted
>>>>>> to VM, host can not access this pmu event again. There must be pmu
>>>>>> event switch if host want to.
>>>>>
>>>>> PMU event is a software entity which won't be shared. did you mean if
>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>>>>> counter, right?
>>>> yes, if PMU HW counter/control is granted to VM. The value comes from
>>>> guest, and is not meaningful for host. Host pmu core does not know
>>>> that it is granted to VM, host still think that it owns pmu.
>>>
>>> That's one issue this patchset tries to solve. Current new mediated x86
>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource
>>> simultaneously. Only when there is no !exclude_guest event on host,
>>> guest is allowed to exclusively own the PMU HW resource.
>>>
>>>
>>>>
>>>> Just like FPU register, it is shared by VM and host during different
>>>> time and it is lately switched. But if IPI or timer interrupt uses FPU
>>>> register on host, there will be the same issue.
>>>
>>> I didn't fully get your point. When IPI or timer interrupt reach, a
>>> VM-exit is triggered to make CPU traps into host first and then the host
>> yes, it is.
>
> This is correct. And this is one of the points that we had debated
> internally whether we should do PMU context switch at vcpu loop
> boundary or VM Enter/exit boundary. (host-level) timer interrupt can
> force VM Exit, which I think happens every 4ms or 1ms, depending on
> configuration.
>
> One of the key reasons we currently propose this is because it is the
> same boundary as the legacy PMU, i.e., it would be simple to propose
> from the perf subsystem perspective.
>
> Performance wise, doing PMU context switch at vcpu boundary would be
> way better in general. But the downside is that perf sub-system lose
> the capability to profile majority of the KVM code (functions) when
> guest PMU is enabled.
>
>>
>>> interrupt handler is called. Or are you complaining the executing
>>> sequence of switching guest PMU MSRs and these interrupt handler?
>> In our vPMU implementation, it is ok if vPMU is switched in vm exit
>> path, however there is problem if vPMU is switched during vcpu thread
>> sched-out/sched-in path since IPI/timer irq interrupt access pmu
>> register in host mode.
>
> Oh, the IPI/timer irq handler will access PMU registers? I thought
> only the host-level NMI handler will access the PMU MSRs since PMI is
> registered under NMI.
>
> In that case, you should disable IRQ during vcpu context switch. For
> NMI, we prevent its handler from accessing the PMU registers. In
> particular, we use a per-cpu variable to guard that. So, the
> host-level PMI handler for perf sub-system will check the variable
> before proceeding.
>
>>
>> In general it will be better if the switch is done in vcpu thread
>> sched-out/sched-in, else there is requirement to profile kvm
>> hypervisor.Even there is such requirement, it is only one option. In
>> most conditions, it will better if time of VM context exit is small.
>>
> Performance-wise, agreed, but there will be debate about the loss of perf
> functionality at the host level.
>
> Maybe (just maybe) it is possible to do the PMU context switch at the vcpu
> boundary normally, but do it at the VM Enter/Exit boundary when the host
> is profiling the KVM kernel module. So, dynamically adjusting the PMU
> context switch location could be an option.
If there are two VMs that both have the pmu enabled, but the host PMU is
not in use, the PMU context switch should be done in the vcpu thread
sched-out path.

If the host pmu is also used, we can choose whether the PMU switch should
be done in the vm exit path or in the vcpu thread sched-out path.

>
>>>
>>>
>>>>
>>>> Regards
>>>> Bibo Mao
>>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>> Can we add callback handler in structure kvm_guest_cbs? just like
>>>>>>>> this:
>>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>>>> kvm_guest_cbs
>>>>>>>> = {
>>>>>>>> .state = kvm_guest_state,
>>>>>>>> .get_ip = kvm_guest_get_ip,
>>>>>>>> .handle_intel_pt_intr = NULL,
>>>>>>>> + .lose_pmu = kvm_guest_lose_pmu,
>>>>>>>> };
>>>>>>>>
>>>>>>>> By the way, I do not know should the callback handler be triggered
>>>>>>>> in perf
>>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>>>> triggered in
>>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>>>> but I think it will be better if it is done in perf core.
>>>>>>>
>>>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>>>> "fighting" over
>>>>>>> the PMU. That's effectively what we have today, and it's a mess
>>>>>>> for KVM because
>>>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>>>> the guest. And
>>>>>>> it's just as messy for perf, which ends up having wierd, cumbersome
>>>>>>> flows that
>>>>>>> exists purely to try to play nice with KVM.
>>>>>> With existing pmu core code, in tick timer interrupt or IPI function
>>>>>> call interrupt pmu hw may be accessed by host when VM is running and
>>>>>> pmu is already granted to guest. KVM can not intercept host
>>>>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>>>>> problem.
>>>>>>
>>>>>> Regards
>>>>>> Bibo Mao
>>>>>>
>>>>
>>


2024-04-23 12:42:17

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/23/2024 2:08 PM, maobibo wrote:
>
>
> On 2024/4/23 12:23 PM, Mingwei Zhang wrote:
>> On Mon, Apr 22, 2024 at 8:55 PM maobibo <[email protected]> wrote:
>>>
>>>
>>>
>>> On 2024/4/23 11:13 AM, Mi, Dapeng wrote:
>>>>
>>>> On 4/23/2024 10:53 AM, maobibo wrote:
>>>>>
>>>>>
>>>>> On 2024/4/23 10:44 AM, Mi, Dapeng wrote:
>>>>>>
>>>>>> On 4/23/2024 9:01 AM, maobibo wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2024/4/23 1:01 AM, Sean Christopherson wrote:
>>>>>>>> On Mon, Apr 22, 2024, maobibo wrote:
>>>>>>>>> On 2024/4/16 6:45 AM, Sean Christopherson wrote:
>>>>>>>>>> On Mon, Apr 15, 2024, Mingwei Zhang wrote:
>>>>>>>>>>> On Mon, Apr 15, 2024 at 10:38 AM Sean Christopherson
>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>> One my biggest complaints with the current vPMU code is that
>>>>>>>>>>>> the roles and
>>>>>>>>>>>> responsibilities between KVM and perf are poorly defined,
>>>>>>>>>>>> which
>>>>>>>>>>>> leads to suboptimal
>>>>>>>>>>>> and hard to maintain code.
>>>>>>>>>>>>
>>>>>>>>>>>> Case in point, I'm pretty sure leaving guest values in PMCs
>>>>>>>>>>>> _would_ leak guest
>>>>>>>>>>>> state to userspace processes that have RDPMC permissions, as
>>>>>>>>>>>> the PMCs might not
>>>>>>>>>>>> be dirty from perf's perspective (see
>>>>>>>>>>>> perf_clear_dirty_counters()).
>>>>>>>>>>>>
>>>>>>>>>>>> Blindly clearing PMCs in KVM "solves" that problem, but in
>>>>>>>>>>>> doing so makes the
>>>>>>>>>>>> overall code brittle because it's not clear whether KVM
>>>>>>>>>>>> _needs_
>>>>>>>>>>>> to clear PMCs,
>>>>>>>>>>>> or if KVM is just being paranoid.
>>>>>>>>>>>
>>>>>>>>>>> So once this rolls out, perf and vPMU are clients directly to
>>>>>>>>>>> PMU HW.
>>>>>>>>>>
>>>>>>>>>> I don't think this is a statement we want to make, as it opens a
>>>>>>>>>> discussion
>>>>>>>>>> that we won't win.  Nor do I think it's one we *need* to make.
>>>>>>>>>> KVM doesn't need
>>>>>>>>>> to be on equal footing with perf in terms of owning/managing PMU
>>>>>>>>>> hardware, KVM
>>>>>>>>>> just needs a few APIs to allow faithfully and accurately
>>>>>>>>>> virtualizing a guest PMU.
>>>>>>>>>>
>>>>>>>>>>> Faithful cleaning (blind cleaning) has to be the baseline
>>>>>>>>>>> implementation, until both clients agree to a "deal" between
>>>>>>>>>>> them.
>>>>>>>>>>> Currently, there is no such deal, but I believe we could have
>>>>>>>>>>> one via
>>>>>>>>>>> future discussion.
>>>>>>>>>>
>>>>>>>>>> What I am saying is that there needs to be a "deal" in place
>>>>>>>>>> before this code
>>>>>>>>>> is merged.  It doesn't need to be anything fancy, e.g. perf can
>>>>>>>>>> still pave over
>>>>>>>>>> PMCs it doesn't immediately load, as opposed to using
>>>>>>>>>> cpu_hw_events.dirty to lazily
>>>>>>>>>> do the clearing.  But perf and KVM need to work together from
>>>>>>>>>> the
>>>>>>>>>> get go, ie. I
>>>>>>>>>> don't want KVM doing something without regard to what perf does,
>>>>>>>>>> and vice versa.
>>>>>>>>>>
>>>>>>>>> There is similar issue on LoongArch vPMU where vm can directly
>>>>>>>>> pmu
>>>>>>>>> hardware
>>>>>>>>> and pmu hw is shard with guest and host. Besides context switch
>>>>>>>>> there are
>>>>>>>>> other places where perf core will access pmu hw, such as tick
>>>>>>>>> timer/hrtimer/ipi function call, and KVM can only intercept
>>>>>>>>> context switch.
>>>>>>>>
>>>>>>>> Two questions:
>>>>>>>>
>>>>>>>>    1) Can KVM prevent the guest from accessing the PMU?
>>>>>>>>
>>>>>>>>    2) If so, KVM can grant partial access to the PMU, or is it all
>>>>>>>> or nothing?
>>>>>>>>
>>>>>>>> If the answer to both questions is "yes", then it sounds like
>>>>>>>> LoongArch *requires*
>>>>>>>> mediated/passthrough support in order to virtualize its PMU.
>>>>>>>
>>>>>>> Hi Sean,
>>>>>>>
>>>>>>> Thank for your quick response.
>>>>>>>
>>>>>>> yes, kvm can prevent guest from accessing the PMU and grant partial
>>>>>>> or all to access to the PMU. Only that if one pmu event is granted
>>>>>>> to VM, host can not access this pmu event again. There must be pmu
>>>>>>> event switch if host want to.
>>>>>>
>>>>>> PMU event is a software entity which won't be shared. did you
>>>>>> mean if
>>>>>> a PMU HW counter is granted to VM, then Host can't access the PMU HW
>>>>>> counter, right?
>>>>> yes, if PMU HW counter/control is granted to VM. The value comes from
>>>>> guest, and is not meaningful for host.  Host pmu core does not know
>>>>> that it is granted to VM, host still think that it owns pmu.
>>>>
>>>> That's one issue this patchset tries to solve. Current new mediated
>>>> x86
>>>> vPMU framework doesn't allow Host or Guest own the PMU HW resource
>>>> simultaneously. Only when there is no !exclude_guest event on host,
>>>> guest is allowed to exclusively own the PMU HW resource.
>>>>
>>>>
>>>>>
>>>>> Just like FPU register, it is shared by VM and host during different
>>>>> time and it is lately switched. But if IPI or timer interrupt uses
>>>>> FPU
>>>>> register on host, there will be the same issue.
>>>>
>>>> I didn't fully get your point. When IPI or timer interrupt reach, a
>>>> VM-exit is triggered to make CPU traps into host first and then the
>>>> host
>>> yes, it is.
>>
>> This is correct. And this is one of the points that we had debated
>> internally whether we should do PMU context switch at vcpu loop
>> boundary or VM Enter/exit boundary. (host-level) timer interrupt can
>> force VM Exit, which I think happens every 4ms or 1ms, depending on
>> configuration.
>>
>> One of the key reasons we currently propose this is because it is the
>> same boundary as the legacy PMU, i.e., it would be simple to propose
>> from the perf subsystem perspective.
>>
>> Performance wise, doing PMU context switch at vcpu boundary would be
>> way better in general. But the downside is that perf sub-system lose
>> the capability to profile majority of the KVM code (functions) when
>> guest PMU is enabled.
>>
>>>
>>>> interrupt handler is called. Or are you complaining the executing
>>>> sequence of switching guest PMU MSRs and these interrupt handler?
>>> In our vPMU implementation, it is ok if vPMU is switched in vm exit
>>> path, however there is problem if vPMU is switched during vcpu thread
>>> sched-out/sched-in path since IPI/timer irq interrupt access pmu
>>> register in host mode.
>>
>> Oh, the IPI/timer irq handler will access PMU registers? I thought
>> only the host-level NMI handler will access the PMU MSRs since PMI is
>> registered under NMI.
>>
>> In that case, you should disable  IRQ during vcpu context switch. For
>> NMI, we prevent its handler from accessing the PMU registers. In
>> particular, we use a per-cpu variable to guard that. So, the
>> host-level PMI handler for perf sub-system will check the variable
>> before proceeding.
>
> perf core will access pmu hw in tick timer/hrtimer/ipi function call,
> such as function perf_event_task_tick() is called in tick timer, there
> are  event_function_call(event, __perf_event_xxx, &value) in file
> kernel/events/core.c.
>
> https://lore.kernel.org/lkml/[email protected]/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4
>

Going through the functions (not sure if I covered all of them), both
perf_event_task_tick() and the callbacks of event_function_call() check
event->state first; if the event is in PERF_EVENT_STATE_INACTIVE, the
PMU HW MSRs are not actually touched. In this new proposal, all host
events with the exclude_guest attribute are put into
PERF_EVENT_STATE_INACTIVE state while the guest owns the PMU HW
resource. So I think it's fine.
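
For illustration, a minimal sketch of that pattern (simplified; the real
logic in kernel/events/core.c is more involved):

/* Illustrative only: the cross-call callbacks bail out before
 * touching counter MSRs when the event is not actively scheduled. */
static void __perf_event_read_sketch(struct perf_event *event)
{
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;         /* INACTIVE: PMU HW is left alone */

        event->pmu->read(event);        /* only now are MSRs touched */
}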


>
>
>>
>>>
>>> In general it will be better if the switch is done in vcpu thread
>>> sched-out/sched-in, else there is requirement to profile kvm
>>> hypervisor.Even there is such requirement, it is only one option. In
>>> most conditions, it will better if time of VM context exit is small.
>>>
>> Performance wise, agree, but there will be debate on perf
>> functionality loss at the host level.
>>
>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu
>> boundary normally, but doing it at VM Enter/Exit boundary when host is
>> profiling KVM kernel module. So, dynamically adjusting PMU context
>> switch location could be an option.
>>
>>>>
>>>>
>>>>>
>>>>> Regards
>>>>> Bibo Mao
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>> Can we add callback handler in structure kvm_guest_cbs?  just
>>>>>>>>> like
>>>>>>>>> this:
>>>>>>>>> @@ -6403,6 +6403,7 @@ static struct perf_guest_info_callbacks
>>>>>>>>> kvm_guest_cbs
>>>>>>>>> = {
>>>>>>>>>           .state                  = kvm_guest_state,
>>>>>>>>>           .get_ip                 = kvm_guest_get_ip,
>>>>>>>>>           .handle_intel_pt_intr   = NULL,
>>>>>>>>> +       .lose_pmu               = kvm_guest_lose_pmu,
>>>>>>>>>    };
>>>>>>>>>
>>>>>>>>> By the way, I do not know should the callback handler be
>>>>>>>>> triggered
>>>>>>>>> in perf
>>>>>>>>> core or detailed pmu hw driver. From ARM pmu hw driver, it is
>>>>>>>>> triggered in
>>>>>>>>> pmu hw driver such as function kvm_vcpu_pmu_resync_el0,
>>>>>>>>> but I think it will be better if it is done in perf core.
>>>>>>>>
>>>>>>>> I don't think we want to take the approach of perf and KVM guests
>>>>>>>> "fighting" over
>>>>>>>> the PMU.  That's effectively what we have today, and it's a mess
>>>>>>>> for KVM because
>>>>>>>> it's impossible to provide consistent, deterministic behavior for
>>>>>>>> the guest.  And
>>>>>>>> it's just as messy for perf, which ends up having wierd,
>>>>>>>> cumbersome
>>>>>>>> flows that
>>>>>>>> exists purely to try to play nice with KVM.
>>>>>>> With existing pmu core code, in tick timer interrupt or IPI
>>>>>>> function
>>>>>>> call interrupt pmu hw may be accessed by host when VM is running
>>>>>>> and
>>>>>>> pmu is already granted to guest. KVM can not intercept host
>>>>>>> IPI/timer interrupt, there is no pmu context switch, there will be
>>>>>>> problem.
>>>>>>>
>>>>>>> Regards
>>>>>>> Bibo Mao
>>>>>>>
>>>>>
>>>
>
>

2024-04-23 15:38:26

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/23/2024 3:10 PM, Mingwei Zhang wrote:
> On Mon, Apr 22, 2024 at 11:45 PM Mi, Dapeng <[email protected]> wrote:
>>
>> On 4/23/2024 2:08 PM, maobibo wrote:
>>> [...]
>>> perf core will access pmu hw in tick timer/hrtimer/ipi function call,
>>> such as function perf_event_task_tick() is called in tick timer, there
>>> are event_function_call(event, __perf_event_xxx, &value) in file
>>> kernel/events/core.c.
>>>
>>> https://lore.kernel.org/lkml/[email protected]/T/#m15aeb79fdc9ce72dd5b374edd6acdcf7a9dafcf4
>>>
>> Just go through functions (not sure if all), whether
>> perf_event_task_tick() or the callbacks of event_function_call() would
>> check the event->state first, if the event is in
>> PERF_EVENT_STATE_INACTIVE, the PMU HW MSRs would not be touched really.
>> In this new proposal, all host events with exclude_guest attribute would
>> be put on PERF_EVENT_STATE_INACTIVE sate if guest own the PMU HW
>> resource. So I think it's fine.
>>
> Is there any event in the host still having PERF_EVENT_STATE_ACTIVE?
> If so, hmm, it will reach perf_pmu_disable(event->pmu), which will
> access the global ctrl MSR.

I don't think there is any event in PERF_EVENT_STATE_ACTIVE state on the
host when the guest owns the PMU HW resource.

In the current solution, VM creation fails if there is any system-wide
event without the exclude_guest attribute. If the VM is created
successfully, then at vm-entry the helper perf_guest_enter() puts all
host events with the exclude_guest attribute into
PERF_EVENT_STATE_INACTIVE state and blocks the host from creating
system-wide events without the exclude_guest attribute.
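
A rough sketch of that flow (all helper and variable names below are
hypothetical, purely to illustrate the two guarantees):

/* Hypothetical sketch of the proposed vm-entry gating. */
static int perf_guest_enter_sketch(void)
{
        /* Guarantee 1: VM creation already failed if a
         * !exclude_guest system-wide event existed. */
        if (atomic_read(&nr_include_guest_events))
                return -EBUSY;

        /* Guarantee 2: deschedule host exclude_guest events
         * (-> PERF_EVENT_STATE_INACTIVE) and block creation of
         * new !exclude_guest system-wide events. */
        sched_out_exclude_guest_events();
        guest_owns_pmu = true;
        return 0;
}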



2024-04-23 16:51:37

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

> > Is there any event in the host still having PERF_EVENT_STATE_ACTIVE?
> > If so, hmm, it will reach perf_pmu_disable(event->pmu), which will
> > access the global ctrl MSR.
>
> I don't think there is any event with PERF_EVENT_STATE_ACTIVE state on
> host when guest owns the PMU HW resource.
>
> In current solution, VM would fail to create if there is any system-wide
> event without exclude_guest attribute. If VM is created successfully and
> when vm-entry happens, the helper perf_guest_enter() would put all host
> events with exclude_guest attribute into PERF_EVENT_STATE_INACTIVE state
> and block host to create system-wide events without exclude_guest attribute.
>

Yeah, that's perfect.

Thanks.
-Mingwei

2024-04-23 17:20:16

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

> >
> > Maybe, (just maybe), it is possible to do PMU context switch at vcpu
> > boundary normally, but doing it at VM Enter/Exit boundary when host is
> > profiling KVM kernel module. So, dynamically adjusting PMU context
> > switch location could be an option.
> If there are two VMs with pmu enabled both, however host PMU is not
> enabled. PMU context switch should be done in vcpu thread sched-out path.
>
> If host pmu is used also, we can choose whether PMU switch should be
> done in vm exit path or vcpu thread sched-out path.
>

The host PMU is always enabled, i.e., Linux currently does not support
the KVM PMU running standalone. I guess what you mean is that there are
no active perf_events on the host side. Allowing the PMU context switch
to drift from the vm-enter/exit boundary to the vcpu loop boundary by
checking host-side events might be a good option. We can keep the
discussion going, but I won't propose that in v2.

I guess we are off topic. Sean's suggestion is that we should put
"perf" and "kvm" together while doing the context switch. I think this
is quite reasonable regardless of the PMU context switch location.

To execute this, I am thinking about adding a parameter or return
value to perf_guest_enter() so that once it returns back to KVM, KVM
gets to know which counters are active/inactive/cleared from the host
side. Knowing that, KVM can do the context switch more efficiently.
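
Something like the following, hypothetically (the actual signature is
still to be worked out):

/* Hypothetical: perf_guest_enter() reports which HW counters held
 * swapped-out host events, as a bitmap. */
u64 perf_guest_enter(void);

static void kvm_pmu_load_guest_context(struct kvm_vcpu *vcpu)
{
        /* Counters outside the mask were untouched by the host, so
         * KVM could skip saving/zeroing them. */
        vcpu->arch.pmu.host_occupied_counters = perf_guest_enter();
}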

Thanks.
-Mingwei

2024-04-24 01:07:20

by maobibo

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024/4/24 上午1:02, Mingwei Zhang wrote:
>>>
>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu
>>> boundary normally, but doing it at VM Enter/Exit boundary when host is
>>> profiling KVM kernel module. So, dynamically adjusting PMU context
>>> switch location could be an option.
>> If there are two VMs with pmu enabled both, however host PMU is not
>> enabled. PMU context switch should be done in vcpu thread sched-out path.
>>
>> If host pmu is used also, we can choose whether PMU switch should be
>> done in vm exit path or vcpu thread sched-out path.
>>
>
> host PMU is always enabled, ie., Linux currently does not support KVM
> PMU running standalone. I guess what you mean is there are no active
> perf_events on the host side. Allowing a PMU context switch drifting
> from vm-enter/exit boundary to vcpu loop boundary by checking host
> side events might be a good option. We can keep the discussion, but I
> won't propose that in v2.
>
> I guess we are off topic. Sean's suggestion is that we should put
> "perf" and "kvm" together while doing the context switch. I think this
> is quite reasonable regardless of the PMU context switch location.
>
> To execute this, I am thinking about adding a parameter or return
> value to perf_guest_enter() so that once it returns back to KVM, KVM
> gets to know which counters are active/inactive/cleared from the host
> side. Knowing that, KVM can do the context switch more efficiently.
yeap, that sounds great.

Regards
Bibo Mao

>
> Thanks.
> -Mingwei
>


2024-04-24 13:07:06

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/24/2024 1:02 AM, Mingwei Zhang wrote:
>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu
>>> boundary normally, but doing it at VM Enter/Exit boundary when host is
>>> profiling KVM kernel module. So, dynamically adjusting PMU context
>>> switch location could be an option.
>> If there are two VMs with pmu enabled both, however host PMU is not
>> enabled. PMU context switch should be done in vcpu thread sched-out path.
>>
>> If host pmu is used also, we can choose whether PMU switch should be
>> done in vm exit path or vcpu thread sched-out path.
>>
> host PMU is always enabled, ie., Linux currently does not support KVM
> PMU running standalone. I guess what you mean is there are no active
> perf_events on the host side. Allowing a PMU context switch drifting
> from vm-enter/exit boundary to vcpu loop boundary by checking host
> side events might be a good option. We can keep the discussion, but I
> won't propose that in v2.

I doubt whether this deferring is really doable. It still makes the host
lose most of its capability to profile KVM. Per my understanding, most
of the KVM overhead happens in the vcpu loop, more precisely in VM-exit
handling. We have no idea when the host wants to create a perf event to
profile KVM; it could be at any time.


>
> I guess we are off topic. Sean's suggestion is that we should put
> "perf" and "kvm" together while doing the context switch. I think this
> is quite reasonable regardless of the PMU context switch location.
>
> To execute this, I am thinking about adding a parameter or return
> value to perf_guest_enter() so that once it returns back to KVM, KVM
> gets to know which counters are active/inactive/cleared from the host
> side. Knowing that, KVM can do the context switch more efficiently.
>
> Thanks.
> -Mingwei
>

2024-04-24 15:33:33

by Sean Christopherson

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Wed, Apr 24, 2024, Dapeng Mi wrote:
>
> On 4/24/2024 1:02 AM, Mingwei Zhang wrote:
> > > > Maybe, (just maybe), it is possible to do PMU context switch at vcpu
> > > > boundary normally, but doing it at VM Enter/Exit boundary when host is
> > > > profiling KVM kernel module. So, dynamically adjusting PMU context
> > > > switch location could be an option.
> > > If there are two VMs with pmu enabled both, however host PMU is not
> > > enabled. PMU context switch should be done in vcpu thread sched-out path.
> > >
> > > If host pmu is used also, we can choose whether PMU switch should be
> > > done in vm exit path or vcpu thread sched-out path.
> > >
> > host PMU is always enabled, ie., Linux currently does not support KVM
> > PMU running standalone. I guess what you mean is there are no active
> > perf_events on the host side. Allowing a PMU context switch drifting
> > from vm-enter/exit boundary to vcpu loop boundary by checking host
> > side events might be a good option. We can keep the discussion, but I
> > won't propose that in v2.
>
> I suspect if it's really doable to do this deferring. This still makes host
> lose the most of capability to profile KVM. Per my understanding, most of
> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling.
> We have no idea when host want to create perf event to profile KVM, it could
> be at any time.

No, the idea is that KVM will load host PMU state asap, but only when host PMU
state actually needs to be loaded, i.e. only when there are relevant host events.

If there are no host perf events, KVM keeps guest PMU state loaded for the entire
KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf
event exists (or comes along), KVM context switches the PMU at VM-Enter/VM-Exit,
i.e. lets the host profile almost all of KVM, at the cost of a degraded experience
for the guest while host perf events are active.
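
In pseudo-form (helper names invented purely for illustration):

/* Sketch: pick the switch boundary based on host event presence. */
static void kvm_pmu_handle_vmexit(struct kvm_vcpu *vcpu)
{
        if (!perf_host_has_events())    /* hypothetical query */
                return; /* keep guest PMU loaded across the KVM_RUN loop */

        /* Host wants to profile: degrade to per-VM-Exit switching. */
        kvm_pmu_save_guest_context(vcpu);
        perf_guest_exit();
}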

My original sketch: https://lore.kernel.org/all/[email protected]

2024-04-25 11:02:58

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/24/2024 11:00 PM, Sean Christopherson wrote:
> On Wed, Apr 24, 2024, Dapeng Mi wrote:
>> On 4/24/2024 1:02 AM, Mingwei Zhang wrote:
>>>>> Maybe, (just maybe), it is possible to do PMU context switch at vcpu
>>>>> boundary normally, but doing it at VM Enter/Exit boundary when host is
>>>>> profiling KVM kernel module. So, dynamically adjusting PMU context
>>>>> switch location could be an option.
>>>> If there are two VMs with pmu enabled both, however host PMU is not
>>>> enabled. PMU context switch should be done in vcpu thread sched-out path.
>>>>
>>>> If host pmu is used also, we can choose whether PMU switch should be
>>>> done in vm exit path or vcpu thread sched-out path.
>>>>
>>> host PMU is always enabled, ie., Linux currently does not support KVM
>>> PMU running standalone. I guess what you mean is there are no active
>>> perf_events on the host side. Allowing a PMU context switch drifting
>>> from vm-enter/exit boundary to vcpu loop boundary by checking host
>>> side events might be a good option. We can keep the discussion, but I
>>> won't propose that in v2.
>> I suspect if it's really doable to do this deferring. This still makes host
>> lose the most of capability to profile KVM. Per my understanding, most of
>> KVM overhead happens in the vcpu loop, exactly speaking in VM-exit handling.
>> We have no idea when host want to create perf event to profile KVM, it could
>> be at any time.
> No, the idea is that KVM will load host PMU state asap, but only when host PMU
> state actually needs to be loaded, i.e. only when there are relevant host events.
>
> If there are no host perf events, KVM keeps guest PMU state loaded for the entire
> KVM_RUN loop, i.e. provides optimal behavior for the guest. But if a host perf
> events exists (or comes along), the KVM context switches PMU at VM-Enter/VM-Exit,
> i.e. lets the host profile almost all of KVM, at the cost of a degraded experience
> for the guest while host perf events are active.

I see. So KVM needs to provide a callback which is called in the IPI
handler. The KVM callback needs to be invoked to switch the PMU state
before perf actually enables the host event and touches the PMU MSRs.
And only perf events with the exclude_guest attribute are allowed to be
created on the host. Thanks.
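
That is, roughly (a sketch with an invented callback name; the real
__perf_event_enable() in kernel/events/core.c takes more arguments and
does much more than this):

static void __perf_event_enable_sketch(struct perf_event *event)
{
        /* Let KVM save the guest PMU state before any MSR write. */
        if (perf_guest_cbs && perf_guest_cbs->put_guest_pmu)
                perf_guest_cbs->put_guest_pmu();

        /* ... existing code then programs the event into the HW ... */
}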


>
> My original sketch: https://lore.kernel.org/all/[email protected]

2024-04-25 20:18:16

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>
>
>
> On 2024-04-25 12:24 a.m., Mingwei Zhang wrote:
> > On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <[email protected]> wrote:
> >>> [...]
> >>
> >> I see. So KVM needs to provide a callback which needs to be called in
> >> the IPI handler. The KVM callback needs to be called to switch PMU state
> >> before perf really enabling host event and touching PMU MSRs. And only
> >> the perf event with exclude_guest attribute is allowed to create on
> >> host. Thanks.
> >
> > Do we really need a KVM callback? I think that is one option.
> >
> > Immediately after VMEXIT, KVM will check whether there are "host perf
> > events". If so, do the PMU context switch immediately. Otherwise, keep
> > deferring the context switch to the end of vPMU loop.
> >
> > Detecting if there are "host perf events" would be interesting. The
> > "host perf events" refer to the perf_events on the host that are
> > active and assigned with HW counters and that are saved when context
> > switching to the guest PMU. I think getting those events could be done
> > by fetching the bitmaps in cpuc.
>
> The cpuc is an ARCH-specific structure. I don't think it can be fetched
> in the generic code. You probably have to implement ARCH-specific
> functions to fetch the bitmaps. It probably isn't worth it.
>
> You may check the pinned_groups and flexible_groups to understand if
> there are host perf events which may be scheduled at VM-exit. But that
> will not tell the idx of the counters, which can only be known when the
> host event is really scheduled.
>
> > I have to look into the details. But
> > at the time of VMEXIT, kvm should already have that information, so it
> > can immediately decide whether to do the PMU context switch or not.
> >
> > oh, but when the control is executing within the run loop, a
> > host-level profiling starts, say 'perf record -a ...', it will
> > generate an IPI to all CPUs. Maybe that's when we need a callback so
> > the KVM guest PMU context gets preempted for the host-level profiling.
> > Gah..
> >
> > hmm, not a fan of that. That means the host can poke the guest PMU
> > context at any time and cause higher overhead. But I admit it is much
> > better than the current approach.
> >
> > The only thing is that: any command like 'perf record/stat -a' shot in
> > dark corners of the host can preempt guest PMUs of _all_ running VMs.
> > So, to alleviate that, maybe a module parameter that disables this
> > "preemption" is possible? This should fit scenarios where we don't
> > want guest PMU to be preempted outside of the vCPU loop?
> >
>
> It should not happen. For the current implementation, perf rejects all
> the !exclude_guest system-wide event creation if a guest with the vPMU
> is running.
> However, it's possible to create an exclude_guest system-wide event at
> any time. KVM cannot use the information from the VM-entry to decide if
> there will be active perf events in the VM-exit.

Hmm, why not? If there is any exclude_guest system-wide event,
perf_guest_enter() can return something to tell KVM "hey, some active
host events are swapped out; they were originally in counters #2 and
#3". If so, at the time perf_guest_enter() returns, KVM will ack that
and keep it in its pmu data structure.

Now, when switching context back to the host right at VMEXIT, KVM will
check this data and see if the host perf context has something active
(of course, they are all exclude_guest events). If not, it defers the
context switch to the vcpu boundary. Otherwise, it does the proper PMU
context switch, respecting the occupied counter positions on the host
side, i.e., avoiding doubling the work on the KVM side.

Kan, any suggestion on the above approach? I totally understand that
there might be some difficulty, since the perf subsystem works in
several layers and fetching the low-level mapping is obviously
arch-specific work. If that is difficult, we can split the work into two
phases: 1) phase #1, just ask perf to tell KVM whether there are active
exclude_guest events swapped out; 2) phase #2, ask perf to tell KVM
their (low-level) counter indices.
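
Interface-wise, the two phases could be as simple as the following
hypothetical prototypes:

/* Phase #1: a boolean is enough. */
bool perf_guest_enter(void);    /* any active exclude_guest events? */

/* Phase #2: the same hook with a richer return value. */
u64 perf_guest_enter(void);     /* bitmap of swapped-out HW counters */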

Thanks.
-Mingwei

>
> The perf_guest_exit() will reload the host state. It's impossible to
> save the guest state after that. We may need a KVM callback. So perf can
> tell KVM whether to save the guest state before perf reloads the host state.
>
> Thanks,
> Kan
> >>
> >>
> >>>
> >>> My original sketch: https://lore.kernel.org/all/[email protected]
> >

2024-04-25 21:47:37

by Sean Christopherson

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 25, 2024, Kan Liang wrote:
> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
> > On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
> >> It should not happen. For the current implementation, perf rejects all
> >> the !exclude_guest system-wide event creation if a guest with the vPMU
> >> is running.
> >> However, it's possible to create an exclude_guest system-wide event at
> >> any time. KVM cannot use the information from the VM-entry to decide if
> >> there will be active perf events in the VM-exit.
> >
> > Hmm, why not? If there is any exclude_guest system-wide event,
> > perf_guest_enter() can return something to tell KVM "hey, some active
> > host events are swapped out. they are originally in counter #2 and
> > #3". If so, at the time when perf_guest_enter() returns, KVM will ack
> > that and keep it in its pmu data structure.
>
> I think it's possible that someone creates !exclude_guest event after

I assume you mean an exclude_guest=1 event? Because perf should be in a state
where it rejects exclude_guest=0 events.

> the perf_guest_enter(). The stale information is saved in the KVM. Perf
> will schedule the event in the next perf_guest_exit(). KVM will not know it.

Ya, the creation of an event on a CPU that currently has guest PMU state loaded
is what I had in mind when I suggested a callback in my sketch:

: D. Add a perf callback that is invoked from IRQ context when perf wants to
: configure a new PMU-based events, *before* actually programming the MSRs,
: and have KVM's callback put the guest PMU state

It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of
kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback
into KVM.
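
Concretely, that would extend the existing callback structure with one
more hook, along these lines (the new member name is invented):

struct perf_guest_info_callbacks {
        unsigned int  (*state)(void);
        unsigned long (*get_ip)(void);
        unsigned int  (*handle_intel_pt_intr)(void);
        /* New: invoked from IRQ context, before perf programs any
         * PMU MSRs, so KVM can put the loaded guest PMU state. */
        void          (*put_guest_pmu_state)(void);
};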

2024-04-25 22:12:36

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-25 12:24 a.m., Mingwei Zhang wrote:
> On Wed, Apr 24, 2024 at 8:56 PM Mi, Dapeng <[email protected]> wrote:
>>
>>> [...]
>>
>> I see. So KVM needs to provide a callback which needs to be called in
>> the IPI handler. The KVM callback needs to be called to switch PMU state
>> before perf really enabling host event and touching PMU MSRs. And only
>> the perf event with exclude_guest attribute is allowed to create on
>> host. Thanks.
>
> Do we really need a KVM callback? I think that is one option.
>
> Immediately after VMEXIT, KVM will check whether there are "host perf
> events". If so, do the PMU context switch immediately. Otherwise, keep
> deferring the context switch to the end of vPMU loop.
>
> Detecting if there are "host perf events" would be interesting. The
> "host perf events" refer to the perf_events on the host that are
> active and assigned with HW counters and that are saved when context
> switching to the guest PMU. I think getting those events could be done
> by fetching the bitmaps in cpuc.

The cpuc is an ARCH-specific structure. I don't think it can be fetched
in the generic code. You probably have to implement ARCH-specific
functions to fetch the bitmaps. It probably isn't worth it.

You may check the pinned_groups and flexible_groups to understand if
there are host perf events which may be scheduled at VM-exit. But that
will not tell the idx of the counters, which can only be known when the
host event is really scheduled.

> I have to look into the details. But
> at the time of VMEXIT, kvm should already have that information, so it
> can immediately decide whether to do the PMU context switch or not.
>
> oh, but when the control is executing within the run loop, a
> host-level profiling starts, say 'perf record -a ...', it will
> generate an IPI to all CPUs. Maybe that's when we need a callback so
> the KVM guest PMU context gets preempted for the host-level profiling.
> Gah..
>
> hmm, not a fan of that. That means the host can poke the guest PMU
> context at any time and cause higher overhead. But I admit it is much
> better than the current approach.
>
> The only thing is that: any command like 'perf record/stat -a' shot in
> dark corners of the host can preempt guest PMUs of _all_ running VMs.
> So, to alleviate that, maybe a module parameter that disables this
> "preemption" is possible? This should fit scenarios where we don't
> want guest PMU to be preempted outside of the vCPU loop?
>

It should not happen. In the current implementation, perf rejects all
!exclude_guest system-wide event creation if a guest with the vPMU is
running.
However, it's possible to create an exclude_guest system-wide event at
any time. KVM cannot use the information from the VM-entry to decide if
there will be active perf events at VM-exit.

perf_guest_exit() will reload the host state, and it's impossible to
save the guest state after that. We may need a KVM callback, so perf can
tell KVM to save the guest state before perf reloads the host state.
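
In other words, the ordering inside perf_guest_exit() would need to be
roughly as follows (a sketch; the callback name is invented):

void perf_guest_exit_sketch(void)
{
        /* 1. Ask KVM to save guest PMU state while it is still live. */
        if (perf_guest_cbs && perf_guest_cbs->save_guest_pmu_state)
                perf_guest_cbs->save_guest_pmu_state();

        /* 2. Only then reload the host PMU state. */
        sched_in_exclude_guest_events();
}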

Thanks,
Kan
>>
>>
>>>
>>> My original sketch: https://lore.kernel.org/all/[email protected]
>

2024-04-26 00:22:31

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>>
>>
>>
>> On 2024-04-25 12:24 a.m., Mingwei Zhang wrote:
>>> [...]
>>
>> It should not happen. For the current implementation, perf rejects all
>> the !exclude_guest system-wide event creation if a guest with the vPMU
>> is running.
>> However, it's possible to create an exclude_guest system-wide event at
>> any time. KVM cannot use the information from the VM-entry to decide if
>> there will be active perf events in the VM-exit.
>
> Hmm, why not? If there is any exclude_guest system-wide event,
> perf_guest_enter() can return something to tell KVM "hey, some active
> host events are swapped out. they are originally in counter #2 and
> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
> that and keep it in its pmu data structure.

I think it's possible that someone creates a !exclude_guest event after
perf_guest_enter(). The stale information is then saved in KVM. Perf
will schedule the event at the next perf_guest_exit(), and KVM will not
know it.

>
> Now, when doing context switching back to host at just VMEXIT, KVM
> will check this data and see if host perf context has something active
> (of course, they are all exclude_guest events). If not, deferring the
> context switch to vcpu boundary. Otherwise, do the proper PMU context
> switching by respecting the occupied counter positions on the host
> side, i.e., avoid doubling the work on the KVM side.
>
> Kan, any suggestion on the above approach?

I think we can only know the accurate event list at perf_guest_exit().
You may check the pinned_groups and flexible_groups, which tell if there
are candidate events.
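
For instance, something like the following sketch, assuming the rb-tree
layout of struct perf_event_groups in the current kernel:

/* Sketch: a cheap "any candidate host events?" check. */
static bool ctx_has_candidate_events(struct perf_event_context *ctx)
{
        return !RB_EMPTY_ROOT(&ctx->pinned_groups.tree) ||
               !RB_EMPTY_ROOT(&ctx->flexible_groups.tree);
}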

> Totally understand that
> there might be some difficulty, since perf subsystem works in several
> layers and obviously fetching low-level mapping is arch specific work.
> If that is difficult, we can split the work in two phases: 1) phase
> #1, just ask perf to tell kvm if there are active exclude_guest events
> swapped out; 2) phase #2, ask perf to tell their (low-level) counter
> indices.
>

If you want an accurate counter mask, changes in the arch-specific code
are required. Two phases sound good to me.

Besides the perf changes, I think KVM should also track which counters
need to be saved/restored. That information can be derived from the
EventSel interception.
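
For example, KVM could mark a counter as needing save/restore whenever
an intercepted EVENTSEL write enables it. A sketch (the dirty_counters
bitmap is invented):

static void intel_eventsel_write_sketch(struct kvm_vcpu *vcpu,
                                        u32 idx, u64 data)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        pmu->gp_counters[idx].eventsel = data;
        if (data & ARCH_PERFMON_EVENTSEL_ENABLE)
                __set_bit(idx, pmu->dirty_counters);    /* invented */
        else
                __clear_bit(idx, pmu->dirty_counters);
}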

Thanks,
Kan
>>
>> The perf_guest_exit() will reload the host state. It's impossible to
>> save the guest state after that. We may need a KVM callback. So perf can
>> tell KVM whether to save the guest state before perf reloads the host state.
>>
>> Thanks,
>> Kan
>>>>
>>>>
>>>>>
>>>>> My original sketch: https://lore.kernel.org/all/[email protected]
>>>
>

2024-04-26 03:12:53

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <[email protected]> wrote:
>
>
> On 4/26/2024 5:46 AM, Sean Christopherson wrote:
> > On Thu, Apr 25, 2024, Kan Liang wrote:
> >> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
> >>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
> >>>> It should not happen. For the current implementation, perf rejects all
> >>>> the !exclude_guest system-wide event creation if a guest with the vPMU
> >>>> is running.
> >>>> However, it's possible to create an exclude_guest system-wide event at
> >>>> any time. KVM cannot use the information from the VM-entry to decide if
> >>>> there will be active perf events in the VM-exit.
> >>> Hmm, why not? If there is any exclude_guest system-wide event,
> >>> perf_guest_enter() can return something to tell KVM "hey, some active
> >>> host events are swapped out. they are originally in counter #2 and
> >>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
> >>> that and keep it in its pmu data structure.
> >> I think it's possible that someone creates !exclude_guest event after
> > I assume you mean an exclude_guest=1 event? Because perf should be in a state
> > where it rejects exclude_guest=0 events.
>
> Yes, it should be an exclude_guest=1 event; perf events without the
> exclude_guest attribute will be blocked from creation in the v2 patches
> which we are working on.
>
>
> >
> >> the perf_guest_enter(). The stale information is saved in the KVM. Perf
> >> will schedule the event in the next perf_guest_exit(). KVM will not know it.
> > Ya, the creation of an event on a CPU that currently has guest PMU state loaded
> > is what I had in mind when I suggested a callback in my sketch:
> >
> > : D. Add a perf callback that is invoked from IRQ context when perf wants to
> > : configure a new PMU-based events, *before* actually programming the MSRs,
> > : and have KVM's callback put the guest PMU state
>
>
> When the host creates a perf event with the exclude_guest attribute to
> profile KVM/VMM user space, the vCPU process could be at one of three
> places.
>
> 1. in guest state (non-root mode)
>
> 2. inside vcpu-loop
>
> 3. outside vcpu-loop
>
> Since the PMU state has already been switched to host state, we don't
> need to consider the case 3 and only care about cases 1 and 2.
>
> When the host creates a perf event with the exclude_guest attribute to
> profile KVM/VMM user space, an IPI is eventually triggered to enable
> the perf event, as the following code shows.
>
> event_function_call(event, __perf_event_enable, NULL);
>
> For case 1, a vm-exit is triggered, KVM starts to process the vm-exit,
> and then the IPI irq handler runs, specifically __perf_event_enable(),
> to enable the perf event.
>
> For case 2, the IPI irq handler preempts the vcpu-loop and calls
> __perf_event_enable() to enable the perf event.
>
> So IMO KVM just needs to provide a callback to switch the guest/host
> PMU state, and __perf_event_enable() calls this callback before
> actually touching the PMU MSRs.

OK, in this case, do we still need KVM to query perf for active
exclude_guest events? I think yes, because there is an ordering issue.
The above covers the case where host-level perf profiling starts while a
VM is already running: an IPI invokes the callback, triggers preemption,
and KVM switches the context from guest to host. What if it is the other
way around, i.e., host-level profiling runs first and then the VM runs?

In this case, just before entering the vcpu loop, KVM should check
whether there is an active host event and save that into a pmu data
structure. If there is none, do the context switch early (so that KVM
saves a huge number of unnecessary PMU context switches in the future).
Otherwise, keep the host PMU context until vm-enter. At the time of
vm-exit, do the check again using the data stored in the pmu structure.
If there is an active event, do the context switch to the host PMU;
otherwise defer it until exiting the vcpu loop. Of course, in the
meantime, if any perf profiling starts and causes the IPI, the irq
handler calls the callback, preempting the guest PMU context. If that
happens, at the time of exiting the vcpu boundary, the PMU context
switch is skipped since it has already been done. Note that the irq
could come at any time, so the PMU context switch in all 4 locations
needs to check the state flag (and skip the context switch if needed).

So this requires that vcpu->pmu carry two pieces of state information:
1) a flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase
#1: just a boolean; phase #2: a bitmap of occupied counters).
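
Roughly, with invented field names:

/* Sketch of the per-vCPU switch state described above. */
struct kvm_pmu_sched_state {
        bool guest_state_loaded;   /* analogous to TIF_NEED_FPU_LOAD */
        bool host_has_events;      /* phase #1: a boolean is enough */
        u64  host_counter_mask;    /* phase #2: occupied host counters */
};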

This is a non-trivial optimization on the PMU context switch. I am
thinking about splitting them into the following phases:

1) lazy PMU context switch, i.e., wait until the guest touches a PMU MSR
for the 1st time.
2) fast PMU context switch on the KVM side, i.e., KVM checks the event
selector value (enable/disable) and selectively switches PMU state
(reducing rd/wr MSRs).
3) dynamic PMU context switch boundary, i.e., KVM can dynamically choose
the PMU context switch boundary depending on existing active host-level
events.
3.1) more accurate dynamic PMU context switch, i.e., KVM checks
host-level counter positions and further reduces the number of MSR
accesses.
4) guest PMU context preemption, i.e., any new host-level perf profiling
can immediately preempt the guest PMU in the vcpu loop (instead of
waiting for the next PMU context switch in KVM).

Thanks.
-Mingwei
>
> >
> > It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of
> > kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback
> > into KVM.

2024-04-26 05:26:56

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/26/2024 4:43 AM, Liang, Kan wrote:
>
> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>>>
>>>
>>> On 2024-04-25 12:24 a.m., Mingwei Zhang wrote:
>>>> [...]
>>> It should not happen. For the current implementation, perf rejects all
>>> the !exclude_guest system-wide event creation if a guest with the vPMU
>>> is running.
>>> However, it's possible to create an exclude_guest system-wide event at
>>> any time. KVM cannot use the information from the VM-entry to decide if
>>> there will be active perf events in the VM-exit.
>> Hmm, why not? If there is any exclude_guest system-wide event,
>> perf_guest_enter() can return something to tell KVM "hey, some active
>> host events are swapped out. they are originally in counter #2 and
>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
>> that and keep it in its pmu data structure.
> I think it's possible that someone creates !exclude_guest event after
> the perf_guest_enter(). The stale information is saved in the KVM. Perf
> will schedule the event in the next perf_guest_exit(). KVM will not know it.
>
>> Now, when doing context switching back to host at just VMEXIT, KVM
>> will check this data and see if host perf context has something active
>> (of course, they are all exclude_guest events). If not, deferring the
>> context switch to vcpu boundary. Otherwise, do the proper PMU context
>> switching by respecting the occupied counter positions on the host
>> side, i.e., avoid doubling the work on the KVM side.
>>
>> Kan, any suggestion on the above approach?
> I think we can only know the accurate event list at perf_guest_exit().
> You may check the pinned_groups and flexible_groups, which tell if there
> are candidate events.
>
>> Totally understand that
>> there might be some difficulty, since perf subsystem works in several
>> layers and obviously fetching low-level mapping is arch specific work.
>> If that is difficult, we can split the work in two phases: 1) phase
>> #1, just ask perf to tell kvm if there are active exclude_guest events
>> swapped out; 2) phase #2, ask perf to tell their (low-level) counter
>> indices.
>>
> If you want an accurate counter mask, the changes in the arch specific
> code is required. Two phases sound good to me.
>
> Besides perf changes, I think the KVM should also track which counters
> need to be saved/restored. The information can be get from the EventSel
> interception.

Yes, that's another optimization from the guest's point of view. It's on
our to-do list.


>
> Thanks,
> Kan
>>> The perf_guest_exit() will reload the host state. It's impossible to
>>> save the guest state after that. We may need a KVM callback. So perf can
>>> tell KVM whether to save the guest state before perf reloads the host state.
>>>
>>> Thanks,
>>> Kan
>>>>>
>>>>>> My original sketch: https://lore.kernel.org/all/ZR3eNtP5IVAHeFNC@google.com

2024-04-26 06:56:56

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/26/2024 5:46 AM, Sean Christopherson wrote:
> On Thu, Apr 25, 2024, Kan Liang wrote:
>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>>>> It should not happen. For the current implementation, perf rejects all
>>>> the !exclude_guest system-wide event creation if a guest with the vPMU
>>>> is running.
>>>> However, it's possible to create an exclude_guest system-wide event at
>>>> any time. KVM cannot use the information from the VM-entry to decide if
>>>> there will be active perf events in the VM-exit.
>>> Hmm, why not? If there is any exclude_guest system-wide event,
>>> perf_guest_enter() can return something to tell KVM "hey, some active
>>> host events are swapped out. they are originally in counter #2 and
>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
>>> that and keep it in its pmu data structure.
>> I think it's possible that someone creates !exclude_guest event after
> I assume you mean an exclude_guest=1 event? Because perf should be in a state
> where it rejects exclude_guest=0 events.

Yes, it should be an exclude_guest=1 event. Creating a perf event
without the exclude_guest attribute will be blocked in the v2 patches
which we are working on.
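
A rough sketch of the check we have in mind (the helper name here is
illustrative only, not the final code):

/*
 * Sketch only: reject host events that do not set exclude_guest while
 * the passthrough vPMU may be in use. is_passthrough_pmu_in_use() is a
 * hypothetical helper.
 */
static int x86_pmu_check_exclude_guest(struct perf_event *event)
{
	if (is_passthrough_pmu_in_use() && !event->attr.exclude_guest)
		return -EOPNOTSUPP;

	return 0;
}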


>
>> the perf_guest_enter(). The stale information is saved in the KVM. Perf
>> will schedule the event in the next perf_guest_exit(). KVM will not know it.
> Ya, the creation of an event on a CPU that currently has guest PMU state loaded
> is what I had in mind when I suggested a callback in my sketch:
>
> : D. Add a perf callback that is invoked from IRQ context when perf wants to
> : configure a new PMU-based events, *before* actually programming the MSRs,
> : and have KVM's callback put the guest PMU state


When the host creates a perf event with the exclude_guest attribute to
profile KVM/VMM user space, the vCPU process could be executing in one
of three places:

1. in guest state (non-root mode)

2. inside vcpu-loop

3. outside vcpu-loop

In case 3 the PMU state has already been switched to the host state, so
we only need to consider cases 1 and 2.

When the host creates such a perf event, an IPI is eventually triggered
to enable it, as the following call shows:

event_function_call(event, __perf_event_enable, NULL);

For case 1, a VM-exit is triggered; KVM processes the VM-exit and then
runs the IPI IRQ handler, specifically __perf_event_enable(), to enable
the perf event.

For case 2, the IPI IRQ handler preempts the vcpu-loop and calls
__perf_event_enable() to enable the perf event.

So IMO KVM just needs to provide a callback to switch the guest/host
PMU state, and __perf_event_enable() calls this callback before really
touching the PMU MSRs.
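
A minimal sketch of that callback wiring (both the table and its member
are assumptions for illustration, not existing perf symbols):

/*
 * Sketch only: before __perf_event_enable() programs the counter MSRs,
 * give KVM a chance to save the guest PMU state and load the host state.
 */
struct kvm_pmu_guest_cbs {
	void (*put_guest_pmu)(void);	/* implemented by KVM */
};

static struct kvm_pmu_guest_cbs *kvm_pmu_cbs;	/* registered by KVM */

static void perf_put_guest_pmu(void)
{
	struct kvm_pmu_guest_cbs *cbs = READ_ONCE(kvm_pmu_cbs);

	if (cbs && cbs->put_guest_pmu)
		cbs->put_guest_pmu();
}

__perf_event_enable() would call perf_put_guest_pmu() before touching
any counter MSRs.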

>
> It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of
> kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback
> into KVM.

2024-04-26 11:12:38

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/26/2024 11:12 AM, Mingwei Zhang wrote:
> On Thu, Apr 25, 2024 at 6:46 PM Mi, Dapeng <[email protected]> wrote:
>>
>> On 4/26/2024 5:46 AM, Sean Christopherson wrote:
>>> On Thu, Apr 25, 2024, Kan Liang wrote:
>>>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
>>>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>>>>>> It should not happen. For the current implementation, perf rejects all
>>>>>> the !exclude_guest system-wide event creation if a guest with the vPMU
>>>>>> is running.
>>>>>> However, it's possible to create an exclude_guest system-wide event at
>>>>>> any time. KVM cannot use the information from the VM-entry to decide if
>>>>>> there will be active perf events in the VM-exit.
>>>>> Hmm, why not? If there is any exclude_guest system-wide event,
>>>>> perf_guest_enter() can return something to tell KVM "hey, some active
>>>>> host events are swapped out. they are originally in counter #2 and
>>>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
>>>>> that and keep it in its pmu data structure.
>>>> I think it's possible that someone creates !exclude_guest event after
>>> I assume you mean an exclude_guest=1 event? Because perf should be in a state
>>> where it rejects exclude_guest=0 events.
>> Suppose should be exclude_guest=1 event, the perf event without
>> exclude_guest attribute would be blocked to create in the v2 patches
>> which we are working on.
>>
>>
>>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf
>>>> will schedule the event in the next perf_guest_exit(). KVM will not know it.
>>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded
>>> is what I had in mind when I suggested a callback in my sketch:
>>>
>>> : D. Add a perf callback that is invoked from IRQ context when perf wants to
>>> : configure a new PMU-based events, *before* actually programming the MSRs,
>>> : and have KVM's callback put the guest PMU state
>>
>> when host creates a perf event with exclude_guest attribute which is
>> used to profile KVM/VMM user space, the vCPU process could work at three
>> places.
>>
>> 1. in guest state (non-root mode)
>>
>> 2. inside vcpu-loop
>>
>> 3. outside vcpu-loop
>>
>> Since the PMU state has already been switched to host state, we don't
>> need to consider the case 3 and only care about cases 1 and 2.
>>
>> when host creates a perf event with exclude_guest attribute to profile
>> KVM/VMM user space, an IPI is triggered to enable the perf event
>> eventually like the following code shows.
>>
>> event_function_call(event, __perf_event_enable, NULL);
>>
>> For case 1, a vm-exit is triggered and KVM starts to process the
>> vm-exit and then run IPI irq handler, exactly speaking
>> __perf_event_enable() to enable the perf event.
>>
>> For case 2, the IPI irq handler would preempt the vcpu-loop and call
>> __perf_event_enable() to enable the perf event.
>>
>> So IMO KVM just needs to provide a callback to switch guest/host PMU
>> state, and __perf_event_enable() calls this callback before really
>> touching PMU MSRs.
> ok, in this case, do we still need KVM to query perf if there are
> active exclude_guest events? yes? Because there is an ordering issue.
> The above suggests that the host-level perf profiling comes when a VM
> is already running, there is an IPI that can invoke the callback and
> trigger preemption. In this case, KVM should switch the context from
> guest to host. What if it is the other way around, ie., host-level
> profiling runs first and then VM runs?
>
> In this case, just before entering the vcpu loop, kvm should check
> whether there is an active host event and save that into a pmu data
> structure. If none, do the context switch early (so that KVM saves a
> huge amount of unnecessary PMU context switches in the future).
> Otherwise, keep the host PMU context until vm-enter. At the time of
> vm-exit, do the check again using the data stored in pmu structure. If
> there is an active event do the context switch to the host PMU,
> otherwise defer that until exiting the vcpu loop. Of course, in the
> meantime, if there is any perf profiling started causing the IPI, the
> irq handler calls the callback, preempting the guest PMU context. If
> that happens, at the time of exiting the vcpu boundary, PMU context
> switch is skipped since it is already done. Of course, note that the
> irq could come at any time, so the PMU context switch in all 4
> locations need to check the state flag (and skip the context switch if
> needed).
>
> So this requires vcpu->pmu has two pieces of state information: 1) the
> flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1
> just a boolean; phase #2, bitmap of occupied counters).

I haven't had a chance to look at the details of the vFPU
implementation yet, so I don't know exactly what we need on the vPMU
side, a flag or a callback. Anyway, those are implementation details;
we can look at them when we start implementing it.
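
For reference, the two pieces of state described above could look
roughly like this (a sketch; every field name is made up):

/* Sketch only: per-vCPU bookkeeping for the deferred PMU context switch. */
struct kvm_pmu_switch_state {
	bool	guest_state_loaded;	/* flag similar to TIF_NEED_FPU_LOAD */
	bool	host_events_active;	/* phase #1: any active host events? */
	u64	host_counter_bitmap;	/* phase #2: counters occupied by host */
};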

>
> This is a non-trivial optimization on the PMU context switch. I am
> thinking about splitting them into the following phases:
>
> 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR
> for the 1st time.
> 2) fast PMU context switch on KVM side, i.e., KVM checking event
> selector value (enable/disable) and selectively switch PMU state
> (reducing rd/wr msrs)
> 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU
> context switch boundary depending on existing active host-level
> events.
> 3.1) more accurate dynamic PMU context switch, ie., KVM checking
> host-level counter position and further reduces the number of msr
> accesses.
> 4) guest PMU context preemption, i.e., any new host-level perf
> profiling can immediately preempt the guest PMU in the vcpu loop
> (instead of waiting for the next PMU context switch in KVM).

Great! We now have a clear picture of the whole optimization. BTW,
optimizations 1 and 2 are already on our original to-do list. We plan
to do them after RFC v2 is ready.
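
For example, the save path of optimization 2 could look roughly like
this (a sketch reusing the MSR helpers from this patch; it assumes a
counter with a disabled event selector holds no state worth saving):

/*
 * Sketch only: save/clear just the GP counters that are actually
 * enabled, reducing the number of MSR accesses.
 */
static void intel_save_gp_counters_fast(struct kvm_pmu *pmu)
{
	struct kvm_pmc *pmc;
	u32 i;

	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		pmc = &pmu->gp_counters[i];
		if (!(pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;	/* disabled: skip the rd/wr */

		rdpmcl(i, pmc->counter);
		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
		wrmsrl(MSR_IA32_PMC0 + i, 0);
	}
}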


>
> Thanks.
> -Mingwei
>>> It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of
>>> kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback
>>> into KVM.

2024-04-26 13:56:27

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-25 5:46 p.m., Sean Christopherson wrote:
> On Thu, Apr 25, 2024, Kan Liang wrote:
>> On 2024-04-25 4:16 p.m., Mingwei Zhang wrote:
>>> On Thu, Apr 25, 2024 at 9:13 AM Liang, Kan <[email protected]> wrote:
>>>> It should not happen. For the current implementation, perf rejects all
>>>> the !exclude_guest system-wide event creation if a guest with the vPMU
>>>> is running.
>>>> However, it's possible to create an exclude_guest system-wide event at
>>>> any time. KVM cannot use the information from the VM-entry to decide if
>>>> there will be active perf events in the VM-exit.
>>>
>>> Hmm, why not? If there is any exclude_guest system-wide event,
>>> perf_guest_enter() can return something to tell KVM "hey, some active
>>> host events are swapped out. they are originally in counter #2 and
>>> #3". If so, at the time when perf_guest_enter() returns, KVM will ack
>>> that and keep it in its pmu data structure.
>>
>> I think it's possible that someone creates !exclude_guest event after
>
> I assume you mean an exclude_guest=1 event? Because perf should be in a state
> where it rejects exclude_guest=0 events.
>

Right.

>> the perf_guest_enter(). The stale information is saved in the KVM. Perf
>> will schedule the event in the next perf_guest_exit(). KVM will not know it.
>
> Ya, the creation of an event on a CPU that currently has guest PMU state loaded
> is what I had in mind when I suggested a callback in my sketch:
>
> : D. Add a perf callback that is invoked from IRQ context when perf wants to
> : configure a new PMU-based events, *before* actually programming the MSRs,
> : and have KVM's callback put the guest PMU state
>
> It's a similar idea to TIF_NEED_FPU_LOAD, just that instead of a common chunk of
> kernel code swapping out the guest state (kernel_fpu_begin()), it's a callback
> into KVM.

Yes, a callback is required. I think it should be invoked right before
switching back to the host perf events, so that there is an accurate
list of active events.
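
In other words, roughly (the save_guest_pmu member and the callback
table are assumptions; perf_guest_exit() is the function proposed in
this series):

/*
 * Sketch only: save the guest PMU state *before* perf reloads the host
 * state, so the active host event list is accurate at that point.
 */
void perf_guest_exit(void)
{
	if (kvm_pmu_cbs && kvm_pmu_cbs->save_guest_pmu)	/* assumed hook */
		kvm_pmu_cbs->save_guest_pmu();

	/* ... then reschedule the host exclude_guest events ... */
}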

Thanks,
Kan

2024-04-26 18:42:36

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Fri, Apr 26, 2024 at 7:10 AM Liang, Kan <[email protected]> wrote:
>
>
>
> On 2024-04-25 11:12 p.m., Mingwei Zhang wrote:
> >>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf
> >>>> will schedule the event in the next perf_guest_exit(). KVM will not know it.
> >>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded
> >>> is what I had in mind when I suggested a callback in my sketch:
> >>>
> >>> : D. Add a perf callback that is invoked from IRQ context when perf wants to
> >>> : configure a new PMU-based events, *before* actually programming the MSRs,
> >>> : and have KVM's callback put the guest PMU state
> >>
> >> when host creates a perf event with exclude_guest attribute which is
> >> used to profile KVM/VMM user space, the vCPU process could work at three
> >> places.
> >>
> >> 1. in guest state (non-root mode)
> >>
> >> 2. inside vcpu-loop
> >>
> >> 3. outside vcpu-loop
> >>
> >> Since the PMU state has already been switched to host state, we don't
> >> need to consider the case 3 and only care about cases 1 and 2.
> >>
> >> when host creates a perf event with exclude_guest attribute to profile
> >> KVM/VMM user space, an IPI is triggered to enable the perf event
> >> eventually like the following code shows.
> >>
> >> event_function_call(event, __perf_event_enable, NULL);
> >>
> >> For case 1, a vm-exit is triggered and KVM starts to process the
> >> vm-exit and then run IPI irq handler, exactly speaking
> >> __perf_event_enable() to enable the perf event.
> >>
> >> For case 2, the IPI irq handler would preempt the vcpu-loop and call
> >> __perf_event_enable() to enable the perf event.
> >>
> >> So IMO KVM just needs to provide a callback to switch guest/host PMU
> >> state, and __perf_event_enable() calls this callback before really
> >> touching PMU MSRs.
> > ok, in this case, do we still need KVM to query perf if there are
> > active exclude_guest events? yes? Because there is an ordering issue.
> > The above suggests that the host-level perf profiling comes when a VM
> > is already running, there is an IPI that can invoke the callback and
> > trigger preemption. In this case, KVM should switch the context from
> > guest to host. What if it is the other way around, ie., host-level
> > profiling runs first and then VM runs?
> >
> > In this case, just before entering the vcpu loop, kvm should check
> > whether there is an active host event and save that into a pmu data
> > structure.
>
> KVM doesn't need to save/restore the host state. Host perf has the
> information and will reload the values whenever the host events are
> rescheduled. But I think KVM should clear the registers used by the host
> to prevent the value leaks to the guest.

Right. KVM needs to know about the host state to optimize its own PMU
context switch. If the host is using the counter at index 1, say, then
KVM may not need to zap the value of counter #1, since the perf side
will overwrite it anyway.
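
A sketch of what that could look like (host_counter_bitmap is the
assumed phase-#2 information handed over by perf):

/*
 * Sketch only: when unloading the guest PMU, skip zapping the counters
 * that host perf is about to reprogram anyway.
 */
static void kvm_pmu_zap_guest_counters(struct kvm_pmu *pmu,
				       u64 host_counter_bitmap)
{
	u32 i;

	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
		if (host_counter_bitmap & BIT_ULL(i))
			continue;	/* host perf will overwrite it */

		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
		wrmsrl(MSR_IA32_PMC0 + i, 0);
	}
}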

>
> > If none, do the context switch early (so that KVM saves a
> > huge amount of unnecessary PMU context switches in the future).
> > Otherwise, keep the host PMU context until vm-enter. At the time of
> > vm-exit, do the check again using the data stored in pmu structure. If
> > there is an active event do the context switch to the host PMU,
> > otherwise defer that until exiting the vcpu loop. Of course, in the
> > meantime, if there is any perf profiling started causing the IPI, the
> > irq handler calls the callback, preempting the guest PMU context. If
> > that happens, at the time of exiting the vcpu boundary, PMU context
> > switch is skipped since it is already done. Of course, note that the
> > irq could come at any time, so the PMU context switch in all 4
> > locations need to check the state flag (and skip the context switch if
> > needed).
> >
> > So this requires vcpu->pmu has two pieces of state information: 1) the
> > flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1
> > just a boolean; phase #2, bitmap of occupied counters).
> >
> > This is a non-trivial optimization on the PMU context switch. I am
> > thinking about splitting them into the following phases:
> >
> > 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR
> > for the 1st time.
> > 2) fast PMU context switch on KVM side, i.e., KVM checking event
> > selector value (enable/disable) and selectively switch PMU state
> > (reducing rd/wr msrs)
> > 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU
> > context switch boundary depending on existing active host-level
> > events.
> > 3.1) more accurate dynamic PMU context switch, ie., KVM checking
> > host-level counter position and further reduces the number of msr
> > accesses.
> > 4) guest PMU context preemption, i.e., any new host-level perf
> > profiling can immediately preempt the guest PMU in the vcpu loop
> > (instead of waiting for the next PMU context switch in KVM).
>
> I'm not quit sure about the 4.
> The new host-level perf must be an exclude_guest event. It should not be
> scheduled when a guest is using the PMU. Why do we want to preempt the
> guest PMU? The current implementation in perf doesn't schedule any
> exclude_guest events when a guest is running.

Right. The grey area is the code within the KVM_RUN loop but _outside_
of the guest. This part of the code is on the "host" side. However, for
efficiency reasons, KVM defers the PMU context switch by retaining the
guest PMU MSR values within the loop. Optimization 4 allows the host
side to profile this part immediately instead of waiting for the vCPU
to reach the PMU context switch locations. Doing so will generate more
accurate results.

Do we want to preempt that? I think it depends. For regular cloud
usage, we don't. But for other usages where we want to prioritize
KVM/VMM profiling over the guest vPMU, it is useful.

My current opinion is that optimization 4 is nice to have, but we
should allow people to turn it off, just as we can choose to disable
kernel preemption.
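
e.g. something as simple as (a sketch; the parameter name is made up):

/* Sketch only: let admins opt out of guest PMU preemption. */
static bool preempt_guest_pmu = true;
module_param(preempt_guest_pmu, bool, 0444);
MODULE_PARM_DESC(preempt_guest_pmu,
		 "Allow host perf events to preempt the guest PMU inside the vCPU loop");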

Thanks.
-Mingwei
>
> Thanks,
> Kan

2024-04-26 19:48:05

by Sean Christopherson

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Fri, Apr 26, 2024, Kan Liang wrote:
> > Optimization 4
> > allows the host side to immediately profiling this part instead of
> > waiting for vcpu to reach to PMU context switch locations. Doing so
> > will generate more accurate results.
>
> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the
> definition of the exclude_guest. Without 4, it brings some random blind
> spots, right?

+1, I view it as a hard requirement. It's not an optimization, it's about
accuracy and functional correctness.

What _is_ an optimization is keeping guest state loaded while KVM is in its
run loop, i.e. initial mediated/passthrough PMU support could land upstream with
unconditional switches at entry/exit. The performance of KVM would likely be
unacceptable for any production use cases, but that would give us motivation to
finish the job, and it doesn't result in random, hard to diagnose issues for
userspace.

> > Do we want to preempt that? I think it depends. For regular cloud
> > usage, we don't. But for any other usages where we want to prioritize
> > KVM/VMM profiling over guest vPMU, it is useful.
> >
> > My current opinion is that optimization 4 is something nice to have.
> > But we should allow people to turn it off just like we could choose to
> > disable preempt kernel.
>
> The exclude_guest means everything but the guest. I don't see a reason
> why people want to turn it off and get some random blind spots.

2024-04-26 20:17:38

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-25 11:12 p.m., Mingwei Zhang wrote:
>>>> the perf_guest_enter(). The stale information is saved in the KVM. Perf
>>>> will schedule the event in the next perf_guest_exit(). KVM will not know it.
>>> Ya, the creation of an event on a CPU that currently has guest PMU state loaded
>>> is what I had in mind when I suggested a callback in my sketch:
>>>
>>> : D. Add a perf callback that is invoked from IRQ context when perf wants to
>>> : configure a new PMU-based events, *before* actually programming the MSRs,
>>> : and have KVM's callback put the guest PMU state
>>
>> when host creates a perf event with exclude_guest attribute which is
>> used to profile KVM/VMM user space, the vCPU process could work at three
>> places.
>>
>> 1. in guest state (non-root mode)
>>
>> 2. inside vcpu-loop
>>
>> 3. outside vcpu-loop
>>
>> Since the PMU state has already been switched to host state, we don't
>> need to consider the case 3 and only care about cases 1 and 2.
>>
>> when host creates a perf event with exclude_guest attribute to profile
>> KVM/VMM user space, an IPI is triggered to enable the perf event
>> eventually like the following code shows.
>>
>> event_function_call(event, __perf_event_enable, NULL);
>>
>> For case 1, a vm-exit is triggered and KVM starts to process the
>> vm-exit and then run IPI irq handler, exactly speaking
>> __perf_event_enable() to enable the perf event.
>>
>> For case 2, the IPI irq handler would preempt the vcpu-loop and call
>> __perf_event_enable() to enable the perf event.
>>
>> So IMO KVM just needs to provide a callback to switch guest/host PMU
>> state, and __perf_event_enable() calls this callback before really
>> touching PMU MSRs.
> ok, in this case, do we still need KVM to query perf if there are
> active exclude_guest events? yes? Because there is an ordering issue.
> The above suggests that the host-level perf profiling comes when a VM
> is already running, there is an IPI that can invoke the callback and
> trigger preemption. In this case, KVM should switch the context from
> guest to host. What if it is the other way around, ie., host-level
> profiling runs first and then VM runs?
>
> In this case, just before entering the vcpu loop, kvm should check
> whether there is an active host event and save that into a pmu data
> structure.

KVM doesn't need to save/restore the host state. Host perf has the
information and will reload the values whenever the host events are
rescheduled. But I think KVM should clear the registers used by the
host, to prevent their values from leaking to the guest.
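
A sketch of that clearing on the VM-entry side (the bitmap of host-used
counters is assumed to be provided by perf):

/*
 * Sketch only: zero the GP counters the host was using so their values
 * cannot leak into the guest. Host perf reloads them when the host
 * events are rescheduled.
 */
static void kvm_pmu_clear_host_counters(u64 host_used_bitmap,
					int nr_gp_counters)
{
	int i;

	for (i = 0; i < nr_gp_counters; i++) {
		if (!(host_used_bitmap & BIT_ULL(i)))
			continue;

		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0);
		wrmsrl(MSR_IA32_PMC0 + i, 0);
	}
}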

> If none, do the context switch early (so that KVM saves a
> huge amount of unnecessary PMU context switches in the future).
> Otherwise, keep the host PMU context until vm-enter. At the time of
> vm-exit, do the check again using the data stored in pmu structure. If
> there is an active event do the context switch to the host PMU,
> otherwise defer that until exiting the vcpu loop. Of course, in the
> meantime, if there is any perf profiling started causing the IPI, the
> irq handler calls the callback, preempting the guest PMU context. If
> that happens, at the time of exiting the vcpu boundary, PMU context
> switch is skipped since it is already done. Of course, note that the
> irq could come at any time, so the PMU context switch in all 4
> locations need to check the state flag (and skip the context switch if
> needed).
>
> So this requires vcpu->pmu has two pieces of state information: 1) the
> flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1
> just a boolean; phase #2, bitmap of occupied counters).
>
> This is a non-trivial optimization on the PMU context switch. I am
> thinking about splitting them into the following phases:
>
> 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR
> for the 1st time.
> 2) fast PMU context switch on KVM side, i.e., KVM checking event
> selector value (enable/disable) and selectively switch PMU state
> (reducing rd/wr msrs)
> 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU
> context switch boundary depending on existing active host-level
> events.
> 3.1) more accurate dynamic PMU context switch, ie., KVM checking
> host-level counter position and further reduces the number of msr
> accesses.
> 4) guest PMU context preemption, i.e., any new host-level perf
> profiling can immediately preempt the guest PMU in the vcpu loop
> (instead of waiting for the next PMU context switch in KVM).

I'm not quite sure about 4.
The new host-level perf event must be an exclude_guest event. It should
not be scheduled when a guest is using the PMU. Why do we want to
preempt the guest PMU? The current implementation in perf doesn't
schedule any exclude_guest events when a guest is running.

Thanks,
Kan

2024-04-27 00:31:56

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-26 2:41 p.m., Mingwei Zhang wrote:
>>> So this requires vcpu->pmu has two pieces of state information: 1) the
>>> flag similar to TIF_NEED_FPU_LOAD; 2) host perf context info (phase #1
>>> just a boolean; phase #2, bitmap of occupied counters).
>>>
>>> This is a non-trivial optimization on the PMU context switch. I am
>>> thinking about splitting them into the following phases:
>>>
>>> 1) lazy PMU context switch, i.e., wait until the guest touches PMU MSR
>>> for the 1st time.
>>> 2) fast PMU context switch on KVM side, i.e., KVM checking event
>>> selector value (enable/disable) and selectively switch PMU state
>>> (reducing rd/wr msrs)
>>> 3) dynamic PMU context boundary, ie., KVM can dynamically choose PMU
>>> context switch boundary depending on existing active host-level
>>> events.
>>> 3.1) more accurate dynamic PMU context switch, ie., KVM checking
>>> host-level counter position and further reduces the number of msr
>>> accesses.
>>> 4) guest PMU context preemption, i.e., any new host-level perf
>>> profiling can immediately preempt the guest PMU in the vcpu loop
>>> (instead of waiting for the next PMU context switch in KVM).
>> I'm not quit sure about the 4.
>> The new host-level perf must be an exclude_guest event. It should not be
>> scheduled when a guest is using the PMU. Why do we want to preempt the
>> guest PMU? The current implementation in perf doesn't schedule any
>> exclude_guest events when a guest is running.
> right. The grey area is the code within the KVM_RUN loop, but
> _outside_ of the guest. This part of the code is on the "host" side.
> However, for efficiency reasons, KVM defers the PMU context switch by
> retaining the guest PMU MSR values within the loop.

I assume you mean the optimization of moving the context switch from
VM-exit/entry boundary to the vCPU boundary.

> Optimization 4
> allows the host side to immediately profiling this part instead of
> waiting for vcpu to reach to PMU context switch locations. Doing so
> will generate more accurate results.

If so, I think 4 is a must-have. Otherwise, it wouldn't honor the
definition of exclude_guest. Without 4, it brings some random blind
spots, right?

>
> Do we want to preempt that? I think it depends. For regular cloud
> usage, we don't. But for any other usages where we want to prioritize
> KVM/VMM profiling over guest vPMU, it is useful.
>
> My current opinion is that optimization 4 is something nice to have.
> But we should allow people to turn it off just like we could choose to
> disable preempt kernel.

exclude_guest means everything but the guest. I don't see a reason why
people would want to turn it off and get some random blind spots.

Thanks,
Kan

2024-04-28 06:02:34

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU

On Sat, Apr 27, 2024 at 5:59 PM Mi, Dapeng <[email protected]> wrote:
>
>
> On 4/27/2024 11:04 AM, Mingwei Zhang wrote:
> > On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <[email protected]> wrote:
> >> On Fri, Apr 26, 2024, Kan Liang wrote:
> >>>> Optimization 4
> >>>> allows the host side to immediately profiling this part instead of
> >>>> waiting for vcpu to reach to PMU context switch locations. Doing so
> >>>> will generate more accurate results.
> >>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the
> >>> definition of the exclude_guest. Without 4, it brings some random blind
> >>> spots, right?
> >> +1, I view it as a hard requirement. It's not an optimization, it's about
> >> accuracy and functional correctness.
> > Well. Does it have to be a _hard_ requirement? no? The irq handler
> > triggered by "perf record -a" could just inject a "state". Instead of
> > immediately preempting the guest PMU context, perf subsystem could
> > allow KVM defer the context switch when it reaches the next PMU
> > context switch location.
> >
> > This is the same as the preemption kernel logic. Do you want me to
> > stop the work immediately? Yes (if you enable preemption), or No, let
> > me finish my job and get to the scheduling point.
> >
> > Implementing this might be more difficult to debug. That's my real
> > concern. If we do not enable preemption, the PMU context switch will
> > only happen at the 2 pairs of locations. If we enable preemption, it
> > could happen at any time.
>
> IMO I don't prefer to add a switch to enable/disable the preemption. I
> think current implementation is already complicated enough and
> unnecessary to introduce an new parameter to confuse users. Furthermore,
> the switch could introduce an uncertainty and may mislead the perf user
> to read the perf stats incorrectly. As for debug, it won't bring any
> difference as long as no host event is created.
>
That's OK; this is about opinions and brainstorming. Adding a parameter
to disable preemption comes from the cloud usage perspective. The
conflict of opinions is about which one you prioritize: the guest PMU
or the host PMU? From the guest vPMU usage perspective, do you want
anyone on the host to fire off a profiling command and cause
turbulence? No. From the host PMU perspective, wanting to profile the
VMM/KVM, you definitely want accuracy and no delay at all.

Thanks.
-Mingwei
>
> >
> >> What _is_ an optimization is keeping guest state loaded while KVM is in its
> >> run loop, i.e. initial mediated/passthrough PMU support could land upstream with
> >> unconditional switches at entry/exit. The performance of KVM would likely be
> >> unacceptable for any production use cases, but that would give us motivation to
> >> finish the job, and it doesn't result in random, hard to diagnose issues for
> >> userspace.
> > That's true. I agree with that.
> >
> >>>> Do we want to preempt that? I think it depends. For regular cloud
> >>>> usage, we don't. But for any other usages where we want to prioritize
> >>>> KVM/VMM profiling over guest vPMU, it is useful.
> >>>>
> >>>> My current opinion is that optimization 4 is something nice to have.
> >>>> But we should allow people to turn it off just like we could choose to
> >>>> disable preempt kernel.
> >>> The exclude_guest means everything but the guest. I don't see a reason
> >>> why people want to turn it off and get some random blind spots.

2024-04-28 06:57:50

by Dapeng Mi

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU


On 4/27/2024 11:04 AM, Mingwei Zhang wrote:
> On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <[email protected]> wrote:
>> On Fri, Apr 26, 2024, Kan Liang wrote:
>>>> Optimization 4
>>>> allows the host side to immediately profiling this part instead of
>>>> waiting for vcpu to reach to PMU context switch locations. Doing so
>>>> will generate more accurate results.
>>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the
>>> definition of the exclude_guest. Without 4, it brings some random blind
>>> spots, right?
>> +1, I view it as a hard requirement. It's not an optimization, it's about
>> accuracy and functional correctness.
> Well. Does it have to be a _hard_ requirement? no? The irq handler
> triggered by "perf record -a" could just inject a "state". Instead of
> immediately preempting the guest PMU context, perf subsystem could
> allow KVM defer the context switch when it reaches the next PMU
> context switch location.
>
> This is the same as the preemption kernel logic. Do you want me to
> stop the work immediately? Yes (if you enable preemption), or No, let
> me finish my job and get to the scheduling point.
>
> Implementing this might be more difficult to debug. That's my real
> concern. If we do not enable preemption, the PMU context switch will
> only happen at the 2 pairs of locations. If we enable preemption, it
> could happen at any time.

IMO I'd prefer not to add a switch to enable/disable the preemption.
The current implementation is already complicated enough, and it's
unnecessary to introduce a new parameter that would confuse users.
Furthermore, the switch could introduce uncertainty and may mislead
perf users into reading the perf stats incorrectly. As for debugging,
it won't make any difference as long as no host event is created.


>
>> What _is_ an optimization is keeping guest state loaded while KVM is in its
>> run loop, i.e. initial mediated/passthrough PMU support could land upstream with
>> unconditional switches at entry/exit. The performance of KVM would likely be
>> unacceptable for any production use cases, but that would give us motivation to
>> finish the job, and it doesn't result in random, hard to diagnose issues for
>> userspace.
> That's true. I agree with that.
>
>>>> Do we want to preempt that? I think it depends. For regular cloud
>>>> usage, we don't. But for any other usages where we want to prioritize
>>>> KVM/VMM profiling over guest vPMU, it is useful.
>>>>
>>>> My current opinion is that optimization 4 is something nice to have.
>>>> But we should allow people to turn it off just like we could choose to
>>>> disable preempt kernel.
>>> The exclude_guest means everything but the guest. I don't see a reason
>>> why people want to turn it off and get some random blind spots.

2024-04-29 13:09:01

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-04-26 11:04 p.m., Mingwei Zhang wrote:
> On Fri, Apr 26, 2024 at 12:46 PM Sean Christopherson <[email protected]> wrote:
>>
>> On Fri, Apr 26, 2024, Kan Liang wrote:
>>>> Optimization 4
>>>> allows the host side to immediately profiling this part instead of
>>>> waiting for vcpu to reach to PMU context switch locations. Doing so
>>>> will generate more accurate results.
>>>
>>> If so, I think the 4 is a must to have. Otherwise, it wouldn't honer the
>>> definition of the exclude_guest. Without 4, it brings some random blind
>>> spots, right?
>>
>> +1, I view it as a hard requirement. It's not an optimization, it's about
>> accuracy and functional correctness.
>
> Well. Does it have to be a _hard_ requirement? no? The irq handler
> triggered by "perf record -a" could just inject a "state". Instead of
> immediately preempting the guest PMU context, perf subsystem could
> allow KVM defer the context switch when it reaches the next PMU
> context switch location.

It depends on where the upcoming PMU context switch location is.
If it's the upcoming VM-exit/entry boundary, deferring should be fine:
it's an exclude_guest event, so nothing should be counted while a VM is
running.
If it's the upcoming vCPU boundary, no. There may be several
VM-exits/entries before the upcoming vCPU switch, and we may lose some
results.
>
> This is the same as the preemption kernel logic. Do you want me to
> stop the work immediately? Yes (if you enable preemption), or No, let
> me finish my job and get to the scheduling point.

I don't think that's necessary. Just making sure that the counters are
scheduled at the upcoming VM-exit/entry boundary should be fine.
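
For instance, the check at the VM-exit boundary could be (a sketch;
everything except intel_save_pmu_context() is an assumed name):

/*
 * Sketch only: at VM-exit, switch to the host PMU only if host
 * exclude_guest events are waiting; otherwise keep the guest state
 * loaded until the vCPU boundary.
 */
static void kvm_pmu_maybe_put_guest(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (!pmu->host_events_pending)		/* assumed flag */
		return;				/* defer the switch */

	intel_save_pmu_context(vcpu);		/* from this patch */
	pmu->guest_state_loaded = false;	/* assumed flag */
}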

Thanks,
Kan
>
> Implementing this might be more difficult to debug. That's my real
> concern. If we do not enable preemption, the PMU context switch will
> only happen at the 2 pairs of locations. If we enable preemption, it
> could happen at any time.
>
>>
>> What _is_ an optimization is keeping guest state loaded while KVM is in its
>> run loop, i.e. initial mediated/passthrough PMU support could land upstream with
>> unconditional switches at entry/exit. The performance of KVM would likely be
>> unacceptable for any production use cases, but that would give us motivation to
>> finish the job, and it doesn't result in random, hard to diagnose issues for
>> userspace.
>
> That's true. I agree with that.
>
>>
>>>> Do we want to preempt that? I think it depends. For regular cloud
>>>> usage, we don't. But for any other usages where we want to prioritize
>>>> KVM/VMM profiling over guest vPMU, it is useful.
>>>>
>>>> My current opinion is that optimization 4 is something nice to have.
>>>> But we should allow people to turn it off just like we could choose to
>>>> disable preempt kernel.
>>>
>>> The exclude_guest means everything but the guest. I don't see a reason
>>> why people want to turn it off and get some random blind spots.
>

2024-05-01 21:06:18

by Liang, Kan

[permalink] [raw]
Subject: Re: [RFC PATCH 23/41] KVM: x86/pmu: Implement the save/restore of PMU state for Intel CPU



On 2024-05-01 1:43 p.m., Mingwei Zhang wrote:
> One of the things on top of the mind is that: there seems to be no way
> for the perf subsystem to express this: "no, your host-level profiling
> is not interested in profiling the KVM_RUN loop when our guest vPMU is
> actively running".

exclude_hv? Although it seems that option is not well supported on x86
for now.
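
For reference, the relevant exclusion bits in user space look like this
(a sketch):

#include <linux/perf_event.h>

struct perf_event_attr attr = {
	.size		= sizeof(struct perf_event_attr),
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.exclude_guest	= 1,	/* everything but the guest */
	.exclude_hv	= 1,	/* hypervisor excluded; not well supported on x86 */
};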

Thanks,
Kan