Arch LBRs are enabled by setting MSR_ARCH_LBR_CTL.LBREn to 1. A new guest
state field named "Guest IA32_LBR_CTL" is added to enhance guest LBR usage.
When guest Arch LBR is enabled, a guest LBR event will be created like the
model-specific LBR does.
On processors that support Arch LBR, MSR_IA32_DEBUGCTLMSR[bit 0] has no
meaning. It can be written to 0 or 1, but reads will always return 0. On
the vmx_vcpu_reset(), the IA32_LBR_CTL will be cleared to 0.
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/include/asm/vmx.h | 2 ++
arch/x86/kvm/vmx/pmu_intel.c | 27 ++++++++++++++++++++++-----
arch/x86/kvm/vmx/vmx.c | 9 +++++++++
3 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 358707f60d99..8ec7bc24b37a 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -245,6 +245,8 @@ enum vmcs_field {
GUEST_BNDCFGS_HIGH = 0x00002813,
GUEST_IA32_RTIT_CTL = 0x00002814,
GUEST_IA32_RTIT_CTL_HIGH = 0x00002815,
+ GUEST_IA32_LBR_CTL = 0x00002816,
+ GUEST_IA32_LBR_CTL_HIGH = 0x00002817,
HOST_IA32_PAT = 0x00002c00,
HOST_IA32_PAT_HIGH = 0x00002c01,
HOST_IA32_EFER = 0x00002c02,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 25d620685ae7..d14a14eb712d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -19,6 +19,7 @@
#include "pmu.h"
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+#define KVM_ARCH_LBR_CTL_MASK 0x7f000f
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
@@ -221,6 +222,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = pmu->version > 1;
break;
case MSR_ARCH_LBR_DEPTH:
+ case MSR_ARCH_LBR_CTL:
ret = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
break;
default:
@@ -390,6 +392,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_ARCH_LBR_DEPTH:
msr_info->data = lbr_desc->records.nr;
return 0;
+ case MSR_ARCH_LBR_CTL:
+ msr_info->data = vmcs_read64(GUEST_IA32_LBR_CTL);
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -457,6 +462,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
lbr_desc->records.nr = data;
lbr_desc->arch_lbr_reset = true;
return 0;
+ case MSR_ARCH_LBR_CTL:
+ if (!(data & ~KVM_ARCH_LBR_CTL_MASK)) {
+ vmcs_write64(GUEST_IA32_LBR_CTL, data);
+ if (intel_pmu_lbr_is_enabled(vcpu) && !lbr_desc->event &&
+ (data & ARCH_LBR_CTL_LBREN))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ }
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -635,12 +649,15 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u32 lbr_ctl_field = GUEST_IA32_DEBUGCTL;
- if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
- data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
- }
+ if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI))
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
+ lbr_ctl_field = GUEST_IA32_LBR_CTL;
+
+ vmcs_write64(lbr_ctl_field, vmcs_read64(lbr_ctl_field) & ~BIT(0));
}
static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6d7e760fdfa0..a0660b9934c6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2036,6 +2036,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+ /*
+ * For Arch LBR, IA32_DEBUGCTL[bit 0] has no meaning.
+ * It can be written to 0 or 1, but reads will always return 0.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
+ data &= ~DEBUGCTLMSR_LBR;
+
vmcs_write64(GUEST_IA32_DEBUGCTL, data);
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
@@ -4463,6 +4470,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ if (cpu_has_vmx_arch_lbr())
+ vmcs_write64(GUEST_IA32_LBR_CTL, 0);
}
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
--
2.29.2
On Wed, Mar 03, 2021, Like Xu wrote:
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 25d620685ae7..d14a14eb712d 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -19,6 +19,7 @@
> #include "pmu.h"
>
> #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
> +#define KVM_ARCH_LBR_CTL_MASK 0x7f000f
It would nice to build this up with the individual bits instead of tossing in a
magic number.
> static struct kvm_event_hw_type_mapping intel_arch_events[] = {
> /* Index must match CPUID 0x0A.EBX bit vector */
> @@ -221,6 +222,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
> ret = pmu->version > 1;
> break;
> case MSR_ARCH_LBR_DEPTH:
> + case MSR_ARCH_LBR_CTL:
> ret = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
> break;
> default:
> @@ -390,6 +392,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_ARCH_LBR_DEPTH:
> msr_info->data = lbr_desc->records.nr;
> return 0;
> + case MSR_ARCH_LBR_CTL:
> + msr_info->data = vmcs_read64(GUEST_IA32_LBR_CTL);
> + return 0;
> default:
> if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
> (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
> @@ -457,6 +462,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> lbr_desc->records.nr = data;
> lbr_desc->arch_lbr_reset = true;
> return 0;
> + case MSR_ARCH_LBR_CTL:
> + if (!(data & ~KVM_ARCH_LBR_CTL_MASK)) {
Maybe invert this to reduce indentation?
if (data & ...)
break; (or "return 1;")
> + vmcs_write64(GUEST_IA32_LBR_CTL, data);
> + if (intel_pmu_lbr_is_enabled(vcpu) && !lbr_desc->event &&
> + (data & ARCH_LBR_CTL_LBREN))
Alignment.
> + intel_pmu_create_guest_lbr_event(vcpu);
> + return 0;
> + }
> + break;
> default:
> if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
> (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
> @@ -635,12 +649,15 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
> */
> static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
> {
> - u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
> + u32 lbr_ctl_field = GUEST_IA32_DEBUGCTL;
>
> - if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
> - data &= ~DEBUGCTLMSR_LBR;
> - vmcs_write64(GUEST_IA32_DEBUGCTL, data);
> - }
> + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI))
> + return;
> +
> + if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
> + lbr_ctl_field = GUEST_IA32_LBR_CTL;
> +
> + vmcs_write64(lbr_ctl_field, vmcs_read64(lbr_ctl_field) & ~BIT(0));
Use ARCH_LBR_CTL_LBREN?
> }
>
> static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 6d7e760fdfa0..a0660b9934c6 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2036,6 +2036,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> VM_EXIT_SAVE_DEBUG_CONTROLS)
> get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>
> + /*
> + * For Arch LBR, IA32_DEBUGCTL[bit 0] has no meaning.
> + * It can be written to 0 or 1, but reads will always return 0.
> + */
> + if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
> + data &= ~DEBUGCTLMSR_LBR;
> +
> vmcs_write64(GUEST_IA32_DEBUGCTL, data);
> if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
> (data & DEBUGCTLMSR_LBR))
> @@ -4463,6 +4470,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> vmcs_writel(GUEST_SYSENTER_ESP, 0);
> vmcs_writel(GUEST_SYSENTER_EIP, 0);
> vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
> + if (cpu_has_vmx_arch_lbr())
> + vmcs_write64(GUEST_IA32_LBR_CTL, 0);
Not that any guest is likely to care, but is the MSR cleared on INIT? The SDM
has specific language for warm reset, but I can't find anything for INIT.
On a warm reset, all LBR MSRs, including IA32_LBR_DEPTH, have their values
preserved. However, IA32_LBR_CTL.LBREn is cleared to 0, disabling LBRs. If a
warm reset is triggered while the processor is in C6, also known as warm init,
all LBR MSRs will be reset to their initial values.
> }
>
> kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
> --
> 2.29.2
>
On 2021/3/4 1:19, Sean Christopherson wrote:
> On Wed, Mar 03, 2021, Like Xu wrote:
>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>> index 25d620685ae7..d14a14eb712d 100644
>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>> @@ -19,6 +19,7 @@
>> #include "pmu.h"
>>
>> #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
>> +#define KVM_ARCH_LBR_CTL_MASK 0x7f000f
> It would nice to build this up with the individual bits instead of tossing in a
> magic number.
I will move the mask ARCH_LBR_CTL_MASK from lbr.c to msr-index.h and thus,
#define KVM_ARCH_LBR_CTL_MASK (ARCH_LBR_CTL_MASK | ARCH_LBR_CTL_LBREN)
I assume the move operation would be a separate patch for host side.
>
>> static struct kvm_event_hw_type_mapping intel_arch_events[] = {
>> /* Index must match CPUID 0x0A.EBX bit vector */
>> @@ -221,6 +222,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
>> ret = pmu->version > 1;
>> break;
>> case MSR_ARCH_LBR_DEPTH:
>> + case MSR_ARCH_LBR_CTL:
>> ret = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
>> break;
>> default:
>> @@ -390,6 +392,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> case MSR_ARCH_LBR_DEPTH:
>> msr_info->data = lbr_desc->records.nr;
>> return 0;
>> + case MSR_ARCH_LBR_CTL:
>> + msr_info->data = vmcs_read64(GUEST_IA32_LBR_CTL);
>> + return 0;
>> default:
>> if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
>> (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
>> @@ -457,6 +462,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> lbr_desc->records.nr = data;
>> lbr_desc->arch_lbr_reset = true;
>> return 0;
>> + case MSR_ARCH_LBR_CTL:
>> + if (!(data & ~KVM_ARCH_LBR_CTL_MASK)) {
> Maybe invert this to reduce indentation?
>
> if (data & ...)
> break; (or "return 1;")
Fine to me.
>
>
>> + vmcs_write64(GUEST_IA32_LBR_CTL, data);
>> + if (intel_pmu_lbr_is_enabled(vcpu) && !lbr_desc->event &&
>> + (data & ARCH_LBR_CTL_LBREN))
> Alignment.
I'll fix it.
>
>> + intel_pmu_create_guest_lbr_event(vcpu);
>> + return 0;
>> + }
>> + break;
>> default:
>> if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
>> (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
>> @@ -635,12 +649,15 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
>> */
>> static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
>> {
>> - u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
>> + u32 lbr_ctl_field = GUEST_IA32_DEBUGCTL;
>>
>> - if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
>> - data &= ~DEBUGCTLMSR_LBR;
>> - vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>> - }
>> + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI))
>> + return;
>> +
>> + if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
>> + lbr_ctl_field = GUEST_IA32_LBR_CTL;
>> +
>> + vmcs_write64(lbr_ctl_field, vmcs_read64(lbr_ctl_field) & ~BIT(0));
> Use ARCH_LBR_CTL_LBREN?
I'm trying to unify the usage of EN bit for both Arch LBR and legacy LBR:
#define ARCH_LBR_CTL_LBREN BIT(0)
#define DEBUGCTLMSR_LBR (1UL << 0)
Any suggestion ?
>
>> }
>>
>> static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 6d7e760fdfa0..a0660b9934c6 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -2036,6 +2036,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>> VM_EXIT_SAVE_DEBUG_CONTROLS)
>> get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>>
>> + /*
>> + * For Arch LBR, IA32_DEBUGCTL[bit 0] has no meaning.
>> + * It can be written to 0 or 1, but reads will always return 0.
>> + */
>> + if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
>> + data &= ~DEBUGCTLMSR_LBR;
>> +
>> vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>> if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
>> (data & DEBUGCTLMSR_LBR))
>> @@ -4463,6 +4470,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>> vmcs_writel(GUEST_SYSENTER_ESP, 0);
>> vmcs_writel(GUEST_SYSENTER_EIP, 0);
>> vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>> + if (cpu_has_vmx_arch_lbr())
>> + vmcs_write64(GUEST_IA32_LBR_CTL, 0);
> Not that any guest is likely to care, but is the MSR cleared on INIT? The SDM
> has specific language for warm reset, but I can't find anything for INIT.
>
> On a warm reset, all LBR MSRs, including IA32_LBR_DEPTH, have their values
> preserved. However, IA32_LBR_CTL.LBREn is cleared to 0, disabling LBRs. If a
> warm reset is triggered while the processor is in C6, also known as warm init,
> all LBR MSRs will be reset to their initial values.
I was told that the reset behavior of GUEST_IA32_LBR_CTL
would be the same as the GUEST_IA32_DEBUGCTL (true for INIT as well).
It looks we have not strictly distinguished the guest's power concept C*.
Do we have two trap paths for "warm reset" and "warm init" ?
>
>> }
>>
>> kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
>> --
>> 2.29.2
>>
On Thu, Mar 04, 2021, Xu, Like wrote:
> On 2021/3/4 1:19, Sean Christopherson wrote:
> > > @@ -4463,6 +4470,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> > > vmcs_writel(GUEST_SYSENTER_ESP, 0);
> > > vmcs_writel(GUEST_SYSENTER_EIP, 0);
> > > vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
> > > + if (cpu_has_vmx_arch_lbr())
> > > + vmcs_write64(GUEST_IA32_LBR_CTL, 0);
> > Not that any guest is likely to care, but is the MSR cleared on INIT? The SDM
> > has specific language for warm reset, but I can't find anything for INIT.
> >
> > On a warm reset, all LBR MSRs, including IA32_LBR_DEPTH, have their values
> > preserved. However, IA32_LBR_CTL.LBREn is cleared to 0, disabling LBRs. If a
> > warm reset is triggered while the processor is in C6, also known as warm init,
> > all LBR MSRs will be reset to their initial values.
>
> I was told that the reset behavior of GUEST_IA32_LBR_CTL
> would be the same as the GUEST_IA32_DEBUGCTL (true for INIT as well).
Yes, and DEBUGCTL is preserved on INIT.
if (!init_event) {
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
}
Table 22-10 in the SDM:
All Other MSRs | Pwr up or Reset: | INIT:
Undefined Unchanged
If IA32_LBR_DEPTH is weirdly exempt, it needs to be documented. I doubt that's
the case though.
> It looks we have not strictly distinguished the guest's power concept C*.
> Do we have two trap paths for "warm reset" and "warm init" ?
No. Despite the name .vcpu_reset, KVM doesn't even have a RESET path, userspace
is responsible for modelling RESET.