The number of Arch LBR entries available for recording operations
is dictated by the value in MSR_ARCH_LBR_DEPTH.DEPTH. The supported
LBR depth values can be found in CPUID.(EAX=01CH, ECX=0):EAX[7:0]
and for each bit "n" set in this field, the MSR_ARCH_LBR_DEPTH.DEPTH
value of "8*(n+1)" is supported.
On a guest write to MSR_ARCH_LBR_DEPTH, all LBR entries are reset to 0.
KVM emulates the reset behavior by introducing lbr_desc->arch_lbr_reset.
KVM writes the guest requested value to the native ARCH_LBR_DEPTH MSR
(this is safe because the two values will be the same) when the Arch LBR
records MSRs are pass-through to the guest.
Signed-off-by: Like Xu <[email protected]>
---
arch/x86/kvm/vmx/pmu_intel.c | 43 ++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.h | 3 +++
2 files changed, 46 insertions(+)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..25d620685ae7 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -220,6 +220,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_ARCH_LBR_DEPTH:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -250,6 +253,7 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
if (lbr_desc->event) {
perf_event_release_kernel(lbr_desc->event);
lbr_desc->event = NULL;
+ lbr_desc->arch_lbr_reset = false;
vcpu_to_pmu(vcpu)->event_count--;
}
}
@@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
return true;
}
+/*
+ * Check if the requested depth values is supported
+ * based on the bits [0:7] of the guest cpuid.1c.eax.
+ */
+static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
+ if (best && depth && !(depth % 8))
+ return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
+
+ return false;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
u32 msr = msr_info->index;
switch (msr) {
@@ -367,6 +387,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_ARCH_LBR_DEPTH:
+ msr_info->data = lbr_desc->records.nr;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -393,6 +416,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -427,6 +451,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_ARCH_LBR_DEPTH:
+ if (!arch_lbr_depth_is_valid(vcpu, data))
+ return 1;
+ lbr_desc->records.nr = data;
+ lbr_desc->arch_lbr_reset = true;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -566,6 +596,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
+ lbr_desc->arch_lbr_reset = false;
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -623,6 +654,15 @@ static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
}
+static void intel_pmu_arch_lbr_reset(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ /* On a software write to IA32_LBR_DEPTH, all LBR entries are reset to 0. */
+ wrmsrl(MSR_ARCH_LBR_DEPTH, lbr_desc->records.nr);
+ lbr_desc->arch_lbr_reset = false;
+}
+
static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
{
struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
@@ -654,6 +694,9 @@ static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
{
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (unlikely(lbr_desc->arch_lbr_reset))
+ intel_pmu_arch_lbr_reset(vcpu);
+
if (lbr_desc->msr_passthrough)
return;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 89da5e1251f1..a32c0c95983a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -116,6 +116,9 @@ struct lbr_desc {
/* True if LBRs are marked as not intercepted in the MSR bitmap */
bool msr_passthrough;
+
+ /* Reset all LBR entries on a guest write to MSR_ARCH_LBR_DEPTH */
+ bool arch_lbr_reset;
};
/*
--
2.29.2
Hi Sean,
Thanks for your detailed review on the patch set.
On 2021/3/4 0:58, Sean Christopherson wrote:
> On Wed, Mar 03, 2021, Like Xu wrote:
>> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
>> return true;
>> }
>>
>> +/*
>> + * Check if the requested depth values is supported
>> + * based on the bits [0:7] of the guest cpuid.1c.eax.
>> + */
>> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
>> +{
>> + struct kvm_cpuid_entry2 *best;
>> +
>> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
>> + if (best && depth && !(depth % 8))
> This is still wrong, it fails to weed out depth > 64.
How come ? The testcases depth = {65, 127, 128} get #GP as expected.
>
> Not that this is a hot path, but it's probably worth double checking that the
> compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
Emm, the "%" operation is quite normal over kernel code.
if (best && depth && !(depth % 8))
10659: 48 85 c0 test rax,rax
1065c: 74 c7 je 10625 <intel_pmu_set_msr+0x65>
1065e: 4d 85 e4 test r12,r12
10661: 74 c2 je 10625 <intel_pmu_set_msr+0x65>
10663: 41 f6 c4 07 test r12b,0x7
10667: 75 bc jne 10625 <intel_pmu_set_msr+0x65>
It looks like the compiler does the right thing.
Do you see the room for optimization ?
>
>> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
>> +
>> + return false;
>> +}
>> +
On Thu, Mar 04, 2021, Xu, Like wrote:
> Hi Sean,
>
> Thanks for your detailed review on the patch set.
>
> On 2021/3/4 0:58, Sean Christopherson wrote:
> > On Wed, Mar 03, 2021, Like Xu wrote:
> > > @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
> > > return true;
> > > }
> > > +/*
> > > + * Check if the requested depth values is supported
> > > + * based on the bits [0:7] of the guest cpuid.1c.eax.
> > > + */
> > > +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
> > > +{
> > > + struct kvm_cpuid_entry2 *best;
> > > +
> > > + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
> > > + if (best && depth && !(depth % 8))
> > This is still wrong, it fails to weed out depth > 64.
>
> How come ? The testcases depth = {65, 127, 128} get #GP as expected.
@depth is a u64, throw in a number that is a multiple of 8 and >= 520, and the
"(1ULL << (depth / 8 - 1))" will trigger undefined behavior due to shifting
beyond the capacity of a ULL / u64.
Adding the "< 64" check would also allow dropping the " & 0xff" since the check
would ensure the shift doesn't go beyond bit 7. I'm not sure the cleverness is
worth shaving a cycle, though.
> > Not that this is a hot path, but it's probably worth double checking that the
> > compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
>
> Emm, the "%" operation is quite normal over kernel code.
So is "&" :-) I was just pointing out that the compiler should optimize this,
and it did.
> if (best && depth && !(depth % 8))
> 10659: 48 85 c0 test rax,rax
> 1065c: 74 c7 je 10625 <intel_pmu_set_msr+0x65>
> 1065e: 4d 85 e4 test r12,r12
> 10661: 74 c2 je 10625 <intel_pmu_set_msr+0x65>
> 10663: 41 f6 c4 07 test r12b,0x7
> 10667: 75 bc jne 10625 <intel_pmu_set_msr+0x65>
>
> It looks like the compiler does the right thing.
> Do you see the room for optimization ?
>
> >
> > > + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
Actually, looking at this again, I would explicitly use BIT() instead of 1ULL
(or BIT_ULL), since the shift must be 7 or less.
> > > +
> > > + return false;
> > > +}
> > > +
>
On Wed, Mar 03, 2021, Like Xu wrote:
> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
> return true;
> }
>
> +/*
> + * Check if the requested depth values is supported
> + * based on the bits [0:7] of the guest cpuid.1c.eax.
> + */
> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
> +{
> + struct kvm_cpuid_entry2 *best;
> +
> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
> + if (best && depth && !(depth % 8))
This is still wrong, it fails to weed out depth > 64.
Not that this is a hot path, but it's probably worth double checking that the
compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
> +
> + return false;
> +}
> +
On 2021/3/5 0:12, Sean Christopherson wrote:
> On Thu, Mar 04, 2021, Xu, Like wrote:
>> Hi Sean,
>>
>> Thanks for your detailed review on the patch set.
>>
>> On 2021/3/4 0:58, Sean Christopherson wrote:
>>> On Wed, Mar 03, 2021, Like Xu wrote:
>>>> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
>>>> return true;
>>>> }
>>>> +/*
>>>> + * Check if the requested depth values is supported
>>>> + * based on the bits [0:7] of the guest cpuid.1c.eax.
>>>> + */
>>>> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
>>>> +{
>>>> + struct kvm_cpuid_entry2 *best;
>>>> +
>>>> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
>>>> + if (best && depth && !(depth % 8))
>>> This is still wrong, it fails to weed out depth > 64.
>> How come ? The testcases depth = {65, 127, 128} get #GP as expected.
> @depth is a u64, throw in a number that is a multiple of 8 and >= 520, and the
> "(1ULL << (depth / 8 - 1))" will trigger undefined behavior due to shifting
> beyond the capacity of a ULL / u64.
Extra:
when we say "undefined behavior" if shifting beyond the capacity of a ULL,
do you mean that the actual behavior depends on the machine, architecture
or compiler?
>
> Adding the "< 64" check would also allow dropping the " & 0xff" since the check
> would ensure the shift doesn't go beyond bit 7. I'm not sure the cleverness is
> worth shaving a cycle, though.
Finally how about:
if (best && depth && (depth < 65) && !(depth & 7))
return best->eax & BIT_ULL(depth / 8 - 1);
return false;
Do you see the room for optimization ?
>
>>> Not that this is a hot path, but it's probably worth double checking that the
>>> compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
>> Emm, the "%" operation is quite normal over kernel code.
> So is "&" :-) I was just pointing out that the compiler should optimize this,
> and it did.
>
>> if (best && depth && !(depth % 8))
>> 10659: 48 85 c0 test rax,rax
>> 1065c: 74 c7 je 10625 <intel_pmu_set_msr+0x65>
>> 1065e: 4d 85 e4 test r12,r12
>> 10661: 74 c2 je 10625 <intel_pmu_set_msr+0x65>
>> 10663: 41 f6 c4 07 test r12b,0x7
>> 10667: 75 bc jne 10625 <intel_pmu_set_msr+0x65>
>>
>> It looks like the compiler does the right thing.
>> Do you see the room for optimization ?
>>
>>>> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
> Actually, looking at this again, I would explicitly use BIT() instead of 1ULL
> (or BIT_ULL), since the shift must be 7 or less.
>
>>>> +
>>>> + return false;
>>>> +}
>>>> +