LinuxLists.cc - [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR

2021-03-04 11:03:38

Subject: [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR_DEPTH emulation for Arch LBR

The number of Arch LBR entries available for recording operations
is dictated by the value in MSR_ARCH_LBR_DEPTH.DEPTH. The supported
LBR depth values can be found in CPUID.(EAX=01CH, ECX=0):EAX[7:0]
and for each bit "n" set in this field, the MSR_ARCH_LBR_DEPTH.DEPTH
value of "8*(n+1)" is supported.

On a guest write to MSR_ARCH_LBR_DEPTH, all LBR entries are reset to 0.
KVM emulates the reset behavior by introducing lbr_desc->arch_lbr_reset.
KVM writes the guest requested value to the native ARCH_LBR_DEPTH MSR
(this is safe because the two values will be the same) when the Arch LBR
records MSRs are pass-through to the guest.

Signed-off-by: Like Xu <[email protected]>
---
arch/x86/kvm/vmx/pmu_intel.c | 43 ++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.h | 3 +++
2 files changed, 46 insertions(+)

diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..25d620685ae7 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -220,6 +220,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+ case MSR_ARCH_LBR_DEPTH:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
+ break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -250,6 +253,7 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
if (lbr_desc->event) {
perf_event_release_kernel(lbr_desc->event);
lbr_desc->event = NULL;
+ lbr_desc->arch_lbr_reset = false;
vcpu_to_pmu(vcpu)->event_count--;
}
}
@@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
return true;
}

+/*
+ * Check if the requested depth values is supported
+ * based on the bits [0:7] of the guest cpuid.1c.eax.
+ */
+static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
+ if (best && depth && !(depth % 8))
+ return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
+
+ return false;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
u32 msr = msr_info->index;

switch (msr) {
@@ -367,6 +387,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = pmu->global_ovf_ctrl;
return 0;
+ case MSR_ARCH_LBR_DEPTH:
+ msr_info->data = lbr_desc->records.nr;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -393,6 +416,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
u32 msr = msr_info->index;
u64 data = msr_info->data;

@@ -427,6 +451,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+ case MSR_ARCH_LBR_DEPTH:
+ if (!arch_lbr_depth_is_valid(vcpu, data))
+ return 1;
+ lbr_desc->records.nr = data;
+ lbr_desc->arch_lbr_reset = true;
+ return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -566,6 +596,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
+ lbr_desc->arch_lbr_reset = false;
}

static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -623,6 +654,15 @@ static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
}

+static void intel_pmu_arch_lbr_reset(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ /* On a software write to IA32_LBR_DEPTH, all LBR entries are reset to 0. */
+ wrmsrl(MSR_ARCH_LBR_DEPTH, lbr_desc->records.nr);
+ lbr_desc->arch_lbr_reset = false;
+}
+
static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
{
struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
@@ -654,6 +694,9 @@ static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
{
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);

+ if (unlikely(lbr_desc->arch_lbr_reset))
+ intel_pmu_arch_lbr_reset(vcpu);
+
if (lbr_desc->msr_passthrough)
return;

diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 89da5e1251f1..a32c0c95983a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -116,6 +116,9 @@ struct lbr_desc {

/* True if LBRs are marked as not intercepted in the MSR bitmap */
bool msr_passthrough;
+
+ /* Reset all LBR entries on a guest write to MSR_ARCH_LBR_DEPTH */
+ bool arch_lbr_reset;
};

/*
--
2.29.2

2021-03-04 14:34:44

by Xu, Like

[permalink] [raw]

Subject: Re: [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR_DEPTH emulation for Arch LBR

Hi Sean,

Thanks for your detailed review on the patch set.

On 2021/3/4 0:58, Sean Christopherson wrote:
> On Wed, Mar 03, 2021, Like Xu wrote:
>> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
>> return true;
>> }
>>
>> +/*
>> + * Check if the requested depth values is supported
>> + * based on the bits [0:7] of the guest cpuid.1c.eax.
>> + */
>> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
>> +{
>> + struct kvm_cpuid_entry2 *best;
>> +
>> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
>> + if (best && depth && !(depth % 8))
> This is still wrong, it fails to weed out depth > 64.

How come ? The testcases depth = {65, 127, 128} get #GP as expected.

>
> Not that this is a hot path, but it's probably worth double checking that the
> compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".

Emm, the "%" operation is quite normal over kernel code.

if (best && depth && !(depth % 8))
   10659:       48 85 c0                test   rax,rax
   1065c:       74 c7                   je     10625 <intel_pmu_set_msr+0x65>
   1065e:       4d 85 e4                test   r12,r12
   10661:       74 c2                   je     10625 <intel_pmu_set_msr+0x65>
   10663:       41 f6 c4 07             test   r12b,0x7
   10667:       75 bc                   jne    10625 <intel_pmu_set_msr+0x65>

It looks like the compiler does the right thing.
Do you see the room for optimization ？

>
>> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
>> +
>> + return false;
>> +}
>> +

2021-03-04 18:11:10

by Sean Christopherson

[permalink] [raw]

Subject: Re: [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR_DEPTH emulation for Arch LBR

On Thu, Mar 04, 2021, Xu, Like wrote:
> Hi Sean,
>
> Thanks for your detailed review on the patch set.
>
> On 2021/3/4 0:58, Sean Christopherson wrote:
> > On Wed, Mar 03, 2021, Like Xu wrote:
> > > @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
> > > return true;
> > > }
> > > +/*
> > > + * Check if the requested depth values is supported
> > > + * based on the bits [0:7] of the guest cpuid.1c.eax.
> > > + */
> > > +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
> > > +{
> > > + struct kvm_cpuid_entry2 *best;
> > > +
> > > + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
> > > + if (best && depth && !(depth % 8))
> > This is still wrong, it fails to weed out depth > 64.
>
> How come ? The testcases depth = {65, 127, 128} get #GP as expected.

@depth is a u64, throw in a number that is a multiple of 8 and >= 520, and the
"(1ULL << (depth / 8 - 1))" will trigger undefined behavior due to shifting
beyond the capacity of a ULL / u64.

Adding the "< 64" check would also allow dropping the " & 0xff" since the check
would ensure the shift doesn't go beyond bit 7. I'm not sure the cleverness is
worth shaving a cycle, though.

> > Not that this is a hot path, but it's probably worth double checking that the
> > compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
>
> Emm, the "%" operation is quite normal over kernel code.

So is "&" :-) I was just pointing out that the compiler should optimize this,
and it did.

> if (best && depth && !(depth % 8))
>    10659:       48 85 c0                test   rax,rax
>    1065c:       74 c7                   je     10625 <intel_pmu_set_msr+0x65>
>    1065e:       4d 85 e4                test   r12,r12
>    10661:       74 c2                   je     10625 <intel_pmu_set_msr+0x65>
>    10663:       41 f6 c4 07             test   r12b,0x7
>    10667:       75 bc                   jne    10625 <intel_pmu_set_msr+0x65>
>
> It looks like the compiler does the right thing.
> Do you see the room for optimization ？
>
> >
> > > + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));

Actually, looking at this again, I would explicitly use BIT() instead of 1ULL
(or BIT_ULL), since the shift must be 7 or less.

> > > +
> > > + return false;
> > > +}
> > > +
>

2021-03-04 23:26:30

by Sean Christopherson

[permalink] [raw]

Subject: Re: [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR_DEPTH emulation for Arch LBR

On Wed, Mar 03, 2021, Like Xu wrote:
> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
> return true;
> }
>
> +/*
> + * Check if the requested depth values is supported
> + * based on the bits [0:7] of the guest cpuid.1c.eax.
> + */
> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
> +{
> + struct kvm_cpuid_entry2 *best;
> +
> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
> + if (best && depth && !(depth % 8))

This is still wrong, it fails to weed out depth > 64.

Not that this is a hot path, but it's probably worth double checking that the
compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".

> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
> +
> + return false;
> +}
> +

2021-03-05 02:38:13

by Xu, Like

[permalink] [raw]

Subject: Re: [PATCH v3 5/9] KVM: vmx/pmu: Add MSR_ARCH_LBR_DEPTH emulation for Arch LBR

On 2021/3/5 0:12, Sean Christopherson wrote:
> On Thu, Mar 04, 2021, Xu, Like wrote:
>> Hi Sean,
>>
>> Thanks for your detailed review on the patch set.
>>
>> On 2021/3/4 0:58, Sean Christopherson wrote:
>>> On Wed, Mar 03, 2021, Like Xu wrote:
>>>> @@ -348,10 +352,26 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
>>>> return true;
>>>> }
>>>> +/*
>>>> + * Check if the requested depth values is supported
>>>> + * based on the bits [0:7] of the guest cpuid.1c.eax.
>>>> + */
>>>> +static bool arch_lbr_depth_is_valid(struct kvm_vcpu *vcpu, u64 depth)
>>>> +{
>>>> + struct kvm_cpuid_entry2 *best;
>>>> +
>>>> + best = kvm_find_cpuid_entry(vcpu, 0x1c, 0);
>>>> + if (best && depth && !(depth % 8))
>>> This is still wrong, it fails to weed out depth > 64.
>> How come ? The testcases depth = {65, 127, 128} get #GP as expected.
> @depth is a u64, throw in a number that is a multiple of 8 and >= 520, and the
> "(1ULL << (depth / 8 - 1))" will trigger undefined behavior due to shifting
> beyond the capacity of a ULL / u64.

Extra:

when we say "undefined behavior" if shifting beyond the capacity of a ULL,
do you mean that the actual behavior depends on the machine, architecture
or compiler?

>
> Adding the "< 64" check would also allow dropping the " & 0xff" since the check
> would ensure the shift doesn't go beyond bit 7. I'm not sure the cleverness is
> worth shaving a cycle, though.

Finally how about:

    if (best && depth && (depth < 65) && !(depth & 7))
        return best->eax & BIT_ULL(depth / 8 - 1);

    return false;

Do you see the room for optimization ？

>
>>> Not that this is a hot path, but it's probably worth double checking that the
>>> compiler generates simple code for "depth % 8", e.g. it can be "depth & 7)".
>> Emm, the "%" operation is quite normal over kernel code.
> So is "&" :-) I was just pointing out that the compiler should optimize this,
> and it did.
>
>> if (best && depth && !(depth % 8))
>>    10659:       48 85 c0                test   rax,rax
>>    1065c:       74 c7                   je     10625 <intel_pmu_set_msr+0x65>
>>    1065e:       4d 85 e4                test   r12,r12
>>    10661:       74 c2                   je     10625 <intel_pmu_set_msr+0x65>
>>    10663:       41 f6 c4 07             test   r12b,0x7
>>    10667:       75 bc                   jne    10625 <intel_pmu_set_msr+0x65>
>>
>> It looks like the compiler does the right thing.
>> Do you see the room for optimization ？
>>
>>>> + return (best->eax & 0xff) & (1ULL << (depth / 8 - 1));
> Actually, looking at this again, I would explicitly use BIT() instead of 1ULL
> (or BIT_ULL), since the shift must be 7 or less.
>
>>>> +
>>>> + return false;
>>>> +}
>>>> +