- Add a VCPU ioctl that can control which events the guest can monitor.
Signed-off-by: ehankland <[email protected]>
---
Some events can provide a guest with information about other guests or the
host (e.g. L3 cache stats); providing the capability to restrict access
to a "safe" set of events would limit the potential for the PMU to be used
in side-channel attacks. This change introduces a new vcpu ioctl that
sets an event whitelist. If the guest attempts to program a counter for
any unwhitelisted event, the kernel counter won't be created, so any
RDPMC/RDMSR will show 0 instances of that event.
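
For illustration, a minimal userspace caller might look like the
following (an editor's sketch, not part of the patch; it assumes the
patched uapi headers are installed and elides error handling):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    /* Whitelist a single event: instructions retired
     * (eventsel 0xc0, umask 0x00), encoded as (umask << 8) | eventsel. */
    static int set_whitelist(int vcpu_fd)
    {
        struct kvm_pmu_whitelist *wl;

        wl = calloc(1, sizeof(*wl) + sizeof(wl->events[0]));
        wl->event_mask = 0xffffull; /* eventsel bits 7:0, umask bits 15:8 */
        wl->num_events = 1;
        wl->events[0] = 0x00c0;
        return ioctl(vcpu_fd, KVM_SET_PMU_WHITELIST, wl);
    }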
---
Documentation/virtual/kvm/api.txt | 16 +++++++++++
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/uapi/asm/kvm.h | 7 +++++
arch/x86/kvm/pmu.c | 44 +++++++++++++++++++++++++++++++
arch/x86/kvm/pmu.h | 3 +++
arch/x86/kvm/pmu_amd.c | 16 +++++++++++
arch/x86/kvm/vmx/pmu_intel.c | 16 +++++++++++
arch/x86/kvm/x86.c | 7 +++++
include/uapi/linux/kvm.h | 4 +++
9 files changed, 114 insertions(+)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index ba6c42c576dd..79cbe7339145 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4065,6 +4065,22 @@ KVM_ARM_VCPU_FINALIZE call.
See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization
using this ioctl.
+4.120 KVM_SET_PMU_WHITELIST
+
+Capability: KVM_CAP_PMU_WHITELIST
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_pmu_whitelist (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_pmu_whitelist {
+ __u64 event_mask;
+ __u16 num_events;
+ __u64 events[0];
+};
+This ioctl restricts the set of PMU events that the guest can program to the
+set of whitelisted events.
+
5. The kvm_run structure
------------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 450d69a1e6fa..942647475999 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -477,6 +477,7 @@ struct kvm_pmu {
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
struct irq_work irq_work;
u64 reprogram_pmi;
+ struct kvm_pmu_whitelist *whitelist;
};
struct kvm_pmu_ops;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 7a0e64ccd6ff..2633b48b75cd 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -421,4 +421,11 @@ struct kvm_nested_state {
__u8 data[0];
};
+/* for KVM_SET_PMU_WHITELIST */
+struct kvm_pmu_whitelist {
+ __u64 event_mask;
+ __u16 num_events;
+ __u64 events[0];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e39741997893..d0d81cd3626d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -101,6 +101,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
bool exclude_kernel, bool intr,
bool in_tx, bool in_tx_cp)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ int i;
+ u64 event_config;
struct perf_event *event;
struct perf_event_attr attr = {
.type = type,
@@ -127,6 +130,19 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
attr.config |= HSW_IN_TX_CHECKPOINTED;
}
+ if (pmu->whitelist) {
+ event_config = attr.config;
+ if (type == PERF_TYPE_HARDWARE)
+ event_config = kvm_x86_ops->pmu_ops->get_event_code(
+ attr.config);
+ event_config &= pmu->whitelist->event_mask;
+ for (i = 0; i < pmu->whitelist->num_events; i++)
+ if (event_config == pmu->whitelist->events[i])
+ break;
+ if (i == pmu->whitelist->num_events)
+ return;
+ }
+
event = perf_event_create_kernel_counter(&attr, -1, current,
intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
@@ -244,6 +260,34 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
+int kvm_vcpu_ioctl_set_pmu_whitelist(struct kvm_vcpu *vcpu,
+ struct kvm_pmu_whitelist __user *whtlst)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmu_whitelist *old = pmu->whitelist;
+ struct kvm_pmu_whitelist *new = NULL;
+ struct kvm_pmu_whitelist tmp;
+ int r;
+ size_t size;
+
+ r = -EFAULT;
+ if (copy_from_user(&tmp, whtlst, sizeof(struct kvm_pmu_whitelist)))
+ goto err;
+
+ size = sizeof(tmp) + sizeof(tmp.events[0]) * tmp.num_events;
+ new = kvzalloc(size, GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!new)
+ goto err;
+ pmu->whitelist = new;
+
+ kvfree(old);
+ return 0;
+err:
+ kvfree(new);
+ return r;
+}
+
bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
switch (pmc_idx) {
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ba8898e1a854..7d4a37fcb043 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -23,6 +23,7 @@ struct kvm_pmu_ops {
unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
u8 unit_mask);
unsigned (*find_fixed_event)(int idx);
+ u64 (*get_event_code)(unsigned event_type);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
@@ -108,6 +109,8 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_vcpu_ioctl_set_pmu_whitelist(struct kvm_vcpu *vcpu,
+ struct kvm_pmu_whitelist __user *whtlst);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 1495a735b38e..1d977b20c863 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -145,6 +145,21 @@ static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
return amd_event_mapping[i].event_type;
}
+static u64 amd_get_event_code(unsigned event_type)
+{
+ int i;
+ u64 event_code = 0;
+
+ for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+ if (amd_event_mapping[i].event_type == event_type) {
+ event_code = amd_event_mapping[i].eventsel |
+ ((u64)amd_event_mapping[i].unit_mask << 8);
+ break;
+ }
+
+ return event_code;
+}
+
/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
static unsigned amd_find_fixed_event(int idx)
{
@@ -306,6 +321,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu_ops amd_pmu_ops = {
.find_arch_event = amd_find_arch_event,
.find_fixed_event = amd_find_fixed_event,
+ .get_event_code = amd_get_event_code,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index f8502c376b37..01a5d5a3bf3d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -67,6 +67,21 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
+static u64 intel_get_event_code(unsigned event_type)
+{
+ int i;
+ u64 event_code = 0;
+
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+ if (intel_arch_events[i].event_type == event_type) {
+ event_code = intel_arch_events[i].eventsel |
+ ((u64) intel_arch_events[i].unit_mask << 8);
+ break;
+ }
+
+ return event_code;
+}
+
static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
u8 event_select,
u8 unit_mask)
@@ -350,6 +365,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu_ops intel_pmu_ops = {
.find_arch_event = intel_find_arch_event,
+ .get_event_code = intel_get_event_code,
.find_fixed_event = intel_find_fixed_event,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 536b78c4af6e..089de23289f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3089,6 +3089,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_PMU_WHITELIST:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -4285,6 +4286,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_SET_PMU_WHITELIST: {
+ struct kvm_pmu_whitelist __user *whitelist = argp;
+
+ r = kvm_vcpu_ioctl_set_pmu_whitelist(vcpu, whitelist);
+ goto out;
+ }
default:
r = -EINVAL;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2fe12b40d503..140a6ac52981 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -993,6 +993,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ARM_SVE 170
#define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
#define KVM_CAP_ARM_PTRAUTH_GENERIC 172
+#define KVM_CAP_PMU_WHITELIST 173
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1451,6 +1452,9 @@ struct kvm_enc_region {
/* Available with KVM_CAP_ARM_SVE */
#define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int)
+/* Available with KVM_CAP_PMU_WHITELIST */
+#define KVM_SET_PMU_WHITELIST _IOW(KVMIO, 0xc3, struct kvm_pmu_whitelist)
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
On 05/23/2019 06:23 AM, Eric Hankland wrote:
> - Add a VCPU ioctl that can control which events the guest can monitor.
The general idea sounds good to me :)
For the implementation, I would have the following suggestions:
1) Instead of using a whitelist, it would be better to use a blacklist to
forbid the guest from counting any core level information. So by default,
kvm maintains a list of those core level events, which are not exposed to
the guest.
The userspace ioctl removes the related events from the blacklist to
make them usable by the guest.
2) Use a vm ioctl instead of a vcpu ioctl. The blacklisted events can be
VM wide (no need to have each vCPU maintain its own copy).
Accordingly, put the pmu event blacklist into kvm->arch.
3) Return 1 when the guest tries to set the eventsel msr to count an
event which is on the blacklist.
Best,
Wei
On Mon, May 27, 2019 at 6:56 PM Wei Wang <[email protected]> wrote:
>
> For the implementation, I would have the following suggestions:
> [...]
Thanks for the feedback. I have a couple concerns with a KVM
maintained blacklist. First, I'm worried it will be difficult to keep
such a list up to date and accurate (both coming up with the initial
list since there are so many events, and updating it whenever any new
events are published or vulnerabilities are discovered). Second, users
may want to differentiate between whole-socket and sub-socket VMs
(some events may be fine for the whole-socket case) - keeping a single
blacklist wouldn't allow for this. Let me know what you think. I'll
try implementing the other suggestions.
-Eric
On 05/29/2019 02:14 AM, Eric Hankland wrote:
> Thanks for the feedback. I have a couple concerns with a KVM
> maintained blacklist. First, I'm worried it will be difficult to keep
> such a list up to date and accurate (both coming up with the initial
> list since there are so many events, and updating it whenever any new
> events are published or vulnerabilities are discovered).
Not sure about "so many" above. I think there should be much
fewer events that may need to be blacklisted.
For example, the event table 19-3 from SDM section 19.2 shows hundreds of
events; how many of them do you think need to be blacklisted?
> Second, users
> may want to differentiate between whole-socket and sub-socket VMs
> (some events may be fine for the whole-socket case) - keeping a single
> blacklist wouldn't allow for this.
Why wouldn't?
In any case where we want to unlock the blacklisted events (e.g. the
whole socket dedicated to a single VM), we can have userspace
(e.g. qemu command line options "+event1, +event2") do an ioctl to
have KVM do that.
Btw, for the L3 cache stats event example, I'm not sure that it would
be an issue if we have "AnyThread=0". I'll double-check with someone.
Best,
Wei
On Wed, May 29, 2019 at 12:49 AM Wei Wang <[email protected]> wrote:
> Not sure about "so many" above. I think there should be much
> fewer events that may need to be blacklisted.
I think you're right that there are not as many events that seem like
they could leak info as events that seem like they won't, but I think
the work to validate that they definitely don't could be expensive;
with a whitelist it's easy to start with a smaller set and
incrementally add to it without having to evaluate all the events
right away.
> > (some events may be fine for the whole-socket case) - keeping a single
> > blacklist wouldn't allow for this.
>
> Why wouldn't?
Ah I think I misunderstood your proposal (as an ioctl that would just
enable a hardcoded blacklist). Now that I understand, my main
objection is just the above (coupled with the trouble of needing to
maintain the "default" blacklist).
Eric
On 05/30/2019 01:11 AM, Eric Hankland wrote:
>> Not sure about "so many" above. I think there should be much
>> fewer events that may need to be blacklisted.
> I think you're right that there are not as many events that seem like
> they could leak info as events that seem like they won't, but I think
> the work to validate that they definitely don't could be expensive;
> with a whitelist it's easy to start with a smaller set and
> incrementally add to it without having to evaluate all the events
> right away.
Before going the whitelist/blacklist direction, do you have an example
of an event that couldn't be solved by setting "AnyThread=0"?
If not, I think we could simply gate the guest's setting of "AnyThread=0".
Best,
Wei
On Thu, May 30, 2019 at 5:57 PM Wei Wang <[email protected]> wrote:
>
> Before going the whitelist/blacklist direction, do you have an example
> of an event that couldn't be solved by setting "AnyThread=0"?
>
> If not, I think we could simply gate the guest's setting of "AnyThread=0".
>
> Best,
> Wei
With anythread=0, I'm not aware of any events that directly give info
about other VMs, but monitoring events related to shared resources
(e.g. LLC References and LLC Misses) could indirectly give you info
about how heavily other users are using that resource.
I tried returning 1 when the guest tries to write the eventsel msr for
a disallowed event - the behavior on modern guest kernels looks
reasonable (warns once about an unchecked MSR access error), but it
looks like guests running older kernels (pre-2016) might panic due to
the resulting gp fault (not to mention I'm not sure about the behavior
on non-Linux kernels). So I'm hesitant to return 1 - what do you think?
I also looked into moving from a vcpu ioctl to a vm ioctl - I can send
out a version of the patch with this change once we settle on the
other issues. It will involve some extra locking every time the
counters are programmed to ensure the whitelist or blacklist isn't
removed during access.
Eric
On 06/01/2019 03:59 AM, Eric Hankland wrote:
>
> With anythread=0, I'm not aware of any events that directly give info
> about other VMs, but monitoring events related to shared resources
> (e.g. LLC References and LLC Misses) could indirectly give you info
> about how heavily other users are using that resource.
My question is: has it been proved that this indirect info leakage
indeed happens?
The spec states that with AnyThread=0 the counter will count only the
related events generated by the logical CPU. I would be inclined to
trust the hardware behavior documented in the spec unless we can prove
there is a problem.
>
> I tried returning 1 when the guest tries to write the eventsel msr for
> a disallowed event - the behavior on modern guest kernels looks
> reasonable (warns once about an unchecked MSR access error), but it
> looks like guests running older kernels (pre-2016) might panic due to
> the resulting gp fault (not to mention I'm not sure about the behavior
> on non-Linux kernels). So I'm hesitant to return 1 - what do you think?
From the guest's point of view, returning 0 means that the event
counting is running well. That is, the guest is expecting to get some
counts. So better not to zero the value when the guest does
rdpmc/rdmsr to read the count in this case.
I think we could just enforce "AnyThread=0" in the config, and create
the kernel counter as usual.
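Enforcing that could be as simple as clearing the bit before the
counter is programmed, e.g. (a sketch, using the existing
arch/x86/include/asm/perf_event.h macro):

    data &= ~ARCH_PERFMON_EVENTSEL_ANY;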
> I also looked into moving from a vcpu ioctl to a vm ioctl - I can send
> out a version of the patch with this change once we settle on the
> other issues. It will involve some extra locking every time the
> counters are programmed to ensure the whitelist or blacklist isn't
> removed during access.
>
Yes, the above discussion needs to be done first.
Best,
Wei
On Sat, Jun 1, 2019 at 3:50 AM Wei Wang <[email protected]> wrote:
>
> My question is: has it been proved that this indirect info leakage
> indeed happens?
> The spec states that with AnyThread=0 the counter will count only the
> related events generated by the logical CPU. I would be inclined to
> trust the hardware behavior documented in the spec unless we can prove
> there is a problem.
I'm not disputing the spec with regards to AnyThread=0; my point is
that LLC contention can be quantified using the PMU regardless of
whether or not you are measuring only the logical CPU you are running
on.
> From the guest's point of view, returning 0 means that the event
> counting is running well. That is, the guest is expecting to get some
> counts. So better not to zero the value when the guest does
> rdpmc/rdmsr to read the count in this case.
>
> I think we could just enforce "AnyThread=0" in the config, and create
> the kernel counter as usual.
If you return non-zero in intel_pmu_set_msr(), KVM emulates a gp
fault, which as you said signals to the guest that something went
wrong. However, older guests with panic_on_oops=1 (which is apparently
the default on RHEL 6) will panic if they get a gp fault while trying
to do a wrmsr (see the "Carry on after a non-"safe" MSR access fails
without !panic_on_oops" patch). I think that not panicking guests is
preferable to communicating that we weren't able to program the event.
Eric
On 06/04/2019 01:30 AM, Eric Hankland wrote:
> I'm not disputing the spec with regards to AnyThread=0; my point is
> that LLC contention can be quantified using the PMU regardless of
> whether or not you are measuring only the logical CPU you are running
> on.
So, I'm not sure that "quantifying LLC contention" has been proved to
be a real issue. If it is considered to be an issue:
- without the PMU, we could also write a piece of software to run in
the guest to quantify that contention (e.g. by analyzing the memory
access latency). How do you prevent this?
- the same thing could also happen with the L1 cache (e.g. a vCPU and
a host thread run on 2 logical CPUs of the same core). If this is
disabled as well, we may have very few usable events, and I would like
to see what you have on the whitelist.
Best,
Wei
On Mon, Jun 3, 2019 at 9:37 PM Wei Wang <[email protected]> wrote:
> So, I'm not sure that "quantifying LLC contention" has been proved to
> be a real issue. If it is considered to be an issue:
>
> - without the PMU, we could also write a piece of software to run in
> the guest to quantify that contention (e.g. by analyzing the memory
> access latency). How do you prevent this?
>
> - the same thing could also happen with the L1 cache (e.g. a vCPU and
> a host thread run on 2 logical CPUs of the same core). If this is
> disabled as well, we may have very few usable events, and I would like
> to see what you have on the whitelist.
Right - I'm aware there are other ways of detecting this - it's still
a class of events that some people don't want to surface. I'll ask if
there are any better examples.
Eric
>> Right - I'm aware there are other ways of detecting this - it's still
>> a class of events that some people don't want to surface. I'll ask if
>> there are any better examples.
I asked and it sounds like we are treating all events as potentially
insecure until they've been reviewed. If Intel were to publish
official (reasonably substantiated) guidance stating that the PMU is
secure, then I think we'd be happy without such a safeguard in place,
but short of that I think we want to err on the side of caution.
Eric
On 06/06/2019 05:35 AM, Eric Hankland wrote:
>>> Right - I'm aware there are other ways of detecting this - it's still
>>> a class of events that some people don't want to surface. I'll ask if
>>> there are any better examples.
> I asked and it sounds like we are treating all events as potentially
> insecure until they've been reviewed. If Intel were to publish
> official (reasonably substantiated) guidance stating that the PMU is
> secure, then I think we'd be happy without such a safeguard in place,
> but short of that I think we want to err on the side of caution.
>
I'm not aware of any vendors who've published statements like that.
Anyway, are you ready to share your QEMU patches or the events you want
to be on the whitelists?
Best,
Wei
Since we aren't using QEMU, I don't have those patches ready yet, but
I can work on them if you want to review them at the same time as this
patch. The architectural events (minus the LLC events) are probably a
reasonable starting point for the whitelist.
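For concreteness, encoded as eventsel | (umask << 8) the way the
patch's get_event_code helpers do, that starting set would be
something like (a sketch):

    __u64 events[] = {
        0x003c, /* unhalted core cycles */
        0x00c0, /* instructions retired */
        0x013c, /* unhalted reference cycles */
        0x00c4, /* branch instructions retired */
        0x00c5, /* branch misses retired */
    };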
Eric
On Thu, Jun 6, 2019 at 12:31 AM Wei Wang <[email protected]> wrote:
>
> Anyway, are you ready to share your QEMU patches or the events you want
> to be on the whitelists?
On 06/14/2019 01:43 AM, Eric Hankland wrote:
> Since we aren't using QEMU, I don't have those patches ready yet, but
> I can work on them if you want to review them at the same time as this
> patch. The architectural events (minus the LLC events) are probably a
> reasonable starting point for the whitelist.
Sounds good. Please help on the QEMU patches as well.
Best,
Wei
On 05/23/2019 06:23 AM, Eric Hankland wrote:
> - Add a VCPU ioctl that can control which events the guest can monitor.
> +struct kvm_pmu_whitelist {
> + __u64 event_mask;
Is this "ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK"?
> + __u16 num_events;
> + __u64 events[0];
Can this be __u16?
The lower 16 bits (umask+eventsel) already determines what the event is.
> +};
> +This ioctl restricts the set of PMU events that the guest can program to the
> +set of whitelisted events.
> @@ -477,6 +477,7 @@ struct kvm_pmu {
> + struct kvm_pmu_whitelist *whitelist;
This could be per-VM and under rcu?
> @@ -127,6 +130,19 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
> + if (pmu->whitelist) {
Why not moving this filter to reprogram_gp_counter?
You could directly compare "unit_mask, event_sel" with whitelist->events[i]
> + event_config = attr.config;
> + if (type == PERF_TYPE_HARDWARE)
> + event_config = kvm_x86_ops->pmu_ops->get_event_code(
> + attr.config);
> + event_config &= pmu->whitelist->event_mask;
> + for (i = 0; i < pmu->whitelist->num_events; i++)
> + if (event_config == pmu->whitelist->events[i])
> + break;
> + if (i == pmu->whitelist->num_events)
> + return;
> + }
> +int kvm_vcpu_ioctl_set_pmu_whitelist(struct kvm_vcpu *vcpu,
> + struct kvm_pmu_whitelist __user *whtlst)
> +{
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_pmu_whitelist *old = pmu->whitelist;
> + struct kvm_pmu_whitelist *new = NULL;
> + struct kvm_pmu_whitelist tmp;
> + int r;
> + size_t size;
> +
> + r = -EFAULT;
> + if (copy_from_user(&tmp, whtlst, sizeof(struct kvm_pmu_whitelist)))
> + goto err;
I would directly return -EFAULT here.
> +
> + size = sizeof(tmp) + sizeof(tmp.events[0]) * tmp.num_events;
> + new = kvzalloc(size, GFP_KERNEL_ACCOUNT);
> + r = -ENOMEM;
> + if (!new)
> + goto err;
Same here.
> + pmu->whitelist = new;
Forgot to copy the whitelist-ed events to new?
Best,
Wei
Eric Hankland <[email protected]> writes:
>
> +int kvm_vcpu_ioctl_set_pmu_whitelist(struct kvm_vcpu *vcpu,
> + struct kvm_pmu_whitelist __user *whtlst)
> +{
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_pmu_whitelist *old = pmu->whitelist;
> + struct kvm_pmu_whitelist *new = NULL;
> + struct kvm_pmu_whitelist tmp;
> + int r;
> + size_t size;
> +
> + r = -EFAULT;
> + if (copy_from_user(&tmp, whtlst, sizeof(struct kvm_pmu_whitelist)))
> + goto err;
> +
> + size = sizeof(tmp) + sizeof(tmp.events[0]) * tmp.num_events;
> + new = kvzalloc(size, GFP_KERNEL_ACCOUNT);
Consider what happens when tmp.num_events is large enough to wrap size.
I suspect that's a kernel exploit as written.
Also don't you need to copy tmp to new?
-Andi
> Consider what happens when tmp.num_events is large enough to wrap size.
> I suspect that's a kernel exploit as written.
Thanks for pointing this out - I'll fix it in the next iteration.
> Also don't you need to copy tmp to new?
Yep - somehow I accidentally deleted the lines that did this before I
sent out the patch. I'll also fix this.
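For reference, the next iteration could address both along these
lines (a sketch; the cap constant is hypothetical, and struct_size()
is the overflow-checked helper from <linux/overflow.h>):

    if (tmp.num_events > KVM_PMU_WHITELIST_MAX_EVENTS) /* hypothetical cap */
        return -EINVAL;
    size = struct_size(new, events, tmp.num_events);
    new = kvzalloc(size, GFP_KERNEL_ACCOUNT);
    if (!new)
        return -ENOMEM;
    new->event_mask = tmp.event_mask;
    new->num_events = tmp.num_events;
    if (copy_from_user(new->events, whtlst->events,
                       sizeof(new->events[0]) * tmp.num_events)) {
        kvfree(new);
        return -EFAULT;
    }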
Thanks for your feedback - I'll send out an updated version
incorporating your comments shortly (assuming you don't have more
after this).
> > +struct kvm_pmu_whitelist {
> > + __u64 event_mask;
>
> Is this "ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK"?
In most cases, I envision this being the case, but it's possible users
may want other bits - see response to the next question below.
> > + __u16 num_events;
> > + __u64 events[0];
>
> Can this be __u16?
> The lower 16 bits (umask+eventsel) already determines what the event is.
It looks like newer AMD processors also use bits 32-35 for eventsel
(see AMD64_EVENTSEL_EVENT/AMD64_RAW_EVENT_MASK in
arch/x86/include/asm/perf_event.h or a recent reference guide), though
it doesn't look like this has made it to pmu_amd.c in kvm yet.
Further, including the whole 64 bits could enable whitelisting some
events with particular modifiers (e.g. in_tx=0, but not in_tx=1). I'm
not sure if whitelisting with specific modifiers will be necessary,
but we definitely need more than u16 if we want to support any AMD
events that make use of those bits in the future.
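For reference, that wider mask is already there (from
arch/x86/include/asm/perf_event.h):

    #define AMD64_EVENTSEL_EVENT \
        (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))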
> > + struct kvm_pmu_whitelist *whitelist;
>
> This could be per-VM and under rcu?
I'll try this out in the next version.
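Presumably something shaped like this (a sketch; the field and helper
names are hypothetical):

    /* reader side, when a counter is programmed */
    struct kvm_pmu_whitelist *wl;

    rcu_read_lock();
    wl = rcu_dereference(kvm->arch.pmu_whitelist);
    if (wl)
        allowed = whitelist_contains(wl, event_code);
    rcu_read_unlock();

    /* updater side, in the vm ioctl, under kvm->lock */
    old = rcu_dereference_protected(kvm->arch.pmu_whitelist,
                                    lockdep_is_held(&kvm->lock));
    rcu_assign_pointer(kvm->arch.pmu_whitelist, new);
    synchronize_rcu();
    kvfree(old);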
> Why not moving this filter to reprogram_gp_counter?
>
> You could directly compare "unit_mask, event_sel" with whitelist->events[i]
The reason is that this approach provides uniform behavior whether an
event is programmed on a fixed purpose counter vs a general purpose
one. Though I admit it's unlikely that instructions retired/cycles
wouldn't be whitelisted (and ref cycles can't be programmed on gp
counters), so it wouldn't be missing too much if I do move this to
reprogram_gp_counter. What do you think?
> I would directly return -EFAULT here.
>
> Same here.
Sounds good - I'll fix that in the next version.
> > + pmu->whitelist = new;
>
> Forgot to copy the whitelist-ed events to new?
Yep, somehow I deleted the lines that did this - thanks for pointing it out.
On 06/25/2019 08:32 AM, Eric Hankland wrote:
> Thanks for your feedback - I'll send out an updated version
> incorporating your comments shortly (assuming you don't have more
> after this).
Actually I have another thing to discuss:
probably we could consider making this filter list configurable as
white/black from userspace. For example, a userspace option:
filter-list=white/black
The reason is that for now, we start with only a couple of events to
whitelist. As people gradually add more (or future hardware has some
enhancements to give you more confidence), there may eventually be
many more whitelisted events.
Then users could reuse this interface to switch to "filter-list=black",
which would be more efficient, considering the number of events to
enlist and the iteration over event comparisons.
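One possible shape for the uapi side of that (a sketch; the new field
is hypothetical):

    struct kvm_pmu_whitelist {
        __u64 event_mask;
        __u16 num_events;
        __u16 flags;    /* 0 = whitelist (allow), 1 = blacklist (deny) */
        __u64 events[0];
    };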
>>> +struct kvm_pmu_whitelist {
>>> + __u64 event_mask;
>>
>> Is this "ARCH_PERFMON_EVENTSEL_EVENT |
>> ARCH_PERFMON_EVENTSEL_UMASK"?
>
> In most cases, I envision this being the case, but it's possible
> users may want other bits - see response to the next question below.
>
Probably we don't need this field to be passed from userspace?
We could directly use AMD64_RAW_EVENT_MASK_NB, which includes bits [35:32].
Since those bits are reserved on Intel CPUs, having them in the mask
should be fine.
Alternatively, we could add this event_mask field to struct kvm_pmu and
initialize it in the vendor specific intel_pmu_init or amd_pmu_init.
Both options above look good to me.
>>> + __u16 num_events;
>>> + __u64 events[0];
>>
>> Can this be __u16? The lower 16 bits (umask+eventsel) already
>> determines what the event is.
>
> It looks like newer AMD processors also use bits 32-35 for eventsel
> (see AMD64_EVENTSEL_EVENT/AMD64_RAW_EVENT_MASK in
> arch/x86/include/asm/perf_event.h or a recent reference guide),
> though it doesn't look like this has made it to pmu_amd.c in kvm
> yet.
OK, thanks for the reminder on the AMD side. I'm fine to keep it __u64.
>
> Further, including the whole 64 bits could enable whitelisting some
> events with particular modifiers (e.g. in_tx=0, but not in_tx=1).
> I'm not sure if whitelisting with specific modifiers will be
> necessary,
I think "eventsel+umask" should be enough to identify the event.
Other modifiers could be treated with other options when needed (e.g.
AnyThread),
otherwise it would complicate the case (e.g. double/trouble the number
of total events).
>
> but we definitely need more than u16 if we want to support any AMD
> events that make use of those bits in the future.
>
>>> + struct kvm_pmu_whitelist *whitelist;
>>
>> This could be per-VM and under rcu?
> I'll try this out in the next version.
>
>> Why not moving this filter to reprogram_gp_counter?
>>
>> You could directly compare "unit_mask, event_sel" with
>> whitelist->events[i]
> The reason is that this approach provides uniform behavior whether
> an event is programmed on a fixed purpose counter vs a general
> purpose one. Though I admit it's unlikely that instructions
> retired/cycles wouldn't be whitelisted (and ref cycles can't be
> programmed on gp counters), so it wouldn't be missing too much if I
> do move this to reprogram_gp_counter. What do you think?
>
I'd prefer moving it to reprogram_gp_counter, which simplifies the
implementation (we don't need .get_event_code as you added in this
version), and it should also be faster.
Another reason is that we are trying to build an alternative path
of PMU virtualization, which makes the vPMU sit on the hardware
directly, instead of on the host perf subsystem. That path wouldn't
go down into reprogram_pmc, which involves the perf subsystem
related implementation, but that path would still need this filter list.
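The shape of that check could be something like (a sketch against the
existing reprogram_gp_counter signature; the mask choice follows the
event_mask discussion above):

    void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
    {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        u64 key = eventsel & (ARCH_PERFMON_EVENTSEL_EVENT |
                              ARCH_PERFMON_EVENTSEL_UMASK);
        int i;

        if (pmu->whitelist) {
            for (i = 0; i < pmu->whitelist->num_events; i++)
                if (key == pmu->whitelist->events[i])
                    break;
            if (i == pmu->whitelist->num_events)
                return; /* silently drop non-whitelisted events */
        }
        /* ... existing reprogramming logic ... */
    }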
For the fixed counters, we could add a bitmap flag to kvm_arch,
indicating which counters are whitelisted, based on the
"eventsel+umask" values passed from userspace. This flag is
updated when the whitelisted events are updated in kvm.
For example, if userspace gives "00+01" (INST_RETIRED_ANY),
then we enable fixed counter0 in the flag.
When reprogram_fixed_counter runs, we check the flag and return
if the related counter isn't whitelisted.
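Roughly (a sketch; the bitmap field name is hypothetical, and 0x00c0
is INST_RETIRED.ANY encoded as (umask << 8) | eventsel):

    /* while ingesting the filter: INST_RETIRED.ANY maps to fixed ctr 0 */
    if (code == 0x00c0)
        __set_bit(0, &kvm->arch.pmu_allowed_fixed_ctrs);

    /* in reprogram_fixed_counter() */
    if (kvm->arch.pmu_whitelist &&
        !test_bit(idx, &kvm->arch.pmu_allowed_fixed_ctrs))
        return;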
Best,
Wei
> Actually I have another thing to discuss:
> probably we could consider making this filter list configurable as
> white/black from userspace. For example, a userspace option:
> filter-list=white/black
Works for me. I'll include this in the next version.
> Probably we don't need this field to be passed from userspace?
>
> We could directly use AMD64_RAW_EVENT_MASK_NB, which includes bits [35:32].
> Since those bits are reserved on Intel CPUs, having them in the mask
> should be fine.
>
> Alternatively, we could add this event_mask field to struct kvm_pmu and
> initialize it in the vendor specific intel_pmu_init or amd_pmu_init.
>
> Both options above look good to me.
Sounds good. I'll go with the first suggestion for now since it's
simpler. If other reviewers prefer the second I can implement that.
> For the fixed counters, we could add a bitmap flag to kvm_arch,
> indicating which counters are whitelisted, based on the
> "eventsel+umask" values passed from userspace. This flag is
> updated when the whitelisted events are updated in kvm.
> For example, if userspace gives "00+01" (INST_RETIRED_ANY),
> then we enable fixed counter0 in the flag.
>
> When reprogram_fixed_counter runs, we check the flag and return
> if the related counter isn't whitelisted.
Sounds good to me.
If you don't have any more comments I'll send out the next version
with all the requested changes.
Eric
On 07/03/2019 01:46 AM, Eric Hankland wrote:
>
> Sounds good to me.
>
> If you don't have any more comments I'll send out the next version
> with all the requested changes.
>
No more so far. I'll see your new version.
Best,
Wei