2010-06-21 09:31:33

by Yanmin Zhang

Subject: [PATCH V2 3/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

This 3rd patch implements the para virt perf interface in the host kernel.

Signed-off-by: Zhang Yanmin <[email protected]>

---
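[Illustration, not part of the patch: one way a guest might invoke the KVM_PERF_OP hypercall implemented below. The guest side belongs to a later patch in this series; the wrapper names are made up, __pa() assumes the parameter block lives in the guest's direct mapping, and the open path assumes a 64-bit guest so the whole gpa fits in the first hypercall argument.]

	static int pv_perf_open(struct guest_perf_event_param *param)
	{
		unsigned long gpa = __pa(param);	/* guest physical address */

		return kvm_hypercall3(KVM_PERF_OP, KVM_PERF_OP_OPEN, gpa, 0);
	}

	static int pv_perf_enable(int id)
	{
		return kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_ENABLE, id);
	}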

--- linux-2.6_tip0620/arch/x86/include/asm/kvm_para.h 2010-06-21 15:19:38.992999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_para.h 2010-06-21 15:21:39.308999849 +0800
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H

#include <linux/types.h>
+#include <linux/list.h>
#include <asm/hyperv.h>

/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
@@ -19,7 +20,8 @@
/* This indicates that the new set of kvmclock msrs
* are available. The use of 0x11 and 0x12 is deprecated
*/
-#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_PV_PERF 4

/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -33,7 +35,14 @@
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01

-#define KVM_MAX_MMU_OP_BATCH 32
+#define KVM_MAX_MMU_OP_BATCH 32
+
+/* Operations for KVM_PERF_OP */
+#define KVM_PERF_OP_OPEN 1
+#define KVM_PERF_OP_CLOSE 2
+#define KVM_PERF_OP_ENABLE 3
+#define KVM_PERF_OP_DISABLE 4
+#define KVM_PERF_OP_READ 5

/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
@@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
#ifdef __KERNEL__
#include <asm/processor.h>

+/*
+ * Data area used to communicate perf_event state between the
+ * host kernel and the guest kernel
+ */
+struct guest_perf_event {
+ u64 count;
+ atomic_t overflows;
+};
+
+/*
+ * In the host kernel, perf_event->host_perf_shadow points to a
+ * host_perf_shadow, which records information about the
+ * guest event.
+ */
+struct host_perf_shadow {
+ /* guest perf_event id passed from guest os */
+ int id;
+ /*
+ * The host kernel first saves data into the counter member below.
+ * kvm then reads this counter and copies or adds the data back
+ * to the guest os before the next guest entry.
+ */
+ struct guest_perf_event counter;
+ /* guest_event_addr is a gpa_t pointing to the guest os guest_perf_event */
+ __u64 guest_event_addr;
+
+ /*
+ * Link into kvm.kvm_arch.shadow_hash_table
+ */
+ struct list_head shadow_entry;
+ struct kvm_vcpu *vcpu;
+
+ struct perf_event *host_event;
+ /*
+ * The counter below prevents a malicious guest os from trying to
+ * close/enable the event at the same time.
+ */
+ atomic_t ref_counter;
+};
+
+/*
+ * In the guest kernel, perf_event->guest_shadow points to a
+ * guest_perf_shadow, which records information about the
+ * event.
+ */
+struct guest_perf_shadow {
+ /* guest perf_event id passed from guest os */
+ int id;
+ /*
+ * The host kernel (kvm) saves data into the counter member below
+ */
+ struct guest_perf_event counter;
+};
+
+/*
+ * guest_perf_attr is used when the guest issues a hypercall to
+ * open a new perf_event on the host side. It is mostly a copy of
+ * perf_event_attr, minus the fields the host kernel does not use.
+ */
+struct guest_perf_attr {
+ __u32 type;
+ __u64 config;
+ __u64 sample_period;
+ __u64 sample_type;
+ __u64 read_format;
+ __u64 flags;
+ __u32 bp_type;
+ __u64 bp_addr;
+ __u64 bp_len;
+};
+
+struct guest_perf_event_param {
+ __u64 attr_addr;
+ __u64 guest_event_addr;
+ /* id is placed last to avoid alignment issues */
+ int id;
+};
+
extern void kvmclock_init(void);


--- linux-2.6_tip0620/arch/x86/include/asm/kvm_host.h 2010-06-21 15:19:39.019999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_host.h 2010-06-21 15:21:39.308999849 +0800
@@ -24,6 +24,7 @@
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
+#include <asm/perf_event.h>

#define KVM_MAX_VCPUS 64
#define KVM_MEMORY_SLOTS 32
@@ -360,6 +361,18 @@ struct kvm_vcpu_arch {

/* fields used by HYPER-V emulation */
u64 hv_vapic;
+
+ /*
+ * Fields used by PARAVIRT perf interface:
+ *
+ * kvm checks overflow_events before entering the guest os,
+ * and copies data back to the guest os.
+ * event_mutex is to avoid a race between NMI perf event overflow
+ * handler, event close, and enable/disable.
+ */
+ struct mutex event_mutex;
+ int overflows;
+ struct perf_event *overflow_events[X86_PMC_IDX_MAX];
};

struct kvm_mem_alias {
@@ -377,6 +390,9 @@ struct kvm_mem_aliases {
int naliases;
};

+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS (10)
+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
+
struct kvm_arch {
struct kvm_mem_aliases *aliases;

@@ -415,6 +431,15 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ /*
+ * fields used by PARAVIRT perf interface:
+ * Used to organize all host perf_events representing guest
+ * perf_event on a specific kvm instance
+ */
+ atomic_t kvm_pv_event_num;
+ spinlock_t shadow_lock;
+ struct list_head *shadow_hash_table;
};

struct kvm_vm_stat {
@@ -561,6 +586,9 @@ int emulator_write_phys(struct kvm_vcpu
const void *val, int bytes);
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret);
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+ unsigned long a2, unsigned long *result);
+
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);

extern bool tdp_enabled;
--- linux-2.6_tip0620/include/linux/kvm_para.h 2010-06-21 15:19:53.309999849 +0800
+++ linux-2.6_tip0620perfkvm/include/linux/kvm_para.h 2010-06-21 15:21:39.312999849 +0800
@@ -17,6 +17,7 @@

#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HC_MMU_OP 2
+#define KVM_PERF_OP 3

/*
* hypercalls use architecture specific
--- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
@@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ int ret;

trace_kvm_exit(exit_reason, vcpu);

@@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc

if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = exit_reason;
+ ret = 0;
}
- return 0;
+
+ /* sync paravirt perf event to guest */
+ kvm_sync_events_to_guest(vcpu);
+
+ return ret;
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
--- linux-2.6_tip0620/arch/x86/kvm/x86.c 2010-06-21 15:19:39.315999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.c 2010-06-21 16:49:58.182999849 +0800
@@ -6,12 +6,14 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright Intel Corporation, 2010
*
* Authors:
* Avi Kivity <[email protected]>
* Yaniv Kamay <[email protected]>
* Amit Shah <[email protected]>
* Ben-Ami Yassour <[email protected]>
+ * Yanmin Zhang <[email protected]>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_PV_PERF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1993,7 +1996,9 @@ static void do_cpuid_ent(struct kvm_cpui
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_PV_PERF) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
entry->ebx = 0;
entry->ecx = 0;
entry->edx = 0;
@@ -4052,10 +4057,21 @@ static unsigned long kvm_get_guest_ip(vo
return ip;
}

+int kvm_notify_event_overflow(void)
+{
+ if (percpu_read(current_vcpu)) {
+ kvm_inject_nmi(percpu_read(current_vcpu));
+ return 0;
+ }
+
+ return -1;
+}
+
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
+ .copy_event_to_shadow = kvm_copy_event_to_shadow,
};

void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
@@ -4138,15 +4154,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vc
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);

-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
- unsigned long a1)
-{
- if (is_long_mode(vcpu))
- return a0;
- else
- return a0 | ((gpa_t)a1 << 32);
-}
-
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
@@ -4245,6 +4252,9 @@ int kvm_emulate_hypercall(struct kvm_vcp
case KVM_HC_MMU_OP:
r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
break;
+ case KVM_PERF_OP:
+ r = kvm_pv_perf_op(vcpu, a0, a1, a2, &ret);
+ break;
default:
ret = -KVM_ENOSYS;
break;
@@ -5334,6 +5344,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;

+ mutex_init(&vcpu->arch.event_mutex);
+
return 0;
fail_free_lapic:
kvm_free_lapic(vcpu);
@@ -5360,6 +5372,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct list_head *hash_table;
+ int i;

if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -5369,6 +5383,18 @@ struct kvm *kvm_arch_create_vm(void)
kfree(kvm);
return ERR_PTR(-ENOMEM);
}
+ hash_table = kmalloc(sizeof(struct list_head) *
+ KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM,
+ GFP_KERNEL);
+ if (!hash_table) {
+ kfree(kvm->arch.aliases);
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+ for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++)
+ INIT_LIST_HEAD(&hash_table[i]);
+ kvm->arch.shadow_hash_table = hash_table;
+ spin_lock_init(&kvm->arch.shadow_lock);

INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5416,6 +5442,8 @@ void kvm_arch_sync_events(struct kvm *kv

void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_remove_all_perf_events(kvm);
+
kvm_iommu_unmap_guest(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
@@ -5427,6 +5455,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.shadow_hash_table);
kfree(kvm->arch.aliases);
kfree(kvm);
}
--- linux-2.6_tip0620/arch/x86/kvm/x86.h 2010-06-21 15:19:39.311999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.h 2010-06-21 15:21:39.312999849 +0800
@@ -72,7 +72,20 @@ static inline struct kvm_mem_aliases *kv
|| lockdep_is_held(&kvm->slots_lock));
}

+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+ unsigned long a1)
+{
+ if (is_long_mode(vcpu))
+ return a0;
+ else
+ return a0 | ((gpa_t)a1 << 32);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_notify_event_overflow(void);
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows);
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu);
+void kvm_remove_all_perf_events(struct kvm *kvm);

#endif
--- linux-2.6_tip0620/arch/x86/kvm/Makefile 2010-06-21 15:19:39.311999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/Makefile 2010-06-21 15:21:39.310999849 +0800
@@ -11,7 +11,7 @@ kvm-y += $(addprefix ../../../virt/kvm
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)

kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
+ i8254.o timer.o kvmperf_event.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o

--- linux-2.6_tip0620/arch/x86/kvm/kvmperf_event.c 1970-01-01 08:00:00.000000000 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/kvmperf_event.c 2010-06-21 16:49:29.509999849 +0800
@@ -0,0 +1,471 @@
+/*
+ * Performance events x86 kvm para architecture code
+ *
+ * Copyright (C) 2010 Intel Inc.
+ * Zhang Yanmin <[email protected]>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/compat.h>
+
+#include "x86.h"
+
+#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
+
+static inline u32 shadow_hash_id(int id)
+{
+ u32 hash_value = id;
+
+ hash_value = hash_32(hash_value, KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS);
+ return hash_value;
+}
+
+static int kvm_add_host_event(struct kvm_vcpu *vcpu,
+ struct host_perf_shadow *host_shadow)
+{
+ unsigned long flags;
+ u32 index = shadow_hash_id(host_shadow->id);
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct list_head *head = &arch->shadow_hash_table[index];
+ struct list_head *pos;
+ struct host_perf_shadow *tmp;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ list_for_each(pos, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ WARN(tmp->id == host_shadow->id, "%s called when there is an"
+ " item with the same id [%d] in hash table,",
+ __func__, host_shadow->id);
+ }
+ list_add(&host_shadow->shadow_entry, head);
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+ return 0;
+}
+
+static struct perf_event *
+kvm_find_get_host_event(struct kvm_vcpu *vcpu, int id, int need_delete)
+{
+ unsigned long flags;
+ u32 index = shadow_hash_id(id);
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct list_head *head = &arch->shadow_hash_table[index];
+ struct list_head *pos;
+ struct host_perf_shadow *tmp = NULL;
+ int found = 0;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ list_for_each(pos, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ if (tmp->id == id) {
+ found = 1;
+ if (need_delete)
+ list_del_init(&tmp->shadow_entry);
+ else
+ atomic_inc(&tmp->ref_counter);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+
+ if (found)
+ return tmp->host_event;
+ else
+ return NULL;
+}
+
+static void kvm_vcpu_add_event_overflow_ref(struct perf_event *event)
+{
+ struct host_perf_shadow *host_shadow = event->host_perf_shadow;
+ struct kvm_vcpu *vcpu = host_shadow->vcpu;
+ int ret;
+
+ /*
+ * Use trylock as we are in the NMI handler. Losing one report of
+ * an event overflow to the guest os does not matter much, because
+ * the host saves the overflow counter in host_perf_shadow. The
+ * next time the event overflows and there is no contention, the
+ * host pushes the accumulated overflows to the guest, which then
+ * also processes the previously saved overflows.
+ */
+ ret = mutex_trylock(&vcpu->arch.event_mutex);
+ if (!ret)
+ return;
+ if (vcpu->arch.overflows < X86_PMC_IDX_MAX) {
+ vcpu->arch.overflow_events[vcpu->arch.overflows] = event;
+ vcpu->arch.overflows++;
+ }
+ mutex_unlock(&vcpu->arch.event_mutex);
+}
+
+static int kvm_vcpu_remove_event_overflow_ref(struct host_perf_shadow *shadow)
+{
+ struct kvm_vcpu *vcpu = shadow->vcpu;
+ int i;
+
+ if (!vcpu || !vcpu->arch.overflows)
+ return -1;
+
+ mutex_lock(&vcpu->arch.event_mutex);
+ for (i = 0; i < vcpu->arch.overflows; i++) {
+ if (vcpu->arch.overflow_events[i] == shadow->host_event)
+ vcpu->arch.overflow_events[i] = NULL;
+ }
+ mutex_unlock(&vcpu->arch.event_mutex);
+ return 0;
+}
+
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows)
+{
+ struct host_perf_shadow *shadow = event->host_perf_shadow;
+
+ shadow->counter.count = local64_read(&event->count);
+ atomic_add(overflows, &shadow->counter.overflows);
+ kvm_vcpu_add_event_overflow_ref(event);
+ /* Inject NMI to guest os */
+ kvm_notify_event_overflow();
+}
+
+static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ BUG_ON(event->host_perf_shadow == NULL);
+ kvm_copy_event_to_shadow(event, 1);
+}
+
+static void kvm_put_host_event(struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+ if (!atomic_dec_return(&shadow->ref_counter)) {
+ /*
+ * detach it in case the guest os did not disable it
+ * before closing
+ */
+ perf_event_detach(host_event);
+ kvm_vcpu_remove_event_overflow_ref(shadow);
+
+ perf_event_release_kernel(host_event);
+ atomic_dec(&shadow->vcpu->kvm->arch.kvm_pv_event_num);
+ kfree(shadow);
+ }
+}
+
+static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
+ struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+ struct guest_perf_event counter;
+ int ret;
+ s32 overflows;
+
+ ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
+ &counter, sizeof(counter));
+ if (ret < 0)
+ return;
+
+again:
+ overflows = atomic_read(&shadow->counter.overflows);
+ if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
+ overflows)
+ goto again;
+
+ counter.count = shadow->counter.count;
+ atomic_add(overflows, &counter.overflows);
+
+ kvm_write_guest(vcpu->kvm,
+ shadow->guest_event_addr,
+ &counter,
+ sizeof(counter));
+ return;
+}
+
+/*
+ * called by KVM to copy both perf_event->count and overflows to guest
+ * after host NMI handler detects guest perf_event overflows
+ */
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (vcpu->arch.overflows == 0)
+ return;
+
+ mutex_lock(&vcpu->arch.event_mutex);
+ for (i = 0; i < vcpu->arch.overflows; i++) {
+ if (vcpu->arch.overflow_events[i]) {
+ kvm_copy_event_to_guest(vcpu,
+ vcpu->arch.overflow_events[i]);
+ }
+ }
+ vcpu->arch.overflows = 0;
+ mutex_unlock(&vcpu->arch.event_mutex);
+}
+EXPORT_SYMBOL_GPL(kvm_sync_events_to_guest);
+
+/* Just copy perf_event->count to guest. Don't copy overflows to guest */
+static void
+kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
+{
+ struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+
+ shadow->counter.count = local64_read(&host_event->count);
+ kvm_write_guest(vcpu->kvm,
+ shadow->guest_event_addr,
+ &shadow->counter.count,
+ sizeof(shadow->counter.count));
+ return;
+}
+
+static int
+kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ int ret = 0;
+ struct perf_event *host_event = NULL;
+ struct host_perf_shadow *shadow = NULL;
+ struct guest_perf_event_param param;
+ struct guest_perf_attr *guest_attr = NULL;
+ struct perf_event_attr *attr = NULL;
+ int next_count;
+
+ next_count = atomic_read(&vcpu->kvm->arch.kvm_pv_event_num);
+ if (next_count >= KVM_MAX_PARAVIRT_PERF_EVENT) {
+ WARN_ONCE(1, "guest os wants to open more than %d events\n",
+ KVM_MAX_PARAVIRT_PERF_EVENT);
+ return -ENOENT;
+ }
+ atomic_inc(&vcpu->kvm->arch.kvm_pv_event_num);
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
+ if (!guest_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
+ if (ret < 0)
+ goto out;
+
+ host_event = kvm_find_get_host_event(vcpu, param.id, 0);
+ if (host_event) {
+ kvm_put_host_event(host_event);
+ return -EEXIST;
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
+ guest_attr, sizeof(*guest_attr));
+ if (ret < 0)
+ goto out;
+
+ attr->type = guest_attr->type;
+ attr->config = guest_attr->config;
+ attr->sample_period = guest_attr->sample_period;
+ attr->read_format = guest_attr->read_format;
+ attr->flags = guest_attr->flags;
+ attr->bp_type = guest_attr->bp_type;
+ attr->bp_addr = guest_attr->bp_addr;
+ attr->bp_len = guest_attr->bp_len;
+ /*
+ * By default, we disable the host event. Later on, the guest os
+ * triggers a perf_event_attach to enable it
+ */
+ attr->disabled = 1;
+ attr->inherit = 0;
+ attr->enable_on_exec = 0;
+ /*
+ * We don't support user/kernel exclude mode for the guest os,
+ * which means we always collect both user and kernel events for it
+ */
+ attr->exclude_user = 0;
+ attr->exclude_kernel = 0;
+
+ shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
+ if (!shadow) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ shadow->id = param.id;
+ shadow->guest_event_addr = param.guest_event_addr;
+ shadow->vcpu = vcpu;
+ INIT_LIST_HEAD(&shadow->shadow_entry);
+
+ /* We always create a cpu context host perf event */
+ host_event = perf_event_create_kernel_counter(attr, -1,
+ current->pid, kvm_perf_event_overflow);
+
+ if (IS_ERR(host_event)) {
+ host_event = NULL;
+ ret = -1;
+ goto out;
+ }
+ host_event->host_perf_shadow = shadow;
+ shadow->host_event = host_event;
+ atomic_set(&shadow->ref_counter, 1);
+ kvm_add_host_event(vcpu, shadow);
+
+out:
+ if (!host_event)
+ kfree(shadow);
+
+ kfree(attr);
+ kfree(guest_attr);
+
+ if (ret)
+ atomic_dec(&vcpu->kvm->arch.kvm_pv_event_num);
+
+ return ret;
+}
+
+static int kvm_pv_perf_op_close(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *host_event;
+
+ /* Find and delete the event from the hashtable */
+ host_event = kvm_find_get_host_event(vcpu, id, 1);
+ if (!host_event)
+ return -1;
+ kvm_put_host_event(host_event);
+ return 0;
+}
+
+static int kvm_pv_perf_op_enable(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *event;
+ struct host_perf_shadow *shadow;
+
+ event = kvm_find_get_host_event(vcpu, id, 0);
+ if (!event)
+ return -1;
+
+ shadow = event->host_perf_shadow;
+ if (shadow->vcpu != vcpu) {
+ kvm_vcpu_remove_event_overflow_ref(event->host_perf_shadow);
+ shadow->vcpu = vcpu;
+ }
+
+ perf_event_attach(event);
+ kvm_put_host_event(event);
+
+ return 0;
+}
+
+static int kvm_pv_perf_op_disable(struct kvm_vcpu *vcpu, int id)
+{
+ struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
+ if (!host_event)
+ return -1;
+ perf_event_detach(host_event);
+ /* We sync count to guest as we delay the guest count update */
+ kvm_copy_count_to_guest(vcpu, host_event);
+ kvm_put_host_event(host_event);
+
+ return 0;
+}
+
+static int kvm_pv_perf_op_read(struct kvm_vcpu *vcpu, int id)
+{
+ u64 enabled, running;
+ struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
+
+ if (!host_event)
+ return -1;
+ if (host_event->state == PERF_EVENT_STATE_ACTIVE)
+ perf_event_read_value(host_event, &enabled, &running);
+ kvm_copy_count_to_guest(vcpu, host_event);
+ kvm_put_host_event(host_event);
+ return 0;
+}
+
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+ unsigned long a2, unsigned long *result)
+{
+ unsigned long ret;
+ gpa_t addr;
+ int id;
+
+ switch (op_code) {
+ case KVM_PERF_OP_OPEN:
+ addr = hc_gpa(vcpu, a1, a2);
+ ret = (unsigned long) kvm_pv_perf_op_open(vcpu, addr);
+ break;
+ case KVM_PERF_OP_CLOSE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_close(vcpu, id);
+ break;
+ case KVM_PERF_OP_ENABLE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_enable(vcpu, id);
+ break;
+ case KVM_PERF_OP_DISABLE:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_disable(vcpu, id);
+ break;
+ case KVM_PERF_OP_READ:
+ id = (int) a1;
+ ret = kvm_pv_perf_op_read(vcpu, id);
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ }
+
+ *result = ret;
+ return 0;
+}
+
+void kvm_remove_all_perf_events(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_arch *arch = &kvm->arch;
+ LIST_HEAD(total_events);
+ struct list_head *head;
+ struct list_head *pos, *next;
+ struct host_perf_shadow *tmp;
+ int i;
+
+ spin_lock_irqsave(&arch->shadow_lock, flags);
+ for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++) {
+ head = &arch->shadow_hash_table[i];
+ list_for_each_safe(pos, next, head) {
+ tmp = container_of(pos, struct host_perf_shadow,
+ shadow_entry);
+ list_del(&tmp->shadow_entry);
+ list_add(&tmp->shadow_entry, &total_events);
+ }
+ }
+ spin_unlock_irqrestore(&arch->shadow_lock, flags);
+ head = &total_events;
+ list_for_each_safe(pos, next, head) {
+ tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
+ list_del(&tmp->shadow_entry);
+ kvm_put_host_event(tmp->host_event);
+ }
+
+ return;
+}
+
--- linux-2.6_tip0620/include/linux/kvm.h 2010-06-21 15:19:52.605999849 +0800
+++ linux-2.6_tip0620perfkvm/include/linux/kvm.h 2010-06-21 15:21:39.312999849 +0800
@@ -524,6 +524,7 @@ struct kvm_enable_cap {
#define KVM_CAP_PPC_OSI 52
#define KVM_CAP_PPC_UNSET_IRQ 53
#define KVM_CAP_ENABLE_CAP 54
+#define KVM_CAP_PV_PERF 57

#ifdef KVM_CAP_IRQ_ROUTING



2010-06-21 12:33:58

by Avi Kivity

Subject: Re: [PATCH V2 3/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
> The 3rd patch is to implement para virt perf at host kernel.
>
>
> @@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
> #ifdef __KERNEL__
> #include <asm/processor.h>
>
>
> +/*
> + * In host kernel, perf_event->host_perf_shadow points to
> + * host_perf_shadow which records some information
> + * about the guest.
> + */
> +struct host_perf_shadow {
> + /* guest perf_event id passed from guest os */
> + int id;
> + /*
> + * Host kernel saves data into data member counter firstly.
> + * kvm will get data from this counter and calls kvm functions
> + * to copy or add data back to guets os before entering guest os
> + * next time
> + */
> + struct guest_perf_event counter;
> + /* guest_event_addr is gpa_t pointing to guest os guest_perf_event*/
> + __u64 guest_event_addr;
>

So just use gpa_t as the type.

> +
> + /*
> + * Link to of kvm.kvm_arch.shadow_hash_table
> + */
> + struct list_head shadow_entry;
> + struct kvm_vcpu *vcpu;
> +
> + struct perf_event *host_event;
> + /*
> + * Below counter is to prevent malicious guest os to try to
> + * close/enable event at the same time.
> + */
> + atomic_t ref_counter;
>

If events are made per-vcpu (like real hardware), races become impossible.

> +};
>

Please move this structure to include/linux/kvm_host.h. No need to spam
kvm_para.h. Note it's not x86 specific (though you can leave arch
enabling to arch maintainers).

> +
> +/*
> + * In guest kernel, perf_event->guest_shadow points to
> + * guest_perf_shadow which records some information
> + * about the guest.
> + */
> +struct guest_perf_shadow {
> + /* guest perf_event id passed from guest os */
> + int id;
> + /*
> + * Host kernel kvm saves data into data member counter
> + */
> + struct guest_perf_event counter;
> +};
>

Don't ordinary perf structures already have a counter ID which we can reuse?

> +
> +/*
> + * guest_perf_attr is used when guest calls hypercall to
> + * open a new perf_event at host side. Mostly, it's a copy of
> + * perf_event_attr and deletes something not used by host kernel.
> + */
> +struct guest_perf_attr {
> + __u32 type;
> + __u64 config;
> + __u64 sample_period;
> + __u64 sample_type;
> + __u64 read_format;
> + __u64 flags;
> + __u32 bp_type;
> + __u64 bp_addr;
> + __u64 bp_len;
> +};
>

This is really not a guest or host structure, it's part of the
interface. So please rename it (and similar) kvm_pv_perf_*.

> @@ -24,6 +24,7 @@
> #include <asm/desc.h>
> #include <asm/mtrr.h>
> #include <asm/msr-index.h>
> +#include <asm/perf_event.h>
>
> #define KVM_MAX_VCPUS 64
> #define KVM_MEMORY_SLOTS 32
> @@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
>
> /* fields used by HYPER-V emulation */
> u64 hv_vapic;
> +
> + /*
> + * Fields used by PARAVIRT perf interface:
> + *
> + * kvm checks overflow_events before entering guest os,
> + * and copy data back to guest os.
> + * event_mutex is to avoid a race between NMI perf event overflow
> + * handler, event close, and enable/disable.
> + */
> + struct mutex event_mutex;
>

No race can exist. The host NMI handler cannot take any mutex so it
must be immune to races. The guest NMI handlers and callbacks are all
serialized by the guest itself.

> + int overflows;
> + struct perf_event *overflow_events[X86_PMC_IDX_MAX];
> };
>

KVM_PV_PERF_MAX_EVENTS (which needs to be exposed to the guest via cpuid).
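[Illustrative sketch, not from the thread; KVM_PV_PERF_MAX_EVENTS and the choice of the ebx output register are assumptions. The host could advertise the limit from do_cpuid_ent() and the guest could read it back from the KVM feature leaf:]

	/* host side, in do_cpuid_ent() for the KVM feature leaf */
	entry->ebx = KVM_PV_PERF_MAX_EVENTS;

	/* guest side */
	unsigned int max_events = cpuid_ebx(KVM_CPUID_FEATURES);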

>
> struct kvm_mem_alias {
> @@ -377,6 +390,9 @@ struct kvm_mem_aliases {
> int naliases;
> };
>
> +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS (10)
> +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
>

What are these?

> +
> struct kvm_arch {
> struct kvm_mem_aliases *aliases;
>
> @@ -415,6 +431,15 @@ struct kvm_arch {
> /* fields used by HYPER-V emulation */
> u64 hv_guest_os_id;
> u64 hv_hypercall;
> +
> + /*
> + * fields used by PARAVIRT perf interface:
> + * Used to organize all host perf_events representing guest
> + * perf_event on a specific kvm instance
> + */
> + atomic_t kvm_pv_event_num;
> + spinlock_t shadow_lock;
> + struct list_head *shadow_hash_table;
>

Need to be per-vcpu. Also wrap in a kvm_vcpu_perf structure, the names
are very generic.

Why do we need the hash table? Use the index directly?
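[Illustrative per-vcpu layout, not from the thread; kvm_vcpu_perf, vcpu->arch.perf and KVM_PV_PERF_MAX_EVENTS are assumed names, not anything defined by the patch:]

	struct kvm_vcpu_perf {
		struct perf_event *events[KVM_PV_PERF_MAX_EVENTS];
		int nr_overflows;
	};

	static struct perf_event *vcpu_perf_event(struct kvm_vcpu *vcpu, int id)
	{
		/* the guest id is the array index: no hash table, and only
		 * the owning vcpu touches the array, so no cross-vcpu
		 * locking either */
		if (id < 0 || id >= KVM_PV_PERF_MAX_EVENTS)
			return NULL;
		return vcpu->arch.perf.events[id];
	}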

> /*
> * hypercalls use architecture specific
> --- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
> @@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> u32 exit_reason = vmx->exit_reason;
> u32 vectoring_info = vmx->idt_vectoring_info;
> + int ret;
>
> trace_kvm_exit(exit_reason, vcpu);
>
> @@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
>
> if (exit_reason < kvm_vmx_max_exit_handlers
> && kvm_vmx_exit_handlers[exit_reason])
> - return kvm_vmx_exit_handlers[exit_reason](vcpu);
> + ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> else {
> vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> vcpu->run->hw.hardware_exit_reason = exit_reason;
> + ret = 0;
> }
> - return 0;
> +
> + /* sync paravirt perf event to guest */
> + kvm_sync_events_to_guest(vcpu);
>

Why do that every exit?

Why in vmx specific code?

> @@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
> case KVM_CAP_PCI_SEGMENT:
> case KVM_CAP_DEBUGREGS:
> case KVM_CAP_X86_ROBUST_SINGLESTEP:
> + case KVM_CAP_PV_PERF:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
>

You can use KVM_CAP_PV_PERF to report the number of supported events to
userspace.
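[Sketch only, reusing the KVM_PV_PERF_MAX_EVENTS constant assumed above: kvm_dev_ioctl_check_extension() could return the event count instead of just 1:]

	case KVM_CAP_PV_PERF:
		r = KVM_PV_PERF_MAX_EVENTS;	/* 0 would mean "not supported" */
		break;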

>
> --- linux-2.6_tip0620/arch/x86/kvm/kvmperf_event.c 1970-01-01 08:00:00.000000000 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/kvmperf_event.c 2010-06-21 16:49:29.509999849 +0800
>

No need for kvm prefix, we're in the kvm directory.
> +
> +#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
>

This is really high. I don't think it's necessary, or useful since the
underlying hardware has much fewer events, and since the guest can
multiplex events itself.

> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> + struct perf_event *host_event)
> +{
> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> + struct guest_perf_event counter;
> + int ret;
> + s32 overflows;
> +
> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> + &counter, sizeof(counter));
> + if (ret < 0)
> + return;
>

Need better error handling.

> +
> +again:
> + overflows = atomic_read(&shadow->counter.overflows);
> + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
> + overflows)
> + goto again;
>

Can just use atomic_xchg() here.
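[That is, the read/cmpxchg/retry loop above collapses to a single exchange, roughly:]

	overflows = atomic_xchg(&shadow->counter.overflows, 0);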

> +
> + counter.count = shadow->counter.count;
> + atomic_add(overflows, &counter.overflows);
> +
> + kvm_write_guest(vcpu->kvm,
> + shadow->guest_event_addr,
> + &counter,
> + sizeof(counter));
>

kvm_write_guest() is _very_ nonatomic...

Need error handling.

> +
> +/* Just copy perf_event->count to guest. Don't copy overflows to guest */
> +static void
> +kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
> +{
> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> +
> + shadow->counter.count = local64_read(&host_event->count);
> + kvm_write_guest(vcpu->kvm,
> + shadow->guest_event_addr,
> + &shadow->counter.count,
> + sizeof(shadow->counter.count));
>

Error handling.

> + return;
> +}
> +
> +static int
> +kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
> +{
> + int ret = 0;
> + struct perf_event *host_event = NULL;
> + struct host_perf_shadow *shadow = NULL;
> + struct guest_perf_event_param param;
> + struct guest_perf_attr *guest_attr = NULL;
> + struct perf_event_attr *attr = NULL;
> + int next_count;
> +
> + next_count = atomic_read(&vcpu->kvm->arch.kvm_pv_event_num);
> + if (next_count >= KVM_MAX_PARAVIRT_PERF_EVENT) {
> + WARN_ONCE(1, "guest os wants to open more than %d events\n",
> + KVM_MAX_PARAVIRT_PERF_EVENT);
> + return -ENOENT;
>

Need to distinguish between guest errors (bad parameters) or host errors
(-ENOMEM) here. Guest errors need to be returned to the guest, while
host errors need to be propagated to userspace (which called
ioctl(KVM_VCPU_RUN) some time ago).
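[A sketch of that distinction, not from the thread; the control flow and error values are illustrative, with KVM_EFAULT being one of the guest-visible error codes from kvm_para.h:]

	/* guest-caused error: complete the hypercall and return the error
	 * in the guest's result register */
	if (kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param)) < 0) {
		*result = (unsigned long)-KVM_EFAULT;
		return 0;
	}

	/* host-side error: fail the emulation so it propagates to the
	 * userspace caller of ioctl(KVM_RUN) */
	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
	if (!attr)
		return -ENOMEM;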

> + }
> + atomic_inc(&vcpu->kvm->arch.kvm_pv_event_num);
> +
> + attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> + if (!attr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
> + if (!attr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
> + if (ret < 0)
> + goto out;
> +
> + host_event = kvm_find_get_host_event(vcpu, param.id, 0);
> + if (host_event) {
> + kvm_put_host_event(host_event);
> + return -EEXIST;
> + }
>

> + ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
> + guest_attr, sizeof(*guest_attr));
> + if (ret < 0)
> + goto out;
> +
> + attr->type = guest_attr->type;
> + attr->config = guest_attr->config;
> + attr->sample_period = guest_attr->sample_period;
> + attr->read_format = guest_attr->read_format;
> + attr->flags = guest_attr->flags;
> + attr->bp_type = guest_attr->bp_type;
> + attr->bp_addr = guest_attr->bp_addr;
> + attr->bp_len = guest_attr->bp_len;
>

Needs tons of parameter validation.
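[For instance, and illustrative only, not an exhaustive set of checks, written in the open function's ret/goto idiom:]

	if (guest_attr->type >= PERF_TYPE_MAX) {
		ret = -EINVAL;
		goto out;
	}
	/* likewise bound-check config, sample_period, read_format, flags
	 * and the breakpoint fields before filling in *attr */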

> + /*
> + * By default, we disable the host event. Later on, guets os
> + * triggers a perf_event_attach to enable it
> + */
> + attr->disabled = 1;
> + attr->inherit = 0;
> + attr->enable_on_exec = 0;
> + /*
> + * We don't support exclude mode of user and kernel for guest os,
> + * which mean we always collect both user and kernel for guest os
> + */
> + attr->exclude_user = 0;
> + attr->exclude_kernel = 0;
>

First, if we don't support it, we should error out when the guest
specifies it. Don't lie to the guest.

Second, why can't we support it? should work for the guest just as it
does for us.

> +
> + shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
> + if (!shadow) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + shadow->id = param.id;
> + shadow->guest_event_addr = param.guest_event_addr;
> + shadow->vcpu = vcpu;
> + INIT_LIST_HEAD(&shadow->shadow_entry);
> +
> + /* We always create a cpu context host perf event */
> + host_event = perf_event_create_kernel_counter(attr, -1,
> + current->pid, kvm_perf_event_overflow);
>

What does 'cpu context' mean in this context?

> +
> + if (IS_ERR(host_event)) {
> + host_event = NULL;
> + ret = -1;
> + goto out;
> + }
> + host_event->host_perf_shadow = shadow;
> + shadow->host_event = host_event;
> + atomic_set(&shadow->ref_counter, 1);
> + kvm_add_host_event(vcpu, shadow);
> +
> +out:
> + if (!host_event)
> + kfree(shadow);
> +
> + kfree(attr);
> + kfree(guest_attr);
> +
> + if (ret)
> + atomic_dec(&vcpu->kvm->arch.kvm_pv_event_num);
> +
> + return ret;
> +}
> +
>
>

--
error compiling committee.c: too many arguments to function

2010-06-21 13:57:27

by Gleb Natapov

Subject: Re: [PATCH V2 3/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

On Mon, Jun 21, 2010 at 05:31:43PM +0800, Zhang, Yanmin wrote:
> The 3rd patch is to implement para virt perf at host kernel.
>
> Signed-off-by: Zhang Yanmin <[email protected]>
>
> ---
>
> --- linux-2.6_tip0620/arch/x86/include/asm/kvm_para.h 2010-06-21 15:19:38.992999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_para.h 2010-06-21 15:21:39.308999849 +0800
> @@ -2,6 +2,7 @@
> #define _ASM_X86_KVM_PARA_H
>
> #include <linux/types.h>
> +#include <linux/list.h>
> #include <asm/hyperv.h>
>
> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
> @@ -19,7 +20,8 @@
> /* This indicates that the new set of kvmclock msrs
> * are available. The use of 0x11 and 0x12 is deprecated
> */
> -#define KVM_FEATURE_CLOCKSOURCE2 3
> +#define KVM_FEATURE_CLOCKSOURCE2 3
> +#define KVM_FEATURE_PV_PERF 4
>
> /* The last 8 bits are used to indicate how to interpret the flags field
> * in pvclock structure. If no bits are set, all flags are ignored.
> @@ -33,7 +35,14 @@
> #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
> #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
>
> -#define KVM_MAX_MMU_OP_BATCH 32
> +#define KVM_MAX_MMU_OP_BATCH 32
> +
> +/* Operations for KVM_PERF_OP */
> +#define KVM_PERF_OP_OPEN 1
> +#define KVM_PERF_OP_CLOSE 2
> +#define KVM_PERF_OP_ENABLE 3
> +#define KVM_PERF_OP_DISABLE 4
> +#define KVM_PERF_OP_READ 5
>
> /* Operations for KVM_HC_MMU_OP */
> #define KVM_MMU_OP_WRITE_PTE 1
> @@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
> #ifdef __KERNEL__
> #include <asm/processor.h>
>
> +/*
> + * data communication area about perf_event between
> + * Host kernel and guest kernel
> + */
> +struct guest_perf_event {
> + u64 count;
> + atomic_t overflows;
> +};
> +
> +/*
> + * In host kernel, perf_event->host_perf_shadow points to
> + * host_perf_shadow which records some information
> + * about the guest.
> + */
> +struct host_perf_shadow {
> + /* guest perf_event id passed from guest os */
> + int id;
> + /*
> + * Host kernel saves data into data member counter firstly.
> + * kvm will get data from this counter and calls kvm functions
> + * to copy or add data back to guets os before entering guest os
> + * next time
> + */
> + struct guest_perf_event counter;
> + /* guest_event_addr is gpa_t pointing to guest os guest_perf_event*/
> + __u64 guest_event_addr;
> +
> + /*
> + * Link to of kvm.kvm_arch.shadow_hash_table
> + */
> + struct list_head shadow_entry;
> + struct kvm_vcpu *vcpu;
> +
> + struct perf_event *host_event;
> + /*
> + * Below counter is to prevent malicious guest os to try to
> + * close/enable event at the same time.
> + */
> + atomic_t ref_counter;
> +};
> +
> +/*
> + * In guest kernel, perf_event->guest_shadow points to
> + * guest_perf_shadow which records some information
> + * about the guest.
> + */
> +struct guest_perf_shadow {
> + /* guest perf_event id passed from guest os */
> + int id;
> + /*
> + * Host kernel kvm saves data into data member counter
> + */
> + struct guest_perf_event counter;
> +};
> +
> +/*
> + * guest_perf_attr is used when guest calls hypercall to
> + * open a new perf_event at host side. Mostly, it's a copy of
> + * perf_event_attr and deletes something not used by host kernel.
> + */
> +struct guest_perf_attr {
> + __u32 type;
> + __u64 config;
> + __u64 sample_period;
> + __u64 sample_type;
> + __u64 read_format;
> + __u64 flags;
> + __u32 bp_type;
> + __u64 bp_addr;
> + __u64 bp_len;
> +};
> +
> +struct guest_perf_event_param {
> + __u64 attr_addr;
> + __u64 guest_event_addr;
> + /* In case there is an alignment issue, we put id as the last one */
> + int id;
> +};
> +
> extern void kvmclock_init(void);
>
>
> --- linux-2.6_tip0620/arch/x86/include/asm/kvm_host.h 2010-06-21 15:19:39.019999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_host.h 2010-06-21 15:21:39.308999849 +0800
> @@ -24,6 +24,7 @@
> #include <asm/desc.h>
> #include <asm/mtrr.h>
> #include <asm/msr-index.h>
> +#include <asm/perf_event.h>
>
> #define KVM_MAX_VCPUS 64
> #define KVM_MEMORY_SLOTS 32
> @@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
>
> /* fields used by HYPER-V emulation */
> u64 hv_vapic;
> +
> + /*
> + * Fields used by PARAVIRT perf interface:
> + *
> + * kvm checks overflow_events before entering guest os,
> + * and copy data back to guest os.
> + * event_mutex is to avoid a race between NMI perf event overflow
> + * handler, event close, and enable/disable.
> + */
> + struct mutex event_mutex;
> + int overflows;
> + struct perf_event *overflow_events[X86_PMC_IDX_MAX];
> };
>
> struct kvm_mem_alias {
> @@ -377,6 +390,9 @@ struct kvm_mem_aliases {
> int naliases;
> };
>
> +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS (10)
> +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
> +
> struct kvm_arch {
> struct kvm_mem_aliases *aliases;
>
> @@ -415,6 +431,15 @@ struct kvm_arch {
> /* fields used by HYPER-V emulation */
> u64 hv_guest_os_id;
> u64 hv_hypercall;
> +
> + /*
> + * fields used by PARAVIRT perf interface:
> + * Used to organize all host perf_events representing guest
> + * perf_event on a specific kvm instance
> + */
> + atomic_t kvm_pv_event_num;
> + spinlock_t shadow_lock;
> + struct list_head *shadow_hash_table;
> };
>
> struct kvm_vm_stat {
> @@ -561,6 +586,9 @@ int emulator_write_phys(struct kvm_vcpu
> const void *val, int bytes);
> int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
> gpa_t addr, unsigned long *ret);
> +int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
> + unsigned long a2, unsigned long *result);
> +
> u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
>
> extern bool tdp_enabled;
> --- linux-2.6_tip0620/include/linux/kvm_para.h 2010-06-21 15:19:53.309999849 +0800
> +++ linux-2.6_tip0620perfkvm/include/linux/kvm_para.h 2010-06-21 15:21:39.312999849 +0800
> @@ -17,6 +17,7 @@
>
> #define KVM_HC_VAPIC_POLL_IRQ 1
> #define KVM_HC_MMU_OP 2
> +#define KVM_PERF_OP 3
>
> /*
> * hypercalls use architecture specific
> --- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
> @@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> u32 exit_reason = vmx->exit_reason;
> u32 vectoring_info = vmx->idt_vectoring_info;
> + int ret;
>
> trace_kvm_exit(exit_reason, vcpu);
>
> @@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
>
> if (exit_reason < kvm_vmx_max_exit_handlers
> && kvm_vmx_exit_handlers[exit_reason])
> - return kvm_vmx_exit_handlers[exit_reason](vcpu);
> + ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> else {
> vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> vcpu->run->hw.hardware_exit_reason = exit_reason;
> + ret = 0;
> }
> - return 0;
> +
> + /* sync paravirt perf event to guest */
> + kvm_sync_events_to_guest(vcpu);
> +
> + return ret;
> }
>
> static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> --- linux-2.6_tip0620/arch/x86/kvm/x86.c 2010-06-21 15:19:39.315999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.c 2010-06-21 16:49:58.182999849 +0800
> @@ -6,12 +6,14 @@
> * Copyright (C) 2006 Qumranet, Inc.
> * Copyright (C) 2008 Qumranet, Inc.
> * Copyright IBM Corporation, 2008
> + * Copyright Intel Corporation, 2010
> *
> * Authors:
> * Avi Kivity <[email protected]>
> * Yaniv Kamay <[email protected]>
> * Amit Shah <[email protected]>
> * Ben-Ami Yassour <[email protected]>
> + * Yanmin Zhang <[email protected]>
> *
> * This work is licensed under the terms of the GNU GPL, version 2. See
> * the COPYING file in the top-level directory.
> @@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
> case KVM_CAP_PCI_SEGMENT:
> case KVM_CAP_DEBUGREGS:
> case KVM_CAP_X86_ROBUST_SINGLESTEP:
> + case KVM_CAP_PV_PERF:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
> @@ -1993,7 +1996,9 @@ static void do_cpuid_ent(struct kvm_cpui
> entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
> (1 << KVM_FEATURE_NOP_IO_DELAY) |
> (1 << KVM_FEATURE_CLOCKSOURCE2) |
> + (1 << KVM_FEATURE_PV_PERF) |
> (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
> +
> entry->ebx = 0;
> entry->ecx = 0;
> entry->edx = 0;
> @@ -4052,10 +4057,21 @@ static unsigned long kvm_get_guest_ip(vo
> return ip;
> }
>
> +int kvm_notify_event_overflow(void)
> +{
> + if (percpu_read(current_vcpu)) {
> + kvm_inject_nmi(percpu_read(current_vcpu));
> + return 0;
> + }
> +
> + return -1;
> +}
> +
> static struct perf_guest_info_callbacks kvm_guest_cbs = {
> .is_in_guest = kvm_is_in_guest,
> .is_user_mode = kvm_is_user_mode,
> .get_guest_ip = kvm_get_guest_ip,
> + .copy_event_to_shadow = kvm_copy_event_to_shadow,
> };
>
> void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
> @@ -4138,15 +4154,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vc
> }
> EXPORT_SYMBOL_GPL(kvm_emulate_halt);
>
> -static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
> - unsigned long a1)
> -{
> - if (is_long_mode(vcpu))
> - return a0;
> - else
> - return a0 | ((gpa_t)a1 << 32);
> -}
> -
> int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
> {
> u64 param, ingpa, outgpa, ret;
> @@ -4245,6 +4252,9 @@ int kvm_emulate_hypercall(struct kvm_vcp
> case KVM_HC_MMU_OP:
> r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
> break;
> + case KVM_PERF_OP:
> + r = kvm_pv_perf_op(vcpu, a0, a1, a2, &ret);
> + break;
> default:
> ret = -KVM_ENOSYS;
> break;
> @@ -5334,6 +5344,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
> }
> vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
>
> + mutex_init(&vcpu->arch.event_mutex);
> +
> return 0;
> fail_free_lapic:
> kvm_free_lapic(vcpu);
> @@ -5360,6 +5372,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp
> struct kvm *kvm_arch_create_vm(void)
> {
> struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
> + struct list_head *hash_table;
> + int i;
>
> if (!kvm)
> return ERR_PTR(-ENOMEM);
> @@ -5369,6 +5383,18 @@ struct kvm *kvm_arch_create_vm(void)
> kfree(kvm);
> return ERR_PTR(-ENOMEM);
> }
> + hash_table = kmalloc(sizeof(struct list_head) *
> + KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM,
> + GFP_KERNEL);
> + if (!hash_table) {
> + kfree(kvm->arch.aliases);
> + kfree(kvm);
> + return ERR_PTR(-ENOMEM);
> + }
> + for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++)
> + INIT_LIST_HEAD(&hash_table[i]);
> + kvm->arch.shadow_hash_table = hash_table;
> + spin_lock_init(&kvm->arch.shadow_lock);
>
> INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
> INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
> @@ -5416,6 +5442,8 @@ void kvm_arch_sync_events(struct kvm *kv
>
> void kvm_arch_destroy_vm(struct kvm *kvm)
> {
> + kvm_remove_all_perf_events(kvm);
> +
> kvm_iommu_unmap_guest(kvm);
> kvm_free_pit(kvm);
> kfree(kvm->arch.vpic);
> @@ -5427,6 +5455,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm
> if (kvm->arch.ept_identity_pagetable)
> put_page(kvm->arch.ept_identity_pagetable);
> cleanup_srcu_struct(&kvm->srcu);
> + kfree(kvm->arch.shadow_hash_table);
> kfree(kvm->arch.aliases);
> kfree(kvm);
> }
> --- linux-2.6_tip0620/arch/x86/kvm/x86.h 2010-06-21 15:19:39.311999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.h 2010-06-21 15:21:39.312999849 +0800
> @@ -72,7 +72,20 @@ static inline struct kvm_mem_aliases *kv
> || lockdep_is_held(&kvm->slots_lock));
> }
>
> +static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
> + unsigned long a1)
> +{
> + if (is_long_mode(vcpu))
> + return a0;
> + else
> + return a0 | ((gpa_t)a1 << 32);
> +}
> +
> void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
> void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
> +int kvm_notify_event_overflow(void);
> +void kvm_copy_event_to_shadow(struct perf_event *event, int overflows);
> +void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu);
> +void kvm_remove_all_perf_events(struct kvm *kvm);
>
> #endif
> --- linux-2.6_tip0620/arch/x86/kvm/Makefile 2010-06-21 15:19:39.311999849 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/Makefile 2010-06-21 15:21:39.310999849 +0800
> @@ -11,7 +11,7 @@ kvm-y += $(addprefix ../../../virt/kvm
> kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
>
> kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
> - i8254.o timer.o
> + i8254.o timer.o kvmperf_event.o
> kvm-intel-y += vmx.o
> kvm-amd-y += svm.o
>
> --- linux-2.6_tip0620/arch/x86/kvm/kvmperf_event.c 1970-01-01 08:00:00.000000000 +0800
> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/kvmperf_event.c 2010-06-21 16:49:29.509999849 +0800
> @@ -0,0 +1,471 @@
> +/*
> + * Performance events x86 kvm para architecture code
> + *
> + * Copyright (C) 2010 Intel Inc.
> + * Zhang Yanmin <[email protected]>
> + *
> + * For licencing details see kernel-base/COPYING
> + */
> +
> +#include <linux/perf_event.h>
> +#include <linux/capability.h>
> +#include <linux/notifier.h>
> +#include <linux/hardirq.h>
> +#include <linux/kprobes.h>
> +#include <linux/module.h>
> +#include <linux/kdebug.h>
> +#include <linux/sched.h>
> +#include <linux/uaccess.h>
> +#include <linux/slab.h>
> +#include <linux/highmem.h>
> +#include <linux/cpu.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/file.h>
> +#include <linux/syscalls.h>
> +#include <linux/init.h>
> +#include <linux/hash.h>
> +
> +#include <asm/apic.h>
> +#include <asm/stacktrace.h>
> +#include <asm/nmi.h>
> +#include <asm/compat.h>
> +
> +#include "x86.h"
> +
> +#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
> +
> +static inline u32 shadow_hash_id(int id)
> +{
> + u32 hash_value = id;
> +
> + hash_value = hash_32(hash_value, KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS);
> + return hash_value;
> +}
> +
> +static int kvm_add_host_event(struct kvm_vcpu *vcpu,
> + struct host_perf_shadow *host_shadow)
> +{
> + long unsigned flags;
> + u32 index = shadow_hash_id(host_shadow->id);
> + struct kvm_arch *arch = &vcpu->kvm->arch;
> + struct list_head *head = &arch->shadow_hash_table[index];
> + struct list_head *pos;
> + struct host_perf_shadow *tmp;
> +
> + spin_lock_irqsave(&arch->shadow_lock, flags);
> + list_for_each(pos, head) {
> + tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
> + WARN(tmp->id == host_shadow->id, "%s called when there is an"
> + " item with the same id [%d] in hash table,",
> + __func__, host_shadow->id);
> + }
> + list_add(&host_shadow->shadow_entry, head);
> + spin_unlock_irqrestore(&arch->shadow_lock, flags);
> + return 0;
> +}
> +
> +static struct perf_event *
> +kvm_find_get_host_event(struct kvm_vcpu *vcpu, int id, int need_delete)
> +{
> + long unsigned flags;
> + u32 index = shadow_hash_id(id);
> + struct kvm_arch *arch = &vcpu->kvm->arch;
> + struct list_head *head = &arch->shadow_hash_table[index];
> + struct list_head *pos;
> + struct host_perf_shadow *tmp = NULL;
> + int found = 0;
> +
> + spin_lock_irqsave(&arch->shadow_lock, flags);
> + list_for_each(pos, head) {
> + tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
> + if (tmp->id == id) {
> + found = 1;
> + if (need_delete)
> + list_del_init(&tmp->shadow_entry);
> + else
> + atomic_inc(&tmp->ref_counter);
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&arch->shadow_lock, flags);
> +
> + if (found)
> + return tmp->host_event;
> + else
> + return NULL;
> +}
> +
> +static void kvm_vcpu_add_event_overflow_ref(struct perf_event *event)
> +{
> + struct host_perf_shadow *host_shadow = event->host_perf_shadow;
> + struct kvm_vcpu *vcpu = host_shadow->vcpu;
> + int ret;
> +
> + /*
> + * Use trylock as it's in NMI handler. We don't care
> + * too much to lose reporting once of one event to guets os,
> + * because host saves overflows counter in host_perf_shadow.
> + * Next time when a new overflow of the event happens and if
> + * there is no contention, host could push overflows to guest
> + * and guest could process also saved overflows.
> + */
> + ret = mutex_trylock(&vcpu->arch.event_mutex);
> + if (!ret)
> + return;
> + if (vcpu->arch.overflows < X86_PMC_IDX_MAX) {
> + vcpu->arch.overflow_events[vcpu->arch.overflows] = event;
> + vcpu->arch.overflows++;
> + }
> + mutex_unlock(&vcpu->arch.event_mutex);
> +}
> +
> +static int kvm_vcpu_remove_event_overflow_ref(struct host_perf_shadow *shadow)
> +{
> + struct kvm_vcpu *vcpu = shadow->vcpu;
> + int i;
> +
> + if (!vcpu || !vcpu->arch.overflows)
> + return -1;
> +
> + mutex_lock(&vcpu->arch.event_mutex);
> + for (i = 0; i < vcpu->arch.overflows; i++) {
> + if (vcpu->arch.overflow_events[i] == shadow->host_event)
> + vcpu->arch.overflow_events[i] = NULL;
> + }
> + mutex_unlock(&vcpu->arch.event_mutex);
> + return 0;
> +}
> +
> +void kvm_copy_event_to_shadow(struct perf_event *event, int overflows)
> +{
> + struct host_perf_shadow *shadow = event->host_perf_shadow;
> +
> + shadow->counter.count = local64_read(&event->count);
> + atomic_add(overflows, &shadow->counter.overflows);
> + kvm_vcpu_add_event_overflow_ref(event);
> + /* Inject NMI to guest os */
> + kvm_notify_event_overflow();
> +}
> +
> +static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
> + struct perf_sample_data *data, struct pt_regs *regs)
> +{
> + BUG_ON(event->host_perf_shadow == NULL);
> + kvm_copy_event_to_shadow(event, 1);
> +}
> +
> +static void kvm_put_host_event(struct perf_event *host_event)
> +{
> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> + if (!atomic_dec_return(&shadow->ref_counter)) {
> + /*
> + * detach it in case guest os doesn't disables it
> + * before closing
> + */
> + perf_event_detach(host_event);
> + kvm_vcpu_remove_event_overflow_ref(shadow);
> +
> + perf_event_release_kernel(host_event);
> + kfree(shadow);
> + atomic_dec(&shadow->vcpu->kvm->arch.kvm_pv_event_num);
> + }
> +}
> +
> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> + struct perf_event *host_event)
> +{
> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> + struct guest_perf_event counter;
> + int ret;
> + s32 overflows;
> +
> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> + &counter, sizeof(counter));
> + if (ret < 0)
> + return;
> +
> +again:
> + overflows = atomic_read(&shadow->counter.overflows);
> + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
> + overflows)
> + goto again;
> +
> + counter.count = shadow->counter.count;
> + atomic_add(overflows, &counter.overflows);
> +
> + kvm_write_guest(vcpu->kvm,
> + shadow->guest_event_addr,
> + &counter,
> + sizeof(counter));
This kind of interface worries me since it can cause bugs that are
very hard to catch. What if the guest enables some events and crashes into
a kdump kernel (or kexecs a new kernel) without resetting the HW? Now the
host may write over guest memory without the guest expecting it. Do you
handle this scenario on the guest side? I think you need to register a
reboot notifier and disable the events from there.
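[A guest-side sketch of that suggestion, not from the thread; pv_perf_close_all_events is a hypothetical helper issuing KVM_PERF_OP_CLOSE for every open id, and a panic/crash notifier would also be needed to cover the kdump path, since crash_kexec does not run reboot notifiers:]

	#include <linux/notifier.h>
	#include <linux/reboot.h>

	/* stop the host from writing into guest memory before we kexec
	 * into a new kernel */
	static int pv_perf_reboot(struct notifier_block *nb,
				  unsigned long action, void *data)
	{
		pv_perf_close_all_events();
		return NOTIFY_DONE;
	}

	static struct notifier_block pv_perf_reboot_nb = {
		.notifier_call = pv_perf_reboot,
	};

	/* during guest pv-perf init: */
	register_reboot_notifier(&pv_perf_reboot_nb);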

> + return;
> +}
> +
> +/*
> + * called by KVM to copy both perf_event->count and overflows to guest
> + * after host NMI handler detects guest perf_event overflows
> + */
> +void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu)
> +{
> + int i;
> +
> + if (vcpu->arch.overflows == 0)
> + return;
> +
> + mutex_lock(&vcpu->arch.event_mutex);
> + for (i = 0; i < vcpu->arch.overflows; i++) {
> + if (vcpu->arch.overflow_events[i]) {
> + kvm_copy_event_to_guest(vcpu,
> + vcpu->arch.overflow_events[i]);
> + }
> + }
> + vcpu->arch.overflows = 0;
> + mutex_unlock(&vcpu->arch.event_mutex);
> +}
> +EXPORT_SYMBOL_GPL(kvm_sync_events_to_guest);
> +
> +/* Just copy perf_event->count to guest. Don't copy overflows to guest */
> +static void
> +kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
> +{
> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> +
> + shadow->counter.count = local64_read(&host_event->count);
> + kvm_write_guest(vcpu->kvm,
> + shadow->guest_event_addr,
> + &shadow->counter.count,
> + sizeof(shadow->counter.count));
> + return;
> +}
> +
> +static int
> +kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
> +{
> + int ret = 0;
> + struct perf_event *host_event = NULL;
> + struct host_perf_shadow *shadow = NULL;
> + struct guest_perf_event_param param;
> + struct guest_perf_attr *guest_attr = NULL;
> + struct perf_event_attr *attr = NULL;
> + int next_count;
> +
> + next_count = atomic_read(&vcpu->kvm->arch.kvm_pv_event_num);
> + if (next_count >= KVM_MAX_PARAVIRT_PERF_EVENT) {
> + WARN_ONCE(1, "guest os wants to open more than %d events\n",
> + KVM_MAX_PARAVIRT_PERF_EVENT);
> + return -ENOENT;
> + }
> + atomic_inc(&vcpu->kvm->arch.kvm_pv_event_num);
> +
> + attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> + if (!attr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
> + if (!attr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
> + if (ret < 0)
> + goto out;
> +
> + host_event = kvm_find_get_host_event(vcpu, param.id, 0);
> + if (host_event) {
> + kvm_put_host_event(host_event);
> + return -EEXIST;
> + }
> +
> + ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
> + guest_attr, sizeof(*guest_attr));
> + if (ret < 0)
> + goto out;
> +
> + attr->type = guest_attr->type;
> + attr->config = guest_attr->config;
> + attr->sample_period = guest_attr->sample_period;
> + attr->read_format = guest_attr->read_format;
> + attr->flags = guest_attr->flags;
> + attr->bp_type = guest_attr->bp_type;
> + attr->bp_addr = guest_attr->bp_addr;
> + attr->bp_len = guest_attr->bp_len;
> + /*
> + * By default, we disable the host event. Later on, guets os
> + * triggers a perf_event_attach to enable it
> + */
> + attr->disabled = 1;
> + attr->inherit = 0;
> + attr->enable_on_exec = 0;
> + /*
> + * We don't support exclude mode of user and kernel for guest os,
> + * which mean we always collect both user and kernel for guest os
> + */
> + attr->exclude_user = 0;
> + attr->exclude_kernel = 0;
> +
> + shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
> + if (!shadow) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + shadow->id = param.id;
> + shadow->guest_event_addr = param.guest_event_addr;
> + shadow->vcpu = vcpu;
> + INIT_LIST_HEAD(&shadow->shadow_entry);
> +
> + /* We always create a cpu context host perf event */
> + host_event = perf_event_create_kernel_counter(attr, -1,
> + current->pid, kvm_perf_event_overflow);
> +
> + if (IS_ERR(host_event)) {
> + host_event = NULL;
> + ret = -1;
> + goto out;
> + }
> + host_event->host_perf_shadow = shadow;
> + shadow->host_event = host_event;
> + atomic_set(&shadow->ref_counter, 1);
> + kvm_add_host_event(vcpu, shadow);
> +
> +out:
> + if (!host_event)
> + kfree(shadow);
> +
> + kfree(attr);
> + kfree(guest_attr);
> +
> + if (ret)
> + atomic_dec(&vcpu->kvm->arch.kvm_pv_event_num);
> +
> + return ret;
> +}
> +
> +static int kvm_pv_perf_op_close(struct kvm_vcpu *vcpu, int id)
> +{
> + struct perf_event *host_event;
> +
> + /* Find and delete the event from the hashtable */
> + host_event = kvm_find_get_host_event(vcpu, id, 1);
> + if (!host_event)
> + return -1;
> + kvm_put_host_event(host_event);
> + return 0;
> +}
> +
> +static int kvm_pv_perf_op_enable(struct kvm_vcpu *vcpu, int id)
> +{
> + struct perf_event *event;
> + struct host_perf_shadow *shadow;
> +
> + event = kvm_find_get_host_event(vcpu, id, 0);
> + if (!event)
> + return -1;
> +
> + shadow = event->host_perf_shadow;
> + if (shadow->vcpu != vcpu) {
> + kvm_vcpu_remove_event_overflow_ref(event->host_perf_shadow);
> + shadow->vcpu = vcpu;
> + }
> +
> + perf_event_attach(event);
> + kvm_put_host_event(event);
> +
> + return 0;
> +}
> +
> +static int kvm_pv_perf_op_disable(struct kvm_vcpu *vcpu, int id)
> +{
> + struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
> + if (!host_event)
> + return -1;
> + perf_event_detach(host_event);
> + /* We sync count to guest as we delay the guest count update */
> + kvm_copy_count_to_guest(vcpu, host_event);
> + kvm_put_host_event(host_event);
> +
> + return 0;
> +}
> +
> +static int kvm_pv_perf_op_read(struct kvm_vcpu *vcpu, int id)
> +{
> + u64 enabled, running;
> + struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
> +
> + if (!host_event)
> + return -1;
> + if (host_event->state == PERF_EVENT_STATE_ACTIVE)
> + perf_event_read_value(host_event, &enabled, &running);
> + kvm_copy_count_to_guest(vcpu, host_event);
> + kvm_put_host_event(host_event);
> + return 0;
> +}
> +
> +int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
> + unsigned long a2, unsigned long *result)
> +{
> + unsigned long ret;
> + gpa_t addr;
> + int id;
> +
> + switch (op_code) {
> + case KVM_PERF_OP_OPEN:
> + addr = hc_gpa(vcpu, a1, a2);
> + ret = (unsigned long) kvm_pv_perf_op_open(vcpu, addr);
> + break;
> + case KVM_PERF_OP_CLOSE:
> + id = (int) a1;
> + ret = kvm_pv_perf_op_close(vcpu, id);
> + break;
> + case KVM_PERF_OP_ENABLE:
> + id = (int) a1;
> + ret = kvm_pv_perf_op_enable(vcpu, id);
> + break;
> + case KVM_PERF_OP_DISABLE:
> + id = (int) a1;
> + ret = kvm_pv_perf_op_disable(vcpu, id);
> + break;
> + case KVM_PERF_OP_READ:
> + id = (int) a1;
> + ret = kvm_pv_perf_op_read(vcpu, id);
> + break;
> + default:
> + ret = -KVM_ENOSYS;
> + }
> +
> + *result = ret;
> + return 0;
> +}
> +
> +void kvm_remove_all_perf_events(struct kvm *kvm)
> +{
> + long unsigned flags;
> + struct kvm_arch *arch = &kvm->arch;
> + LIST_HEAD(total_events);
> + struct list_head *head;
> + struct list_head *pos, *next;
> + struct host_perf_shadow *tmp;
> + int i;
> +
> + spin_lock_irqsave(&arch->shadow_lock, flags);
> + for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++) {
> + head = &arch->shadow_hash_table[i];
> + list_for_each_safe(pos, next, head) {
> + tmp = container_of(pos, struct host_perf_shadow,
> + shadow_entry);
> + list_del(&tmp->shadow_entry);
> + list_add(&tmp->shadow_entry, &total_events);
> + }
> + }
> + spin_unlock_irqrestore(&arch->shadow_lock, flags);
> + head = &total_events;
> + list_for_each_safe(pos, next, head) {
> + tmp = container_of(pos, struct host_perf_shadow, shadow_entry);
> + list_del(&tmp->shadow_entry);
> + kvm_put_host_event(tmp->host_event);
> + }
> +
> + return;
> +}
> +
> --- linux-2.6_tip0620/include/linux/kvm.h 2010-06-21 15:19:52.605999849 +0800
> +++ linux-2.6_tip0620perfkvm/include/linux/kvm.h 2010-06-21 15:21:39.312999849 +0800
> @@ -524,6 +524,7 @@ struct kvm_enable_cap {
> #define KVM_CAP_PPC_OSI 52
> #define KVM_CAP_PPC_UNSET_IRQ 53
> #define KVM_CAP_ENABLE_CAP 54
> +#define KVM_CAP_PV_PERF 57
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
>

--
Gleb.

2010-06-22 03:11:58

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Mon, 2010-06-21 at 15:33 +0300, Avi Kivity wrote:
> On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
> > The 3rd patch is to implement para virt perf at host kernel.
> >
> >
> > @@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
> > #ifdef __KERNEL__
> > #include <asm/processor.h>
> >
> >
> > +/*
> > + * In host kernel, perf_event->host_perf_shadow points to
> > + * host_perf_shadow which records some information
> > + * about the guest.
> > + */
> > +struct host_perf_shadow {
> > + /* guest perf_event id passed from guest os */
> > + int id;
> > + /*
> > + * Host kernel saves data into data member counter firstly.
> > + * kvm will get data from this counter and calls kvm functions
> > + * to copy or add data back to guets os before entering guest os
> > + * next time
> > + */
> > + struct guest_perf_event counter;
> > + /* guest_event_addr is gpa_t pointing to guest os guest_perf_event*/
> > + __u64 guest_event_addr;
> >
>
> So just use gpa_t as the type.
host_perf_shadow->guest_event_addr is a copy of guest_perf_event_param->guest_event_addr.
As the latter's type is __u64, being part of the interface between guest os and host os,
I use __u64 as the type of host_perf_shadow->guest_event_addr.

>
> > +
> > + /*
> > + * Link to of kvm.kvm_arch.shadow_hash_table
> > + */
> > + struct list_head shadow_entry;
> > + struct kvm_vcpu *vcpu;
> > +
> > + struct perf_event *host_event;
> > + /*
> > + * Below counter is to prevent malicious guest os to try to
> > + * close/enable event at the same time.
> > + */
> > + atomic_t ref_counter;
> >
>
> If events are made per-vcpu (like real hardware), races become impossible.
This design is to deal with a task context perf collection in guest os.
Scenario 1:
1) guest os starts to collect statistics of process A on vcpu 0;
2) process A is scheduled to vcpu 1. Then, the perf_event at host side need
to be moved to VCPU 1 's thread. With the per KVM instance design, we needn't
move host_perf_shadow among vcpus.

Scenario 2:
1) guest os creates a perf_event at host side on vcpu 0;
2) malicious guest os calls close to delete the host perf_event on vcpu 1, but
enables the perf_event on vcpu0 at the same time. When close thread runs to get the
host_perf_shadow from the list, enable thread also gets it. Then, close thread
deletes the perf_event, and enable thread will cause host kernel panic when using
host_perf_shadow.
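
To make the race concrete: lookups are done under shadow_lock and take a
reference, and the last put releases the host event. Roughly like the sketch
below (a simplified illustration of the intended logic, not the literal patch
code):

        /* lookup side, under arch->shadow_lock, in the hash bucket for id */
        if (shadow->id == id) {
                if (del)
                        /* close: unhash and inherit the initial reference */
                        list_del_init(&shadow->shadow_entry);
                else
                        /* enable/disable/read: take an extra reference */
                        atomic_inc(&shadow->ref_counter);
                event = shadow->host_event;
        }

        /* put side */
        static void kvm_put_host_event(struct perf_event *event)
        {
                struct host_perf_shadow *shadow = event->host_perf_shadow;

                if (atomic_dec_and_test(&shadow->ref_counter)) {
                        perf_event_release_kernel(event);
                        kfree(shadow);
                }
        }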


>
> > +};
> >
>
> Please move this structure to include/linux/kvm_host.h. No need to spam
> kvm_para.h. Note it's not x86 specific (though you can leave arch
> enabling to arch maintainers).
Ok. Originally, I wanted to do so, but I'm afraid other arch might be not happy.

>
> > +
> > +/*
> > + * In guest kernel, perf_event->guest_shadow points to
> > + * guest_perf_shadow which records some information
> > + * about the guest.
> > + */
> > +struct guest_perf_shadow {
> > + /* guest perf_event id passed from guest os */
> > + int id;
> > + /*
> > + * Host kernel kvm saves data into data member counter
> > + */
> > + struct guest_perf_event counter;
> > +};
> >
>
> Don't ordinary perf structures already have a counter ID which we can reuse?
No. On the other hand, even if we assumed generic perf had one, we couldn't use it,
because a generic perf id (actually there is none) would be host kernel system-wide,
while here guest_perf_shadow->id is per kvm instance.

>
> > +
> > +/*
> > + * guest_perf_attr is used when guest calls hypercall to
> > + * open a new perf_event at host side. Mostly, it's a copy of
> > + * perf_event_attr and deletes something not used by host kernel.
> > + */
> > +struct guest_perf_attr {
> > + __u32 type;
> > + __u64 config;
> > + __u64 sample_period;
> > + __u64 sample_type;
> > + __u64 read_format;
> > + __u64 flags;
> > + __u32 bp_type;
> > + __u64 bp_addr;
> > + __u64 bp_len;
> > +};
> >
>
> This is really not a guest or host structure, it's part of the
> interface. So please rename it (and similar) kvm_pv_perf_*.
Good idea.

>
> > @@ -24,6 +24,7 @@
> > #include <asm/desc.h>
> > #include <asm/mtrr.h>
> > #include <asm/msr-index.h>
> > +#include <asm/perf_event.h>
> >
> > #define KVM_MAX_VCPUS 64
> > #define KVM_MEMORY_SLOTS 32
> > @@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
> >
> > /* fields used by HYPER-V emulation */
> > u64 hv_vapic;
> > +
> > + /*
> > + * Fields used by PARAVIRT perf interface:
> > + *
> > + * kvm checks overflow_events before entering guest os,
> > + * and copy data back to guest os.
> > + * event_mutex is to avoid a race between NMI perf event overflow
> > + * handler, event close, and enable/disable.
> > + */
> > + struct mutex event_mutex;
> >
>
> No race can exist. The host NMI handler cannot take any mutex
We use a mutex_trylock in the NMI handler. If it can't get the lock, an NMI miss
happens, but the host kernel still updates perf_event->host_perf_shadow.counter,
so the overflow data will be accumulated.
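
The idea in the overflow callback is roughly the following (a sketch of the
behaviour described above, not the literal code):

        static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
                                            struct perf_sample_data *data,
                                            struct pt_regs *regs)
        {
                struct host_perf_shadow *shadow = event->host_perf_shadow;
                struct kvm_vcpu *vcpu = shadow->vcpu;

                /* always accumulate, even if this NMI can't queue the event */
                shadow->counter.count = local64_read(&event->count);
                atomic_inc(&shadow->counter.overflows);

                /* NMI context: trylock only; a miss just delays the sync to guest */
                if (mutex_trylock(&vcpu->arch.event_mutex)) {
                        if (vcpu->arch.overflows < X86_PMC_IDX_MAX)
                                vcpu->arch.overflow_events[vcpu->arch.overflows++] = event;
                        mutex_unlock(&vcpu->arch.event_mutex);
                }
        }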

> so it
> must be immune to races. The guest NMI handlers and callbacks are all
> serialized by the guest itself.
This is to guard against a malicious guest os kernel. Just like what I mentioned above,
the race might happen when:
1) the NMI handler accesses it;
2) the vmx_handle_exit code accesses overflow_events to sync data to guest os;
3) another vcpu thread of the same guest os calls close to delete the perf_event.

>
> > + int overflows;
> > + struct perf_event *overflow_events[X86_PMC_IDX_MAX];
> > };
> >
>
> KVM_PV_PERF_MAX_EVENTS (which needs to be exposed to the guest via cpuid).
Ok.

>
> >
> > struct kvm_mem_alias {
> > @@ -377,6 +390,9 @@ struct kvm_mem_aliases {
> > int naliases;
> > };
> >
> > +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS (10)
> > +#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
> >
>
> What are these?
The length of kvm_arch->shadow_hash_table.

>
> > +
> > struct kvm_arch {
> > struct kvm_mem_aliases *aliases;
> >
> > @@ -415,6 +431,15 @@ struct kvm_arch {
> > /* fields used by HYPER-V emulation */
> > u64 hv_guest_os_id;
> > u64 hv_hypercall;
> > +
> > + /*
> > + * fields used by PARAVIRT perf interface:
> > + * Used to organize all host perf_events representing guest
> > + * perf_event on a specific kvm instance
> > + */
> > + atomic_t kvm_pv_event_num;
> > + spinlock_t shadow_lock;
> > + struct list_head *shadow_hash_table;
> >
>
> Need to be per-vcpu. Also wrap in a kvm_vcpu_perf structure, the names
> are very generic.
Originally, I did so, but changed it to per kvm instance wide when considering
perf_event moving around vcpu threads.

>
> Why do we need the hash table? Use the index directly?
See above explanation.

>
> > /*
> > * hypercalls use architecture specific
> > --- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
> > +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
> > @@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
> > struct vcpu_vmx *vmx = to_vmx(vcpu);
> > u32 exit_reason = vmx->exit_reason;
> > u32 vectoring_info = vmx->idt_vectoring_info;
> > + int ret;
> >
> > trace_kvm_exit(exit_reason, vcpu);
> >
> > @@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
> >
> > if (exit_reason < kvm_vmx_max_exit_handlers
> > && kvm_vmx_exit_handlers[exit_reason])
> > - return kvm_vmx_exit_handlers[exit_reason](vcpu);
> > + ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> > else {
> > vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> > vcpu->run->hw.hardware_exit_reason = exit_reason;
> > + ret = 0;
> > }
> > - return 0;
> > +
> > + /* sync paravirt perf event to guest */
> > + kvm_sync_events_to_guest(vcpu);
> >
>
> Why do that every exit?
> Why in vmx specific code?
I could move it to the tail of vcpu_enter_guest. kvm_sync_events_to_guest
might go to sleep when going through guest os page tables, so we couldn't call it
by NMI handler.


>
> > @@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
> > case KVM_CAP_PCI_SEGMENT:
> > case KVM_CAP_DEBUGREGS:
> > case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > + case KVM_CAP_PV_PERF:
> > r = 1;
> > break;
> > case KVM_CAP_COALESCED_MMIO:
> >
>
> You can use KVM_CAP_PV_PERF to report the number of supported events to
> userspace.
Thanks for the good pointer!

>
> >
> > --- linux-2.6_tip0620/arch/x86/kvm/kvmperf_event.c 1970-01-01 08:00:00.000000000 +0800
> > +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/kvmperf_event.c 2010-06-21 16:49:29.509999849 +0800
> >
>
> No need for kvm prefix, we're in the kvm directory.
> > +
> > +#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
> >
>
> This is really high. I don't think it's necessary, or useful since the
> underlying hardware has much fewer events, and since the guest can
> multiplex events itself.
This limitation is different from the hardware PMU counter limitation. When any
application or guest os vcpu thread creates a perf_event, the host kernel has no
limitation. The kernel just arranges all perf_events in a list (not considering
the group case) and schedules them onto the PMU hardware by a round-robin method.
KVM_MAX_PARAVIRT_PERF_EVENT is to restrict a guest os instance from creating too
many perf_events at the host side, which would consume too much host kernel memory
and slow down perf_event scheduling.

>
> > +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> > + struct perf_event *host_event)
> > +{
> > + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> > + struct guest_perf_event counter;
> > + int ret;
> > + s32 overflows;
> > +
> > + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> > + &counter, sizeof(counter));
> > + if (ret < 0)
> > + return;
> >
>
> Need better error handling.
As the host kernel saves/accumulates data in perf_event->host_perf_shadow.counter,
one failure doesn't matter. Next time, when it overflows again, it will copy all
the data back to guest os.

>
> > +
> > +again:
> > + overflows = atomic_read(&shadow->counter.overflows);
> > + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
> > + overflows)
> > + goto again;
> >
>
> Can just use atomic_xchg() here.
Definitely can. Thanks.
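
That is, the cmpxchg retry loop above collapses to a single line, something like:

        /* atomically fetch and clear the accumulated overflows */
        overflows = atomic_xchg(&shadow->counter.overflows, 0);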

>
> > +
> > + counter.count = shadow->counter.count;
> > + atomic_add(overflows, &counter.overflows);
> > +
> > + kvm_write_guest(vcpu->kvm,
> > + shadow->guest_event_addr,
> > + &counter,
> > + sizeof(counter));
> >
>
> kvm_write_guest() is _very_ nonatomic...
It doesn't matter. There is only one potential race between the host kernel and the
guest kernel. When the guest vmexits to the host, it wouldn't access the data pointed
to by shadow->guest_event_addr. The above kvm_write_guest happens on the same vcpu.
So we just need to make sure the guest os vcpu accesses
guest_perf_shadow->counter.overflows atomically.

>
> Need error handling.
Above explanation about data accumulation.

>
> > +
> > +/* Just copy perf_event->count to guest. Don't copy overflows to guest */
> > +static void
> > +kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
> > +{
> > + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> > +
> > + shadow->counter.count = local64_read(&host_event->count);
> > + kvm_write_guest(vcpu->kvm,
> > + shadow->guest_event_addr,
> > + &shadow->counter.count,
> > + sizeof(shadow->counter.count));
> >
>
> Error handling.
Same thing.

>
> > + return;
> > +}
> > +
> > +static int
> > +kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
> > +{
> > + int ret = 0;
> > + struct perf_event *host_event = NULL;
> > + struct host_perf_shadow *shadow = NULL;
> > + struct guest_perf_event_param param;
> > + struct guest_perf_attr *guest_attr = NULL;
> > + struct perf_event_attr *attr = NULL;
> > + int next_count;
> > +
> > + next_count = atomic_read(&vcpu->kvm->arch.kvm_pv_event_num);
> > + if (next_count >= KVM_MAX_PARAVIRT_PERF_EVENT) {
> > + WARN_ONCE(1, "guest os wants to open more than %d events\n",
> > + KVM_MAX_PARAVIRT_PERF_EVENT);
> > + return -ENOENT;
> >
>
> Need to distinguish between guest errors (bad parameters) or host errors
> (-ENOMEM) here. Guest errors need to be returned to the guest, while
> host errors need to be propagated to userspace (which called
> ioctl(KVM_VCPU_RUN) some time ago).
I will double-check it. Thanks. Now I can understand why kvm_pv_mmu_op has
a special *ret parameter.

>
> > + }
> > + atomic_inc(&vcpu->kvm->arch.kvm_pv_event_num);
> > +
> > + attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> > + if (!attr) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > + guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
> > + if (!guest_attr) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > +
> > + ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
> > + if (ret < 0)
> > + goto out;
> > +
> > + host_event = kvm_find_get_host_event(vcpu, param.id, 0);
> > + if (host_event) {
> > + kvm_put_host_event(host_event);
> > + return -EEXIST;
> > + }
> >
>
> > + ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
> > + guest_attr, sizeof(*guest_attr));
> > + if (ret < 0)
> > + goto out;
> > +
> > + attr->type = guest_attr->type;
> > + attr->config = guest_attr->config;
> > + attr->sample_period = guest_attr->sample_period;
> > + attr->read_format = guest_attr->read_format;
> > + attr->flags = guest_attr->flags;
> > + attr->bp_type = guest_attr->bp_type;
> > + attr->bp_addr = guest_attr->bp_addr;
> > + attr->bp_len = guest_attr->bp_len;
> >
>
> Needs tons of parameter validation.
The idea is that we let the generic perf code check the parameters.
We might add some simple checking here.
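
For example, something simple like the following before copying the fields
(KVM_PV_PERF_VALID_FLAGS would be a new mask; this is just a sketch):

        if (guest_attr->type >= PERF_TYPE_MAX) {
                ret = -EINVAL;
                goto out;
        }
        /* hypothetical mask of the attr flags the pv interface accepts */
        if (guest_attr->flags & ~KVM_PV_PERF_VALID_FLAGS) {
                ret = -EINVAL;
                goto out;
        }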

>
> > + /*
> > + * By default, we disable the host event. Later on, guets os
> > + * triggers a perf_event_attach to enable it
> > + */
> > + attr->disabled = 1;
> > + attr->inherit = 0;
> > + attr->enable_on_exec = 0;
> > + /*
> > + * We don't support exclude mode of user and kernel for guest os,
> > + * which mean we always collect both user and kernel for guest os
> > + */
> > + attr->exclude_user = 0;
> > + attr->exclude_kernel = 0;
> >
>
> First, if we don't support it, we should error out when the guest
> specifies it. Don't lie to the guest.
>
> Second, why can't we support it? should work for the guest just as it
> does for us.
exclude_user and exclude_kernel are just a hardware capability. Current PMU hardware
doesn't support virtualization. So when a counter is in exclude_user mode, we couldn't
collect any event that happens in guest os. That's my first thought, without architect
confirmation.

>
> > +
> > + shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
> > + if (!shadow) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > + shadow->id = param.id;
> > + shadow->guest_event_addr = param.guest_event_addr;
> > + shadow->vcpu = vcpu;
> > + INIT_LIST_HEAD(&shadow->shadow_entry);
> > +
> > + /* We always create a cpu context host perf event */
> > + host_event = perf_event_create_kernel_counter(attr, -1,
> > + current->pid, kvm_perf_event_overflow);
> >
>
> What does 'cpu context' mean in this context?
Sorry, the above comment is bad. The right one is:
/* We always create a process context host perf event */

The generic perf event code has 2 contexts, process context and per-cpu context. A
process context event is to collect statistics of a specific thread (process), while
a cpu context event is to collect statistics of this cpu.
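
With perf_event_create_kernel_counter, the difference is just which of cpu/pid
is wildcarded, for example:

        /* process (task) context: follow one thread, on whatever cpu it runs */
        event = perf_event_create_kernel_counter(attr, -1, current->pid,
                                                 kvm_perf_event_overflow);

        /* cpu context: count everything on one cpu, whatever task runs there */
        event = perf_event_create_kernel_counter(attr, cpu, -1,
                                                 kvm_perf_event_overflow);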

>
> > +
> > + if (IS_ERR(host_event)) {
> > + host_event = NULL;
> > + ret = -1;
> > + goto out;
> > + }
> > + host_event->host_perf_shadow = shadow;
> > + shadow->host_event = host_event;
> > + atomic_set(&shadow->ref_counter, 1);
> > + kvm_add_host_event(vcpu, shadow);
> > +
> > +out:
> > + if (!host_event)
> > + kfree(shadow);
> > +
> > + kfree(attr);
> > + kfree(guest_attr);
> > +
> > + if (ret)
> > + atomic_dec(&vcpu->kvm->arch.kvm_pv_event_num);
> > +
> > + return ret;
> > +}
> > +
> >
> >
>

2010-06-22 05:47:00

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Mon, 2010-06-21 at 16:56 +0300, Gleb Natapov wrote:
> On Mon, Jun 21, 2010 at 05:31:43PM +0800, Zhang, Yanmin wrote:
> > The 3rd patch is to implement para virt perf at host kernel.
> >
> > Signed-off-by: Zhang Yanmin <[email protected]>
> >
<snip>

> > +
> > +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> > + struct perf_event *host_event)
> > +{
> > + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> > + struct guest_perf_event counter;
> > + int ret;
> > + s32 overflows;
> > +
> > + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> > + &counter, sizeof(counter));
> > + if (ret < 0)
> > + return;
> > +
> > +again:
> > + overflows = atomic_read(&shadow->counter.overflows);
> > + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
> > + overflows)
> > + goto again;
> > +
> > + counter.count = shadow->counter.count;
> > + atomic_add(overflows, &counter.overflows);
> > +
> > + kvm_write_guest(vcpu->kvm,
> > + shadow->guest_event_addr,
> > + &counter,
> > + sizeof(counter));
> Those kind of interfaces worry me since the can cause bugs that are
> very hard to catch. What if guest enables some events and crashes into
> kdump kernel (or kexec new kernel) without reseting HW. Now host may
> write over guest memory without guest expecting it. Do you handle this
> scenario in a guest side? I think you need to register reboot notify
> and disable events from there.
Sorry for missing your comments.

My patch could take care of a dead guest os by cleaning up all events in function
kvm_arch_destroy_vm, so all events are closed if the host user kills the guest
qemu process.

As for your scenario, I will register a reboot notifier and add a new pv perf
hypercall interface to vmexit to the host kernel to do the cleanup.
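
Something like the guest-side sketch below (kvm_pv_perf_close_all_events is a
placeholder name for whatever helper ends up closing every outstanding event via
KVM_PERF_OP_CLOSE):

        /* guest side: stop the host from writing our memory across a reboot */
        static int kvm_pv_perf_reboot(struct notifier_block *nb,
                                      unsigned long action, void *unused)
        {
                kvm_pv_perf_close_all_events();         /* placeholder helper */
                return NOTIFY_OK;
        }

        static struct notifier_block kvm_pv_perf_reboot_nb = {
                .notifier_call = kvm_pv_perf_reboot,
        };

        static int __init kvm_pv_perf_guest_init(void)
        {
                register_reboot_notifier(&kvm_pv_perf_reboot_nb);
                return 0;
        }
        arch_initcall(kvm_pv_perf_guest_init);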

Thanks,
Yanmin

2010-06-22 08:05:12

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/22/2010 08:47 AM, Zhang, Yanmin wrote:
> On Mon, 2010-06-21 at 16:56 +0300, Gleb Natapov wrote:
>
>> On Mon, Jun 21, 2010 at 05:31:43PM +0800, Zhang, Yanmin wrote:
>>
>>> The 3rd patch is to implement para virt perf at host kernel.
>>>
>>> Signed-off-by: Zhang Yanmin <[email protected]>
>>>
>>>
> <snip>
>
>
>>> +
>>> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
>>> + struct perf_event *host_event)
>>> +{
>>> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
>>> + struct guest_perf_event counter;
>>> + int ret;
>>> + s32 overflows;
>>> +
>>> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
>>> + &counter, sizeof(counter));
>>> + if (ret < 0)
>>> + return;
>>> +
>>> +again:
>>> + overflows = atomic_read(&shadow->counter.overflows);
>>> + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
>>> + overflows)
>>> + goto again;
>>> +
>>> + counter.count = shadow->counter.count;
>>> + atomic_add(overflows, &counter.overflows);
>>> +
>>> + kvm_write_guest(vcpu->kvm,
>>> + shadow->guest_event_addr,
>>> + &counter,
>>> + sizeof(counter));
>>>
>> Those kind of interfaces worry me since the can cause bugs that are
>> very hard to catch. What if guest enables some events and crashes into
>> kdump kernel (or kexec new kernel) without reseting HW. Now host may
>> write over guest memory without guest expecting it. Do you handle this
>> scenario in a guest side? I think you need to register reboot notify
>> and disable events from there.
>>
> Sorry for missing your comments.
>
> My patch could take care of dead guest os by cleaning up all events in function
> kvm_arch_destroy_vm, so all events are closed if host user kills the guest
> qemu process.
>
>

A reset does not involve destroying a vm; you have to clean up as part
of the reset process.

Note MSRs are automatically cleared, so that's something in favour of an
MSR interface.

> As for your scenario, I will register reboot notify and add a new pv perf
> hypercall interface to vmexit to host kernel to do cleanup.
>

You aren't guaranteed a reboot notifier will be called. On the other
hand, we need a kexec handler.


--
error compiling committee.c: too many arguments to function

2010-06-22 08:30:10

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/22/2010 06:12 AM, Zhang, Yanmin wrote:
> On Mon, 2010-06-21 at 15:33 +0300, Avi Kivity wrote:
>
>> On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
>>
>>> The 3rd patch is to implement para virt perf at host kernel.
>>>
>>>
>>> @@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
>>> #ifdef __KERNEL__
>>> #include <asm/processor.h>
>>>
>>>
>>> +/*
>>> + * In host kernel, perf_event->host_perf_shadow points to
>>> + * host_perf_shadow which records some information
>>> + * about the guest.
>>> + */
>>> +struct host_perf_shadow {
>>> + /* guest perf_event id passed from guest os */
>>> + int id;
>>> + /*
>>> + * Host kernel saves data into data member counter firstly.
>>> + * kvm will get data from this counter and calls kvm functions
>>> + * to copy or add data back to guets os before entering guest os
>>> + * next time
>>> + */
>>> + struct guest_perf_event counter;
>>> + /* guest_event_addr is gpa_t pointing to guest os guest_perf_event*/
>>> + __u64 guest_event_addr;
>>>
>>>
>> So just use gpa_t as the type.
>>
> host_perf_shadow->guest_event_addr is a copy of guest_event_addr->guest_event_addr.
> As the latter's type is __u64 as the interface between guest os and host os, I use
> __u64 as the type of host_perf_shadow->guest_event_addr.
>

Right. But gpa_t is more descriptive. We have a lot of address spaces
in kvm.

>>> +
>>> + /*
>>> + * Link to of kvm.kvm_arch.shadow_hash_table
>>> + */
>>> + struct list_head shadow_entry;
>>> + struct kvm_vcpu *vcpu;
>>> +
>>> + struct perf_event *host_event;
>>> + /*
>>> + * Below counter is to prevent malicious guest os to try to
>>> + * close/enable event at the same time.
>>> + */
>>> + atomic_t ref_counter;
>>>
>>>
>> If events are made per-vcpu (like real hardware), races become impossible.
>>
> This design is to deal with a task context perf collection in guest os.
> Scenario 1:
> 1) guest os starts to collect statistics of process A on vcpu 0;
> 2) process A is scheduled to vcpu 1. Then, the perf_event at host side need
> to be moved to VCPU 1 's thread. With the per KVM instance design, we needn't
> move host_perf_shadow among vcpus.
>

First, the guest already knows how to deal with per-cpu performance
monitors, since that's how most (all) hardware works. So we aren't
making the guest more complex, and on the other hand we simplify the host.

Second, if process A is migrated, and the guest uses per-process
counters, the guest will need to stop/start the counter during the
migration. This will cause the host to migrate the counter, so while we
didn't move the counter to a different vcpu, we still have to move it to
a different cpu.

> Scenario 2:
> 1) guest os creates a perf_event at host side on vcpu 0;
> 2) malicious guest os calls close to delete the host perf_event on vcpu 1, but
> enables the perf_event on vcpu0 at the same time. When close thread runs to get the
> host_perf_shadow from the list, enable thread also gets it. Then, close thread
> deletes the perf_event, and enable thread will cause host kernel panic when using
> host_perf_shadow.
>

With per-vcpu events, this can't happen. Each vcpu has its own set of
perf events in their own ID namespace. vcpu 0 can't touch vcpu 1's
events even if it knows their IDs.

>> Please move this structure to include/linux/kvm_host.h. No need to spam
>> kvm_para.h. Note it's not x86 specific (though you can leave arch
>> enabling to arch maintainers).
>>
> Ok. Originally, I wanted to do so, but I'm afraid other arch might be not happy.
>

Copying kvm arch maintainers. Please keep an eye on this topic and
holler if something doesn't fit your arch.


>>
>>> @@ -24,6 +24,7 @@
>>> #include <asm/desc.h>
>>> #include <asm/mtrr.h>
>>> #include <asm/msr-index.h>
>>> +#include <asm/perf_event.h>
>>>
>>> #define KVM_MAX_VCPUS 64
>>> #define KVM_MEMORY_SLOTS 32
>>> @@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
>>>
>>> /* fields used by HYPER-V emulation */
>>> u64 hv_vapic;
>>> +
>>> + /*
>>> + * Fields used by PARAVIRT perf interface:
>>> + *
>>> + * kvm checks overflow_events before entering guest os,
>>> + * and copy data back to guest os.
>>> + * event_mutex is to avoid a race between NMI perf event overflow
>>> + * handler, event close, and enable/disable.
>>> + */
>>> + struct mutex event_mutex;
>>>
>>>
>> No race can exist. The host NMI handler cannot take any mutex
>>
> We use a mutex_trylock in NMI hanlder. If it can't get the lock, there is a NMI miss
> happening, but host kernel still updates perf_event->host_perf_shadow.counter, so the
> overflow data will be accumulated.
>

I see. I don't think this is needed. If we disable the counters during the
guest->host switch, we can just copy the data and set a bit in
vcpu->requests so that we can update the guest during the next entry.
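
I.e. something like the following, with a new request bit (KVM_REQ_PERF_SYNC is
just an illustrative name):

        /* overflow side: note that there is data to push to the guest */
        set_bit(KVM_REQ_PERF_SYNC, &vcpu->requests);

        /* in vcpu_enter_guest(), next to the other vcpu->requests checks */
        if (test_and_clear_bit(KVM_REQ_PERF_SYNC, &vcpu->requests))
                kvm_sync_events_to_guest(vcpu);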

>> so it
>> must be immune to races. The guest NMI handlers and callbacks are all
>> serialized by the guest itself.
>>
> This is to fight with malicious guest os kernel. Just like what I mention above,
> the race might happen when:
> 1) NMI handler accesses it;
> 2) vmx_handle_exit codes access overflow_events to sync data to guest os;
> 3) Another vcpu thread of the same guest os calls close to delete the perf_event;
>

This goes away with per-vcpu events.

>>> struct kvm_arch {
>>> struct kvm_mem_aliases *aliases;
>>>
>>> @@ -415,6 +431,15 @@ struct kvm_arch {
>>> /* fields used by HYPER-V emulation */
>>> u64 hv_guest_os_id;
>>> u64 hv_hypercall;
>>> +
>>> + /*
>>> + * fields used by PARAVIRT perf interface:
>>> + * Used to organize all host perf_events representing guest
>>> + * perf_event on a specific kvm instance
>>> + */
>>> + atomic_t kvm_pv_event_num;
>>> + spinlock_t shadow_lock;
>>> + struct list_head *shadow_hash_table;
>>>
>>>
>> Need to be per-vcpu. Also wrap in a kvm_vcpu_perf structure, the names
>> are very generic.
>>
> Originally, I did so, but changed it to per kvm instance wide when considering
> perf_event moving around vcpu threads.
>

I think this makes the code more complicated, needlessly.

>>
>>> /*
>>> * hypercalls use architecture specific
>>> --- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
>>> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
>>> @@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
>>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> u32 exit_reason = vmx->exit_reason;
>>> u32 vectoring_info = vmx->idt_vectoring_info;
>>> + int ret;
>>>
>>> trace_kvm_exit(exit_reason, vcpu);
>>>
>>> @@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
>>>
>>> if (exit_reason < kvm_vmx_max_exit_handlers
>>> && kvm_vmx_exit_handlers[exit_reason])
>>> - return kvm_vmx_exit_handlers[exit_reason](vcpu);
>>> + ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
>>> else {
>>> vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
>>> vcpu->run->hw.hardware_exit_reason = exit_reason;
>>> + ret = 0;
>>> }
>>> - return 0;
>>> +
>>> + /* sync paravirt perf event to guest */
>>> + kvm_sync_events_to_guest(vcpu);
>>>
>>>
>> Why do that every exit?
>> Why in vmx specific code?
>>
> I could move it to the tail of vcpu_enter_guest. kvm_sync_events_to_guest
> might go to sleep when going through guest os page tables, so we couldn't call it
> by NMI handler.
>

But syncing every exit is expensive. Use vcpu->requests to sync only
when the data has changed.

>>> +
>>> +#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
>>>
>>>
>> This is really high. I don't think it's necessary, or useful since the
>> underlying hardware has much fewer events, and since the guest can
>> multiplex events itself.
>>
> This limitation is different from hardware PMU counter imitation. When any application or
> guest os vcpu thread creates perf_event, host kernel has no limitation. Kernel just arranges
> all perf_event in a list (not considering group case) and schedules them to PMU hardware
> by a round-robin method.
>

In practice, it will take such a long time to cycle through all events
that measurement quality will deteriorate. I prefer exposing a much
smaller number of events so that multiplexing on the host side will be
rare (for example, if both guest and host are monitoring at the same
time, or to accommodate hardware constraints). If the guest needs 1024
events, it can schedule them itself (and then it knows the measurement
is very inaccurate due to sampling).

> KVM_MAX_PARAVIRT_PERF_EVENT is to restrict guest os instance not to create too many
> perf_event at host side which consumes too much memory of host kernel and slow the perf_event
> schedule.
>
>
>>
>>> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
>>> + struct perf_event *host_event)
>>> +{
>>> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
>>> + struct guest_perf_event counter;
>>> + int ret;
>>> + s32 overflows;
>>> +
>>> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
>>> + &counter, sizeof(counter));
>>> + if (ret < 0)
>>> + return;
>>>
>>>
>> Need better error handling.
>>
> As host kernel saves/accumulate data in perf_event->host_perf_shadow.counter,
> it doesn't matter to have one failure. next time when overflowing again, it will
> copy all data back to guest os.
>

Next time we may fail too. And next time as well.


>>> +
>>> + counter.count = shadow->counter.count;
>>> + atomic_add(overflows, &counter.overflows);
>>> +
>>> + kvm_write_guest(vcpu->kvm,
>>> + shadow->guest_event_addr,
>>> + &counter,
>>> + sizeof(counter));
>>>
>>>
>> kvm_write_guest() is _very_ nonatomic...
>>
> It doesn't matter. There is only one potential race between host kernel and
> guest kernel. When guest vmexits to host, it wouldn't access data pointed by
> shadow->guest_event_addr. Above kvm_write_guest happens with the same vpcu.
> So we just need make sure guest os vcpu accesses guest_perf_shadow->counter.overflows
> atomically.
>

Well, without per-vcpu events, you can't guarantee this.

With per-vcpu events, I agree.


>>> + /*
>>> + * By default, we disable the host event. Later on, guets os
>>> + * triggers a perf_event_attach to enable it
>>> + */
>>> + attr->disabled = 1;
>>> + attr->inherit = 0;
>>> + attr->enable_on_exec = 0;
>>> + /*
>>> + * We don't support exclude mode of user and kernel for guest os,
>>> + * which mean we always collect both user and kernel for guest os
>>> + */
>>> + attr->exclude_user = 0;
>>> + attr->exclude_kernel = 0;
>>>
>>>
>> First, if we don't support it, we should error out when the guest
>> specifies it. Don't lie to the guest.
>>
>> Second, why can't we support it? should work for the guest just as it
>> does for us.
>>
> exclude_user and exclude_kernel are just hardware capability. Current PMU hardware
> doesn't support virtualization. So when a counter is at exclude_user mode, we couldn't
> collect any event happens in guest os. That's my direct thinking without architect
> confirmation.
>

1) IIUC exclude_user and exclude_kernel should just work. They work by
counting only when the cpl matches, yes? The hardware cpl is available
and valid in the guest.

2) We should atomically enable/disable the hardware performance counter
during guest entry/exit, like during ordinary context switch, so that
the guest doesn't measure host code (for example, ip would be
meaningless). Needs integration between perf core and kvm here.

>>> +
>>> + shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
>>> + if (!shadow) {
>>> + ret = -ENOMEM;
>>> + goto out;
>>> + }
>>> + shadow->id = param.id;
>>> + shadow->guest_event_addr = param.guest_event_addr;
>>> + shadow->vcpu = vcpu;
>>> + INIT_LIST_HEAD(&shadow->shadow_entry);
>>> +
>>> + /* We always create a cpu context host perf event */
>>> + host_event = perf_event_create_kernel_counter(attr, -1,
>>> + current->pid, kvm_perf_event_overflow);
>>>
>>>
>> What does 'cpu context' mean in this context?
>>
> Sorry, above comments are bad. Right one is:
> /* We always create a process context host perf event */
>
> perf event generic has 2 context, process context and per cpu context. process
> context event is to collect statistics of a specific thread (process), while
> cpu context event is to collect statistics of this cpu.
>

Thanks, that makes more sense.


--
error compiling committee.c: too many arguments to function

2010-06-23 01:43:23

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Tue, 2010-06-22 at 11:04 +0300, Avi Kivity wrote:
> On 06/22/2010 08:47 AM, Zhang, Yanmin wrote:
> > On Mon, 2010-06-21 at 16:56 +0300, Gleb Natapov wrote:
> >
> >> On Mon, Jun 21, 2010 at 05:31:43PM +0800, Zhang, Yanmin wrote:
> >>
> >>> The 3rd patch is to implement para virt perf at host kernel.
> >>>
> >>> Signed-off-by: Zhang Yanmin <[email protected]>
> >>>
> >>>
> > <snip>
> >
> >
> >>> +
> >>> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> >>> + struct perf_event *host_event)
> >>> +{
> >>> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> >>> + struct guest_perf_event counter;
> >>> + int ret;
> >>> + s32 overflows;
> >>> +
> >>> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> >>> + &counter, sizeof(counter));
> >>> + if (ret < 0)
> >>> + return;
> >>> +
> >>> +again:
> >>> + overflows = atomic_read(&shadow->counter.overflows);
> >>> + if (atomic_cmpxchg(&shadow->counter.overflows, overflows, 0) !=
> >>> + overflows)
> >>> + goto again;
> >>> +
> >>> + counter.count = shadow->counter.count;
> >>> + atomic_add(overflows, &counter.overflows);
> >>> +
> >>> + kvm_write_guest(vcpu->kvm,
> >>> + shadow->guest_event_addr,
> >>> + &counter,
> >>> + sizeof(counter));
> >>>
> >> Those kind of interfaces worry me since the can cause bugs that are
> >> very hard to catch. What if guest enables some events and crashes into
> >> kdump kernel (or kexec new kernel) without reseting HW. Now host may
> >> write over guest memory without guest expecting it. Do you handle this
> >> scenario in a guest side? I think you need to register reboot notify
> >> and disable events from there.
> >>
> > Sorry for missing your comments.
> >
> > My patch could take care of dead guest os by cleaning up all events in function
> > kvm_arch_destroy_vm, so all events are closed if host user kills the guest
> > qemu process.
> >
> >
>
> A reset does not involve destroying a vm; you have to clean up as part
> of the rest process.
What does 'reset' here mean? Is it a reboot or halt? If it's a halt, it involves
destroying a vm. If a host user just kills the qemu process, is it a reset involving
destroying a vm?

>
> Note MSRs are automatically cleared, so that's something in favour of an
> MSR interface.
>
> > As for your scenario, I will register reboot notify and add a new pv perf
> > hypercall interface to vmexit to host kernel to do cleanup.
> >
>
> You aren't guaranteed a reboot notifier will be called. On the other
> hand, we need a kexec handler.
Ordinary kexec calls all reboot notifiers; only crash kexec doesn't call them.
I will implement a machine_ops.crash_shutdown callback.
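
That is, mirror what kvmclock already does for its MSRs, roughly (a sketch; the
close-all helper is a placeholder name):

        #ifdef CONFIG_KEXEC
        static void kvm_pv_perf_crash_shutdown(struct pt_regs *regs)
        {
                /* tell the host to drop all events before the crash kernel runs */
                kvm_pv_perf_close_all_events();         /* placeholder helper */
                native_machine_crash_shutdown(regs);
        }
        #endif

        /* from the same pv perf guest init path */
        #ifdef CONFIG_KEXEC
        machine_ops.crash_shutdown = kvm_pv_perf_crash_shutdown;
        #endif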

2010-06-23 03:12:36

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Tue, 2010-06-22 at 11:29 +0300, Avi Kivity wrote:
> On 06/22/2010 06:12 AM, Zhang, Yanmin wrote:
> > On Mon, 2010-06-21 at 15:33 +0300, Avi Kivity wrote:
> >
> >> On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
> >>
> >>> The 3rd patch is to implement para virt perf at host kernel.
> >>>
> >>>
> >>> @@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
> >>> #ifdef __KERNEL__
> >>> #include <asm/processor.h>
> >>>
> >>>
> >>> +/*
> >>> + * In host kernel, perf_event->host_perf_shadow points to
> >>> + * host_perf_shadow which records some information
> >>> + * about the guest.
> >>> + */
> >>> +struct host_perf_shadow {
> >>> + /* guest perf_event id passed from guest os */
> >>> + int id;
> >>> + /*
> >>> + * Host kernel saves data into data member counter firstly.
> >>> + * kvm will get data from this counter and calls kvm functions
> >>> + * to copy or add data back to guets os before entering guest os
> >>> + * next time
> >>> + */
> >>> + struct guest_perf_event counter;
> >>> + /* guest_event_addr is gpa_t pointing to guest os guest_perf_event*/
> >>> + __u64 guest_event_addr;
> >>>
> >>>
> >> So just use gpa_t as the type.
> >>
> > host_perf_shadow->guest_event_addr is a copy of guest_event_addr->guest_event_addr.
> > As the latter's type is __u64 as the interface between guest os and host os, I use
> > __u64 as the type of host_perf_shadow->guest_event_addr.
> >
>
> Right. Bug gpa_t is more descriptive. We have a lot of address spaces
> in kvm.
I change it to gpa_t.

>
> >>> +
> >>> + /*
> >>> + * Link to of kvm.kvm_arch.shadow_hash_table
> >>> + */
> >>> + struct list_head shadow_entry;
> >>> + struct kvm_vcpu *vcpu;
> >>> +
> >>> + struct perf_event *host_event;
> >>> + /*
> >>> + * Below counter is to prevent malicious guest os to try to
> >>> + * close/enable event at the same time.
> >>> + */
> >>> + atomic_t ref_counter;
> >>>
> >>>
> >> If events are made per-vcpu (like real hardware), races become impossible.
> >>
> > This design is to deal with a task context perf collection in guest os.
> > Scenario 1:
> > 1) guest os starts to collect statistics of process A on vcpu 0;
> > 2) process A is scheduled to vcpu 1. Then, the perf_event at host side need
> > to be moved to VCPU 1 's thread. With the per KVM instance design, we needn't
> > move host_perf_shadow among vcpus.
> >
>
> First, the guest already knows how to deal with per-cpu performance
> monitors, since that's how most (all) hardware works. So we aren't
> making the guest more complex, and on the other hand we simplify the host.
I agree that we need to keep things simple.

>
> Second, if process A is migrated, and the guest uses per-process
> counters, the guest will need to stop/start the counter during the
> migration. This will cause the host to migrate the counter,
Agree. My patches do so.

Question: Where does host migrate the counter?
The perf event at host side is bound to specific vcpu thread.

> so while we
> didn't move the counter to a different vcpu,
Disagree here. If process A on vcpu 0 in guest os is migrated to vcpu 1,
host has to move process A's perf_event to vcpu 1 thread.


> we still have to move it to
> a different cpu.
>
> > Scenario 2:
> > 1) guest os creates a perf_event at host side on vcpu 0;
> > 2) malicious guest os calls close to delete the host perf_event on vcpu 1, but
> > enables the perf_event on vcpu0 at the same time. When close thread runs to get the
> > host_perf_shadow from the list, enable thread also gets it. Then, close thread
> > deletes the perf_event, and enable thread will cause host kernel panic when using
> > host_perf_shadow.
> >
>
> With per-vcpu events, this can't happen. Each vcpu has its own set of
> perf events in their own ID namespace. vcpu 0 can't touch vcpu 1's
> events even if it knows their IDs.
What does 'touch' mean here?

With task (process) context events in the guest os, we have to migrate the event
among vcpus when the guest process scheduler balances processes among vcpus. So all
events in a guest os instance use the same ID name space.

What you mentioned is really right when the event is a cpu context event, but
not with a task context event.

>
> >> Please move this structure to include/linux/kvm_host.h. No need to spam
> >> kvm_para.h. Note it's not x86 specific (though you can leave arch
> >> enabling to arch maintainers).
> >>
> > Ok. Originally, I wanted to do so, but I'm afraid other arch might be not happy.
> >
>
> Copying kvm arch maintainers. Please keep an eye on this topic and
> holler if something doesn't fit your arch.
>
>
> >>
> >>> @@ -24,6 +24,7 @@
> >>> #include <asm/desc.h>
> >>> #include <asm/mtrr.h>
> >>> #include <asm/msr-index.h>
> >>> +#include <asm/perf_event.h>
> >>>
> >>> #define KVM_MAX_VCPUS 64
> >>> #define KVM_MEMORY_SLOTS 32
> >>> @@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
> >>>
> >>> /* fields used by HYPER-V emulation */
> >>> u64 hv_vapic;
> >>> +
> >>> + /*
> >>> + * Fields used by PARAVIRT perf interface:
> >>> + *
> >>> + * kvm checks overflow_events before entering guest os,
> >>> + * and copy data back to guest os.
> >>> + * event_mutex is to avoid a race between NMI perf event overflow
> >>> + * handler, event close, and enable/disable.
> >>> + */
> >>> + struct mutex event_mutex;
> >>>
> >>>
> >> No race can exist. The host NMI handler cannot take any mutex
> >>
> > We use a mutex_trylock in NMI hanlder. If it can't get the lock, there is a NMI miss
> > happening, but host kernel still updates perf_event->host_perf_shadow.counter, so the
> > overflow data will be accumulated.
> >
>
> I see. I don't think this is needed if we disable the counters during
> guest->host switch,
We don't disable counters during the guest->host switch. The generic perf code will
disable them when:
1) the host kernel process scheduler schedules the task out, if the event is a task
context event;
2) guest os calls the DISABLE hypercall.


> we can just copy the data and set a bit in
> vcpu->requests so that we can update the guest during next entry.
We can use a bit of vcpu->requests, but it isn't much different from the simple
check (vcpu->arch.overflows == 0) in function kvm_sync_events_to_guest.

The key is how to save the pending perf_event pointers, so kvm can update
their data to the guest. vcpu->arch.overflow_events does that.

>
> >> so it
> >> must be immune to races.
Considering vcpu->arch.overflow_events, there would be a race without locking.

> The guest NMI handlers and callbacks are all
> >> serialized by the guest itself.
In the guest, NMI and callbacks are serialized on a specific perf event, but perf_close
isn't. The generic perf code takes good care of that. Under the guest/host environment,
we need a new lock to coordinate it.

In addition, please consider a malicious guest os which might not serialize
the operations, trying to cause a host kernel panic.

> >>
> > This is to fight with malicious guest os kernel. Just like what I mention above,
> > the race might happen when:
> > 1) NMI handler accesses it;
> > 2) vmx_handle_exit codes access overflow_events to sync data to guest os;
> > 3) Another vcpu thread of the same guest os calls close to delete the perf_event;
> >
>
> This goes away with per-vcpu events.
Again, per-vcpu events couldn't work well with task context events in the guest.

>
> >>> struct kvm_arch {
> >>> struct kvm_mem_aliases *aliases;
> >>>
> >>> @@ -415,6 +431,15 @@ struct kvm_arch {
> >>> /* fields used by HYPER-V emulation */
> >>> u64 hv_guest_os_id;
> >>> u64 hv_hypercall;
> >>> +
> >>> + /*
> >>> + * fields used by PARAVIRT perf interface:
> >>> + * Used to organize all host perf_events representing guest
> >>> + * perf_event on a specific kvm instance
> >>> + */
> >>> + atomic_t kvm_pv_event_num;
> >>> + spinlock_t shadow_lock;
> >>> + struct list_head *shadow_hash_table;
> >>>
> >>>
> >> Need to be per-vcpu. Also wrap in a kvm_vcpu_perf structure, the names
> >> are very generic.
If we move it to per-vcpu, the host kernel needs to move the entry (struct host_perf_shadow)
among vcpus when an event is migrated to another vcpu. That makes the code a little more
complicated and might introduce something like a deadlock or a new race. With my
implementation, there is only one potential performance issue, as there is some lock
contention on shadow_lock. But the performance issue is not severe, because:
1) a guest os doesn't support too many vcpus (usually no more than 8);
2) the host kernel doesn't take shadow_lock when processing NMIs and syncing data to the guest;
3) if the event is a per-cpu context event, usually there is not much event disable/enable happening.

> >>
> > Originally, I did so, but changed it to per kvm instance wide when considering
> > perf_event moving around vcpu threads.
> >
>
> I think this makes the code more complicated, needlessly.
Complicated mostly because the host needs to deal with a malicious guest kernel.

It's often easy to implement the ordinary logic with little code, but far more code is
needed to deal with error scenarios. For example, originally I synced data to guest os in
the host NMI handler. With your suggestion to use kvm_read_guest/kvm_write_guest, I have
to move that to handle_exit or vcpu_enter_guest. To do so, I need to save all pending
events' pointers in an array in the vcpu struct. Then there would be new races, so I need
to add a new lock to protect or serialize some operations.

>
> >>
> >>> /*
> >>> * hypercalls use architecture specific
> >>> --- linux-2.6_tip0620/arch/x86/kvm/vmx.c 2010-06-21 15:19:39.322999849 +0800
> >>> +++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 +0800
> >>> @@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
> >>> struct vcpu_vmx *vmx = to_vmx(vcpu);
> >>> u32 exit_reason = vmx->exit_reason;
> >>> u32 vectoring_info = vmx->idt_vectoring_info;
> >>> + int ret;
> >>>
> >>> trace_kvm_exit(exit_reason, vcpu);
> >>>
> >>> @@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
> >>>
> >>> if (exit_reason < kvm_vmx_max_exit_handlers
> >>> && kvm_vmx_exit_handlers[exit_reason])
> >>> - return kvm_vmx_exit_handlers[exit_reason](vcpu);
> >>> + ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> >>> else {
> >>> vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> >>> vcpu->run->hw.hardware_exit_reason = exit_reason;
> >>> + ret = 0;
> >>> }
> >>> - return 0;
> >>> +
> >>> + /* sync paravirt perf event to guest */
> >>> + kvm_sync_events_to_guest(vcpu);
> >>>
> >>>
> >> Why do that every exit?
> >> Why in vmx specific code?
> >>
> > I could move it to the tail of vcpu_enter_guest. kvm_sync_events_to_guest
> > might go to sleep when going through guest os page tables, so we couldn't call it
> > by NMI handler.
> >
>
> But syncing every exit is expensive. Use vcpu->requests to sync only
> when the data has changed.
kvm_sync_events_to_guest has a check. We sync data to the guest only when:
1) an event overflows, and the host will sync _this_ event's data to the guest;
2) the guest calls the DISABLE or READ hypercalls.

>
> >>> +
> >>> +#define KVM_MAX_PARAVIRT_PERF_EVENT (1024)
> >>>
> >>>
> >> This is really high. I don't think it's necessary, or useful since the
> >> underlying hardware has much fewer events, and since the guest can
> >> multiplex events itself.
> >>
> > This limitation is different from hardware PMU counter imitation. When any application or
> > guest os vcpu thread creates perf_event, host kernel has no limitation. Kernel just arranges
> > all perf_event in a list (not considering group case) and schedules them to PMU hardware
> > by a round-robin method.
> >
>
> In practice, it will take such a long time to cycle through all events
> that measurement quality will deteriorate.
There are 2 things.
1) We provide a good capability to allow applications to submit more events;
2) Applications just use a small group of events, typically one cpu context event per vcpu.

They are very different. In the usual case, 2), the measurement quality wouldn't
deteriorate.


> I prefer exposing a much
> smaller number of events so that multiplexing on the host side will be
> rare (for example, if both guest and host are monitoring at the same
> time, or to accomodate hardware constraints). If the guest needs 1024
> events, it can schedule them itself (and then it knows the measurement
> is very inaccurate due to sampling)
A linux guest os does schedule them. By default, the guest kernel enables
X86_PMC_IDX_MAX (64) events on a specific vcpu at the same time. See
functions kvm_pmu_enable and kvm_add_event.

Exposing is different from disable/enable. If we exposed/hid events when
enabling/disabling events, it would consume too many cpu resources, as the host
kernel would need to create/delete the events frequently.

1024 is just the upper limit on how many perf_events the host could _create_ for
the guest os instance. If the guest os is linux, the most active events in the host
for this guest os is VCPU_NUM*64.

>
> > KVM_MAX_PARAVIRT_PERF_EVENT is to restrict guest os instance not to create too many
> > perf_event at host side which consumes too much memory of host kernel and slow the perf_event
> > schedule.
> >
> >
> >>
> >>> +static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
> >>> + struct perf_event *host_event)
> >>> +{
> >>> + struct host_perf_shadow *shadow = host_event->host_perf_shadow;
> >>> + struct guest_perf_event counter;
> >>> + int ret;
> >>> + s32 overflows;
> >>> +
> >>> + ret = kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
> >>> + &counter, sizeof(counter));
> >>> + if (ret < 0)
> >>> + return;
> >>>
> >>>
> >> Need better error handling.
> >>
> > As host kernel saves/accumulate data in perf_event->host_perf_shadow.counter,
> > it doesn't matter to have one failure. next time when overflowing again, it will
> > copy all data back to guest os.
> >
>
> Next time we may fail too. And next time as well.
How to process the failure? Kill the guest os? :)

>
>
> >>> +
> >>> + counter.count = shadow->counter.count;
> >>> + atomic_add(overflows, &counter.overflows);
> >>> +
> >>> + kvm_write_guest(vcpu->kvm,
> >>> + shadow->guest_event_addr,
> >>> + &counter,
> >>> + sizeof(counter));
> >>>
> >>>
> >> kvm_write_guest() is _very_ nonatomic...
> >>
> > It doesn't matter. There is only one potential race between host kernel and
> > guest kernel. When guest vmexits to host, it wouldn't access data pointed by
> > shadow->guest_event_addr. Above kvm_write_guest happens with the same vpcu.
> > So we just need make sure guest os vcpu accesses guest_perf_shadow->counter.overflows
> > atomically.
> >
>
> Well, without per-vcpu events, you can't guarantee this.
>
> With per-vcpu events, I agree.
Frankly, I used per-vcpu in the beginning, but moved to per-kvm after checking
every possible race issue.

>
>
> >>> + /*
> >>> + * By default, we disable the host event. Later on, guets os
> >>> + * triggers a perf_event_attach to enable it
> >>> + */
> >>> + attr->disabled = 1;
> >>> + attr->inherit = 0;
> >>> + attr->enable_on_exec = 0;
> >>> + /*
> >>> + * We don't support exclude mode of user and kernel for guest os,
> >>> + * which mean we always collect both user and kernel for guest os
> >>> + */
> >>> + attr->exclude_user = 0;
> >>> + attr->exclude_kernel = 0;
> >>>
> >>>
> >> First, if we don't support it, we should error out when the guest
> >> specifies it. Don't lie to the guest.
> >>
> >> Second, why can't we support it? should work for the guest just as it
> >> does for us.
> >>
> > exclude_user and exclude_kernel are just hardware capability. Current PMU hardware
> > doesn't support virtualization. So when a counter is at exclude_user mode, we couldn't
> > collect any event happens in guest os. That's my direct thinking without architect
> > confirmation.
> >
>
> 1) IIUC exclude_user and exclude_kernel should just work. They work by
> counting only when the cpl matches, yes? The hardware cpl is available
> and valid in the guest.
Good pointer! Let me do some experiments to make sure it does work.

>
> 2) We should atomically enable/disable the hardware performance counter
> during guest entry/exit, like during ordinary context switch, so that
> the guest doesn't measure host code (for example, ip would be
> meaningless).
I once checked it. At least under the vcpu thread context after vmexit, host
code execution is on behalf of the guest. It's reasonable to count this part. If the
vcpu thread is scheduled out, the host kernel disables all events bound to this vcpu
thread. ip is meaningless if the NMI happens in the host code path, but the host kernel
would accumulate the overflow count into host_perf_shadow->counter.overflows. Next time,
when an NMI happens in the guest os, the host kernel injects an NMI into the guest
kernel, so the guest uses that pt_regs->ip.

> Needs integration between perf core and kvm here.
>
> >>> +
> >>> + shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
> >>> + if (!shadow) {
> >>> + ret = -ENOMEM;
> >>> + goto out;
> >>> + }
> >>> + shadow->id = param.id;
> >>> + shadow->guest_event_addr = param.guest_event_addr;
> >>> + shadow->vcpu = vcpu;
> >>> + INIT_LIST_HEAD(&shadow->shadow_entry);
> >>> +
> >>> + /* We always create a cpu context host perf event */
> >>> + host_event = perf_event_create_kernel_counter(attr, -1,
> >>> + current->pid, kvm_perf_event_overflow);
> >>>
> >>>
> >> What does 'cpu context' mean in this context?
> >>
> > Sorry, above comments are bad. Right one is:
> > /* We always create a process context host perf event */
> >
> > perf event generic has 2 context, process context and per cpu context. process
> > context event is to collect statistics of a specific thread (process), while
> > cpu context event is to collect statistics of this cpu.
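
For the record, the distinction maps directly onto the arguments of
perf_event_create_kernel_counter(); a rough sketch (task_pid and overflow_handler
are just placeholders):

    /* task (process) context: follow one thread, on whichever cpu it runs */
    event = perf_event_create_kernel_counter(attr, -1, task_pid, overflow_handler);

    /* cpu context: count everything on one cpu, no matter which thread runs */
    event = perf_event_create_kernel_counter(attr, cpu, -1, overflow_handler);

The code quoted above passes cpu = -1 and current->pid, so it creates a process
context event bound to the vcpu thread.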

Thanks for your detailed comments.
Yanmin

2010-06-23 05:52:39

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/23/2010 06:12 AM, Zhang, Yanmin wrote:
>>>
>>> This design is to deal with a task context perf collection in guest os.
>>> Scenario 1:
>>> 1) guest os starts to collect statistics of process A on vcpu 0;
>>> 2) process A is scheduled to vcpu 1. Then, the perf_event at host side need
>>> to be moved to VCPU 1 's thread. With the per KVM instance design, we needn't
>>> move host_perf_shadow among vcpus.
>>>
>>>
>> First, the guest already knows how to deal with per-cpu performance
>> monitors, since that's how most (all) hardware works. So we aren't
>> making the guest more complex, and on the other hand we simplify the host.
>>
> I agree that we need keep things simple.
>
>
>> Second, if process A is migrated, and the guest uses per-process
>> counters, the guest will need to stop/start the counter during the
>> migration. This will cause the host to migrate the counter,
>>
> Agree. My patches do so.
>
> Question: Where does host migrate the counter?
> The perf event at host side is bound to specific vcpu thread.
>

If the perf event is bound to the vm, not a vcpu, then on guest process
migration you will have to disable it on one vcpu and enable it on the
other, no?


>> so while we
>> didn't move the counter to a different vcpu,
>>
> Disagree here. If process A on vcpu 0 in guest os is migrated to vcpu 1,
> host has to move process A's perf_event to vcpu 1 thread.
>

Sorry, I'm confused now (lost track of our example). But whatever we
do, if a guest process is migrated, the host will have to migrate the
perf event, yes?

>
>
>> we still have to move it to
>> a different cpu.
>>
>>
>>> Scenario 2:
>>> 1) guest os creates a perf_event at host side on vcpu 0;
>>> 2) malicious guest os calls close to delete the host perf_event on vcpu 1, but
>>> enables the perf_event on vcpu0 at the same time. When close thread runs to get the
>>> host_perf_shadow from the list, enable thread also gets it. Then, close thread
>>> deletes the perf_event, and enable thread will cause host kernel panic when using
>>> host_perf_shadow.
>>>
>>>
>> With per-vcpu events, this can't happen. Each vcpu has its own set of
>> perf events in their own ID namespace. vcpu 0 can't touch vcpu 1's
>> events even if it knows their IDs.
>>
> What does 'touch' mean here?
>

Read or write using hypercalls.

> With the task (process) context event in guest os, we have to migrate the event
> among vcpus when guest process scheduler balances processes among vcpus. So all
> events in a guest os instance uses the same ID name space.
>

On real hardware a process running on cpu 0 might use counter X but
after migrating to cpu 1 it might use counter Y.

So the same perf_event object can be mapped to different hardware
resources depending on which cpu it is running on and on time (due to
changing resource requirements).

Same thing with guest processes. A process starts at vcpu 0 using
counter X, migrates to vcpu 1 using counter Y.

> What you mentioned is really right when the event is cpu context event, but
> not with task context event.
>

Real hardware supports only per-cpu events, and perf implements
per-process events on top of that. We could do the same with virtual
hardware.




>>> We use a mutex_trylock in NMI hanlder. If it can't get the lock, there is a NMI miss
>>> happening, but host kernel still updates perf_event->host_perf_shadow.counter, so the
>>> overflow data will be accumulated.
>>>
>>>
>> I see. I don't think this is needed if we disable the counters during
>> guest->host switch,
>>
> We don't disable counters during guest->host switch. Generic perf codes will
> disable it when:
> 1) host kernel process scheduler schedules the task out if the event is a task
> context event.
> 2) guest os calls DISABLE hypercall.
>

I think we should. Otherwise we will measure kvm host code and
unrelated host interrupt handlers.

>> we can just copy the data and set a bit in
>> vcpu->requests so that we can update the guest during next entry.
>>
> We can use a bit of vcpu->requests, but it has no far difference from a simple
> checking (vcpu->arch.overflows == 0) in function kvm_sync_events_to_guest.
>
> The key is how to save pending perf_event pointers, so kvm could update
> their data to guest. vcpu->arch.overflow_events does so.
>

Ok.
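
Just so we mean the same thing, roughly like this (a sketch only; KVM_REQ_PERF_SYNC
is a made-up request bit, and I'm assuming kvm_sync_events_to_guest() takes the vcpu):

    /* overflow callback, NMI context: touch host memory only */
    static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
                                        struct perf_sample_data *data,
                                        struct pt_regs *regs)
    {
            struct host_perf_shadow *shadow = event->host_perf_shadow;

            atomic_inc(&shadow->counter.overflows);
            set_bit(KVM_REQ_PERF_SYNC, &shadow->vcpu->requests);
    }

    /* in vcpu_enter_guest(), before the next guest entry */
    if (test_and_clear_bit(KVM_REQ_PERF_SYNC, &vcpu->requests))
            kvm_sync_events_to_guest(vcpu);

That keeps all guest memory accesses out of NMI context and out of the overflow
path entirely.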

>> The guest NMI handlers and callbacks are all
>> serialized by the guest itself.
>>
> In guest NMI and callbacks are serialized on a specific perf event, but perf_close
> isn't. perf generic codes have good consideration on it. Under guest/host environment,
> we need new lock to coordinate it.
>
> In addition, pls. consider a malicious guest os who might doesn't serialize
> the operations to try to cause host kernel panic.
>

The guest NMI handler is the guest's problem. But yes, the host NMI
handler can race with close.


>>>>
>>>>
>>> This is to fight with malicious guest os kernel. Just like what I mention above,
>>> the race might happen when:
>>> 1) NMI handler accesses it;
>>> 2) vmx_handle_exit codes access overflow_events to sync data to guest os;
>>> 3) Another vcpu thread of the same guest os calls close to delete the perf_event;
>>>
>>>
>> This goes away with per-vcpu events.
>>
> Again, per-vcpu event couldn't work well with task context event in guest.
>
>

I don't understand why. Real hardware has per-cpu events.

>>>>> struct kvm_arch {
>>>>> struct kvm_mem_aliases *aliases;
>>>>>
>>>>> @@ -415,6 +431,15 @@ struct kvm_arch {
>>>>> /* fields used by HYPER-V emulation */
>>>>> u64 hv_guest_os_id;
>>>>> u64 hv_hypercall;
>>>>> +
>>>>> + /*
>>>>> + * fields used by PARAVIRT perf interface:
>>>>> + * Used to organize all host perf_events representing guest
>>>>> + * perf_event on a specific kvm instance
>>>>> + */
>>>>> + atomic_t kvm_pv_event_num;
>>>>> + spinlock_t shadow_lock;
>>>>> + struct list_head *shadow_hash_table;
>>>>>
>>>>>
>>>>>
>>>> Need to be per-vcpu. Also wrap in a kvm_vcpu_perf structure, the names
>>>> are very generic.
>>>>
> If we move it to per-vcpu, host kernel need move the entry (struct host_perf_shadow)
> among vcpu when an event be migrated to another vcpu. That causes codes a little complicated
> and might introduce something like deadlock or new race.

Any complication will be in guest code, not host code, and the guest is
already prepared for it.

> With my implementation, there is
> only one potential performance issue as there are some lock contention on shadow_lock.
> But the performance issue is not severe, because:
> 1) guest os doesn't support too many vcpu (usually no larger than 8);
>

We support 64 now, and might increase it. Please don't introduce new
barriers to scalability.

>>> This limitation is different from hardware PMU counter imitation. When any application or
>>> guest os vcpu thread creates perf_event, host kernel has no limitation. Kernel just arranges
>>> all perf_event in a list (not considering group case) and schedules them to PMU hardware
>>> by a round-robin method.
>>>
>>>
>> In practice, it will take such a long time to cycle through all events
>> that measurement quality will deteriorate.
>>
> There are 2 things.
> 1) We provide a good capability to support applications to submit more events;
> 2) Application just use a small group of event, typically one cpu context event per vcpu.
>
> They are very different. As usual case about 2), the measurement quality wouldn't
> deteriorate.
>

In the untypical case, quality will deteriorate and the guest will have
no way of knowing it.

If the number of events is restricted, the guest has to multiplex itself
so it knows quality deteriorates and can do something about it.

>> I prefer exposing a much
>> smaller number of events so that multiplexing on the host side will be
>> rare (for example, if both guest and host are monitoring at the same
>> time, or to accomodate hardware constraints). If the guest needs 1024
>> events, it can schedule them itself (and then it knows the measurement
>> is very inaccurate due to sampling)
>>
> Guest os of linux does schedule them. By default, guest kernel enables
> X86_PMC_IDX_MAX (64) events on a specific vpcu at the same time. See
> function kvm_pmu_enable and kvm_add_event.
>
> Exposing is different from disable/enable. If we expose/hide events when
> enable/disable events, it consumes too much cpu resources as host kernel need to
> create/delete the events frequently.
>
> 1024 is just the upper limitations that host could _create_ perf_event for
> the guest os instance. If guest os is linux, most active events in host
> for this guest os is VCPU_NUM*64.
>

I see. So you create events in the host but restrict the number active
at the same time. This should work.

64 is way too much for a single vcpu, but this can be tuned.

>>> As host kernel saves/accumulate data in perf_event->host_perf_shadow.counter,
>>> it doesn't matter to have one failure. next time when overflowing again, it will
>>> copy all data back to guest os.
>>>
>>>
>> Next time we may fail too. And next time as well.
>>
> How to process the failure? Kill the guest os? :)
>

Good question...

Failure can come from memory hotunplug of the perf region, anything else?

If that's the only cause we can kill the guest or raise an MCE.

>>>>
>>>>
>>> It doesn't matter. There is only one potential race between host kernel and
>>> guest kernel. When guest vmexits to host, it wouldn't access data pointed by
>>> shadow->guest_event_addr. Above kvm_write_guest happens with the same vpcu.
>>> So we just need make sure guest os vcpu accesses guest_perf_shadow->counter.overflows
>>> atomically.
>>>
>>>
>> Well, without per-vcpu events, you can't guarantee this.
>>
>> With per-vcpu events, I agree.
>>
> Frankly, I used per-vcpu in the beginning, but move to per-kvm after checking
> every possible race issues.
>

Okay. I'll need to think about it. I have a strong preference for
per-vcpu since it is a lot simpler in the host, similar to real
hardware, and inherently scalable. So I need a lot of convincing to
move to per-vm.

>> 1) IIUC exclude_user and exclude_kernel should just work. They work by
>> counting only when the cpl matches, yes? The hardware cpl is available
>> and valid in the guest.
>>
> Good pointer! Let me do some experiments to make sure it does work.
>
>

Note, that only works correctly if we disable the counters on the
guest->host switch; otherwise we count hypervisor code as cpl 0.

>> 2) We should atomically enable/disable the hardware performance counter
>> during guest entry/exit, like during ordinary context switch, so that
>> the guest doesn't measure host code (for example, ip would be
>> meaningless).
>>
> I once checked it. At least under the vcpu thread context after vmexit, host
> code execution is for the guest. It's reasonable to count this part. If the
> vcpu thread is scheduled out, host kernel disables all events binding to this vcpu
> thread. ip is meaningless if NMI happens in host code path, but host kernel would
> accumulate the overflow count into host_perf_shadow->counter.overflows. Next time
> when NMI happens in guest os, host kernel inject NMI guest kernel, so guest uses
> that pt_regs->ip.
>

For a cycle counter it might make sense (guest sees it spends time in
sensitive instructions that are trapped), but what about things like
DTLB misses or cache references? It will see large counters for these
instructions but no way to improve them.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-06-23 08:09:48

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/23/2010 04:43 AM, Zhang, Yanmin wrote:
>
>> A reset does not involve destroying a vm; you have to clean up as part
>> of the rest process.
>>
> What does 'reset' here mean? Is it a reboot or halt? If it's a halt, it involves
> destroying a vm. If a host user just kills the qemu process, is it a reset involving
> destroying a vm?
>

'reset' is either a complete reset ('system_reset' on the qemu monitor,
cycles through the bios etc.) or just an INIT signal to one vcpu.
Neither involves destroying a vm.

>> You aren't guaranteed a reboot notifier will be called. On the other
>> hand, we need a kexec handler.
>>
> ordinary kexec calls all reboot notifiers. Only crash kexec doesn't call them.
> I will implement a machine_ops.crash_shutdown callback.
>

Thanks.

--
error compiling committee.c: too many arguments to function

2010-06-24 03:35:40

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Wed, 2010-06-23 at 08:51 +0300, Avi Kivity wrote:
> On 06/23/2010 06:12 AM, Zhang, Yanmin wrote:
> >>>
> >>> This design is to deal with a task context perf collection in guest os.
> >>> Scenario 1:
> >>> 1) guest os starts to collect statistics of process A on vcpu 0;
> >>> 2) process A is scheduled to vcpu 1. Then, the perf_event at host side need
> >>> to be moved to VCPU 1 's thread. With the per KVM instance design, we needn't
> >>> move host_perf_shadow among vcpus.
> >>>
> >>>
> >> First, the guest already knows how to deal with per-cpu performance
> >> monitors, since that's how most (all) hardware works. So we aren't
> >> making the guest more complex, and on the other hand we simplify the host.
> >>
> > I agree that we need keep things simple.
> >
> >
> >> Second, if process A is migrated, and the guest uses per-process
> >> counters, the guest will need to stop/start the counter during the
> >> migration. This will cause the host to migrate the counter,
> >>
> > Agree. My patches do so.
> >
> > Question: Where does host migrate the counter?
> > The perf event at host side is bound to specific vcpu thread.
> >
>
> If the perf event is bound to the vm, not a vcpu, then on guest process
> migration you will have to disable it on one vcpu and enable it on the
> other, no?
I found we start from different points. This patch is to implement a para virt
interface based on the current perf implementation in the kernel.

Here is a diagram of the perf implementation layers. The picture below is not precise,
but it shows the layering. Ingo and Peter can correct me if something is wrong.

-------------------------------------------------
|              Perf Generic Layer               |
-------------------------------------------------
|             PMU Abstraction Layer             |
|            (a couple of callbacks)            |
-------------------------------------------------
|                    x86_pmu                    |
|          (operate real PMU hardware)          |
-------------------------------------------------


The upper layer is the perf generic layer. The 3rd layer is x86_pmu, which actually
manipulates the PMU hardware. Sometimes the 1st layer calls the 3rd directly, at event
initialization and when enabling/disabling all events.

My patch implements a kvm_pmu at the 2nd layer in the guest os; it issues hypercalls
that vmexit to the host. At the host side, the request mostly goes through all 3 layers
until it reaches the real hardware.
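
As a rough illustration of what the 2nd-layer kvm_pmu in the guest does (the
KVM_PERF_OP_* codes are from the patch; the hypercall number, the argument layout
and the function name here are only a sketch):

    static int kvm_pmu_enable_event(struct perf_event *event)
    {
            struct guest_perf_shadow *shadow = event->guest_shadow;

            /* vmexit to the host; the host enables the shadowed perf_event */
            return kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_ENABLE, shadow->id);
    }

At the host side the request then goes through the normal perf layers, so the host
perf core still arbitrates the real PMU.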

Most of your comments don't agree with the kvm_pmu design. Although you didn't say it
directly, I gather you want to implement the para virt interface at the 3rd layer in the
guest os. That means the guest os maintains a mapping between guest events and PMU counters.
That's why you strongly prefer per-vcpu event management and referring to events by idx.
If we implement it at the 3rd layer (or something like that, although you might say I don't
like that layer...) in the guest, we need to bypass the 1st and 2nd layers in the host kernel
when processing guest os events. Eventually, we would almost be adding a new layer under
x86_pmu to arbitrate between perf PMU requests and KVM guest event requests.

My current patch routes the calls through the whole perf stack at the host side.
The upper layer arranges perf event scheduling onto the PMU hardware. Applications don't
know when their events will actually be scheduled onto the real hardware, as they needn't know.


>
> >> so while we
> >> didn't move the counter to a different vcpu,
> >>
> > Disagree here. If process A on vcpu 0 in guest os is migrated to vcpu 1,
> > host has to move process A's perf_event to vcpu 1 thread.
> >
>
> Sorry, I'm confused now (lost track of our example). But whatever we
> do, if a guest process is migrated, the host will have to migrate the
> perf event, yes?
>

2010-06-24 08:00:11

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 3/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/24/2010 06:36 AM, Zhang, Yanmin wrote:
>
>> If the perf event is bound to the vm, not a vcpu, then on guest process
>> migration you will have to disable it on one vcpu and enable it on the
>> other, no?
>>
> I found we start from different points. This patch is to implement a para virt
> interface based on current perf implementation in kernel.
>

The words 'current perf implementation' are scary. I'm after a long
term stable interface. My goals are a simple interface (so it is easy
to implement on both sides, easy to scale, and resists implementation
changes in guest and host), live migration support, and good documentation.

Since most of our infrastructure is for emulating hardware, I tend
towards hardware-like interfaces. These tend to retain all state in
registers so they work well with live migration.

While realistically I don't expect other OSes to implement this
interface, I would like to design it so it would be easy to do so. That
will help Linux as well in case the perf implementation changes.

> Here is a diagram about perf implementation layers. Below picture is not precise,
> but it could show perf layers. Ingo and Peter could correct me if something is wrong.
>
> -------------------------------------------------
> |              Perf Generic Layer               |
> -------------------------------------------------
> |             PMU Abstraction Layer             |
> |            (a couple of callbacks)            |
> -------------------------------------------------
> |                    x86_pmu                    |
> |          (operate real PMU hardware)          |
> -------------------------------------------------
>
>
> The upper layer is perf generic layer. The 3rd layer is x86_pmu which really
> manipulate PMU hardware. Sometimes, 1st calls 3rd directly at event initialization
> and enable/disable all events.
>
> My patch implements a kvm_pmu at the 2nd layer in guest os, to call hypercall to vmexit
> to host. At host side, mostly it would go through the 3 layers till accessing real
> hardware.
>

Ok.

> Most of your comments don't agree with the kvm_pmu design. Although you didn't say
> directly, I know that perhaps you want to implement para virt interface at 3rd layer
> in guest os. That means guest os maintains a mapping between guest event and PMU counters.
> That's why you strongly prefer per-vcpu event managements and idx reference to event.
>

The conclusion is correct, but I arrived at it from a different
direction. I'm not really familiar with perf internals (do you have
pointers I could study?). My preference comes from the desire to retain
all state in guest-visible registers or memory. That simplifies live
migration significantly. Keeping things per-vcpu simplifies the interface.

> If we implement it at 3rd layer (or something like that although you might say I don't
> like that layer...) in guest, we need bypass 1st and 2nd layers in host kernel when
> processing guest os event. Eventually, we almost add a new layer under x86_pmu to arbitrate
> between perf PMU request and KVM guest event request.
>
> My current patch arranges the calling to go through the whole perf stack at host side.
> The upper layer arranges perf event scheduling on PMU hardware. Applications don't know
> when its event will be really scheduled to real hardware as they needn't know.
>

No, I don't think we should bypass the perf stack on the host. It is
important since the perf stack arbitrates a scarce resource that needs
to be shared with other users on the host.

The way I see it, pvpmu can easily expose an interface that is
hardware-like: a process context host perf event corresponds to a guest
vcpu context performance counter. The guest already knows how to
convert vcpu context hardware counters to process context hardware
counters, and how to multiplex multiple software visible perf events on
limited hardware resources.

All three layers would be involved on both guest and host. When I
suggest to use WRMSR and RDPMC to access pvpmu, that doesn't mean it
accesses the real pmu; it's just a hardware-like interface to access a
vcpu-context/per-thread counter on the host. An advantage of an MSR
interface is that we have infrastructure to live migrate the state
associated with it.
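
Purely to illustrate the shape of such an interface (everything below is made up
on the spot; the MSR names and indices are not proposed values, and only the existing
0x4b564dxx numbering convention and wrmsrl()/rdmsrl() are real):

    #define MSR_KVM_PV_PMC0_EVTSEL  0x4b564d10      /* hypothetical */
    #define MSR_KVM_PV_PMC0_CTR     0x4b564d11      /* hypothetical */

    u64 evtsel = 0x4300c0;  /* retired instructions, USR|OS, enabled */
    u64 count;

    /* the guest programs the vcpu-context counter like a real PMC ... */
    wrmsrl(MSR_KVM_PV_PMC0_EVTSEL, evtsel);

    /* ... and reads it back; the host backs it with a per-thread perf_event */
    rdmsrl(MSR_KVM_PV_PMC0_CTR, count);

The guest-visible state is then just a handful of MSR values, which the existing
save/restore code can migrate.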

Having the host see guest process context events is not useful IMO. We
can't allow the guest to create unlimited events, so the multiplexing
code will still be needed. Because of that, we may as well restrict
ourselves to vcpu context events, which is how real hardware works.

If there is concern about a guest task migrating to a different vcpu and
requiring the destruction and re-creation of a perf event on the host
side, that can be addressed by a cache on the host side. The cache
would be invisible ("non-architectural" from the guest's point of view),
and so we would not need to live migrate it. However, I don't believe
such a cache is really necessary, or that it's a good idea for large guests.

--
error compiling committee.c: too many arguments to function