2010-06-21 09:31:57

by Yanmin Zhang

Subject: [PATCH V2 4/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

This 4th patch implements the para-virt perf interface on the guest side. The guest registers each perf event with the host through KVM_PERF_OP hypercalls (open/enable/disable/read/close) and reads counter values and overflow counts back from a per-event shadow structure it shares with the host.
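
For reference, the guest-visible pieces of the interface as this patch uses them (the authoritative definitions live in the shared header added earlier in this series; field order and exact types below are an approximation reconstructed from the code, not a copy):

	/* one instance per event, allocated by the guest and shared with the host */
	struct guest_perf_event {
		u64		count;		/* latest counter value, updated by the host */
		atomic_t	overflows;	/* overflows since the guest last cleared it */
	};

	struct guest_perf_shadow {
		int			id;	/* per-guest unique event id */
		struct guest_perf_event	counter;	/* phys addr passed to the host at open */
	};

	/* passed by physical address to KVM_PERF_OP_OPEN */
	struct guest_perf_event_param {
		__u64	attr_addr;		/* physical address of a struct guest_perf_attr */
		__u64	guest_event_addr;	/* physical address of shadow->counter */
		int	id;
	};

struct guest_perf_attr mirrors the perf_event_attr fields copied in __kvm_hw_perf_event_init() below.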

Signed-off-by: Zhang Yanmin <[email protected]>

---

--- linux-2.6_tip0620/arch/x86/Kconfig 2010-06-21 15:19:39.180999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/Kconfig 2010-06-21 15:21:39.309999849 +0800
@@ -552,6 +552,14 @@ config KVM_GUEST
This option enables various optimizations for running under the KVM
hypervisor.

+config KVM_PERF
+ bool "KVM Guest perf support"
+ select PARAVIRT
+ select PERF_EVENTS
+ ---help---
+ This option enables paravirtualized perf support for collecting
+ performance statistics in a guest os running under the KVM hypervisor.
+
source "arch/x86/lguest/Kconfig"

config PARAVIRT
--- linux-2.6_tip0620/arch/x86/kernel/cpu/perf_event.c 2010-06-21 15:19:39.964999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kernel/cpu/perf_event.c 2010-06-21 16:44:36.602999849 +0800
@@ -25,6 +25,7 @@
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
+#include <linux/kvm_para.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -583,10 +584,20 @@ static void x86_pmu_disable_all(void)
}
}

+#ifdef CONFIG_KVM_PERF
+static int kvm_hw_perf_enable(void);
+static int kvm_hw_perf_disable(void);
+#endif
+
void hw_perf_disable(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

+#ifdef CONFIG_KVM_PERF
+ if (!kvm_hw_perf_disable())
+ return;
+#endif
+
if (!x86_pmu_initialized())
return;

@@ -810,6 +821,11 @@ void hw_perf_enable(void)
struct hw_perf_event *hwc;
int i, added = cpuc->n_added;

+#ifdef CONFIG_KVM_PERF
+ if (!kvm_hw_perf_enable())
+ return;
+#endif
+
if (!x86_pmu_initialized())
return;

@@ -1264,6 +1280,7 @@ x86_get_event_constraints(struct cpu_hw_
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"
+#include "perf_event_kvm.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
@@ -1317,6 +1334,11 @@ void __init init_hw_perf_events(void)

pr_info("Performance Events: ");

+#ifdef CONFIG_KVM_PERF
+ if (!kvm_init_hw_perf_events())
+ return;
+#endif
+
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
err = intel_pmu_init();
@@ -1541,6 +1563,13 @@ const struct pmu *hw_perf_event_init(str
const struct pmu *tmp;
int err;

+#ifdef CONFIG_KVM_PERF
+ if (kvm_para_available()) {
+ tmp = kvm_hw_perf_event_init(event);
+ return tmp;
+ }
+#endif
+
err = __hw_perf_event_init(event);
if (!err) {
/*
--- linux-2.6_tip0620/arch/x86/kernel/cpu/perf_event_kvm.c 1970-01-01 08:00:00.000000000 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kernel/cpu/perf_event_kvm.c 2010-06-21 16:44:56.735999849 +0800
@@ -0,0 +1,426 @@
+/*
+ * Performance events
+ *
+ * Copyright (C) 2010 Intel Corporation
+ * Zhang Yanmin <[email protected]>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_KVM_PERF
+
+static atomic_t guest_perf_id; /* Global id counter per guest OS */
+
+static inline int get_new_perf_event_id(void)
+{
+ return atomic_inc_return(&guest_perf_id);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static bool kvm_reserve_pmc_hardware(void)
+{
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ return true;
+}
+
+static void kvm_release_pmc_hardware(void)
+{
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+}
+
+#else
+
+static bool kvm_reserve_pmc_hardware(void) { return true; }
+static void kvm_release_pmc_hardware(void) {}
+
+#endif
+
+static void kvm_hw_perf_event_destroy(struct perf_event *event)
+{
+ struct guest_perf_shadow *shadow = event->guest_perf_shadow;
+
+ BUG_ON(!shadow);
+ kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_CLOSE, shadow->id);
+
+ kfree(shadow);
+ event->guest_perf_shadow = NULL;
+
+ if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
+ kvm_release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+/* The guest might also run as a host */
+static int check_ontop_guest_overflow(struct perf_event *event, int overflows)
+{
+ struct host_perf_shadow *host_shadow = event->host_perf_shadow;
+ if (!host_shadow)
+ return 0;
+
+ if (perf_guest_cbs)
+ perf_guest_cbs->copy_event_to_shadow(event, overflows);
+
+ return 1;
+}
+
+static int
+check_event_overflow(struct perf_event *event, struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct guest_perf_shadow *guest_shadow = event->guest_perf_shadow;
+ s32 overflows;
+ int i;
+ int handled = 0;
+
+ local64_set(&event->count, guest_shadow->counter.count);
+
+again:
+ overflows = atomic_read(&guest_shadow->counter.overflows);
+ if (atomic_cmpxchg(&guest_shadow->counter.overflows, overflows, 0) !=
+ overflows)
+ goto again;
+
+ if (check_ontop_guest_overflow(event, overflows)) {
+ handled = 1;
+ return handled;
+ }
+
+ for (i = 0; i < overflows; i++) {
+ perf_sample_data_init(&data, 0);
+
+ data.period = event->hw.last_period;
+
+ if (event->overflow_handler)
+ event->overflow_handler(event, 1, &data, regs);
+ else
+ perf_event_output(event, 1, &data, regs);
+
+ handled++;
+ }
+
+ return handled;
+}
+
+static int
+kvm_check_event_overflow(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ int i, max_count;
+ int handled = 0;
+
+ max_count = X86_PMC_IDX_MAX;
+ for (i = 0; i < max_count; i++) {
+ event = cpuc->event_list[i];
+ if (event)
+ handled += check_event_overflow(event, regs);
+ }
+ return handled;
+}
+
+static DEFINE_PER_CPU(int, kvm_nmi_entered);
+
+static int kvm_x86_pmu_handle_irq(struct pt_regs *regs)
+{
+ int handled = 0;
+
+ if (percpu_read(kvm_nmi_entered))
+ return 0;
+
+ percpu_write(kvm_nmi_entered, 1);
+
+ handled = kvm_check_event_overflow(regs);
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ percpu_write(kvm_nmi_entered, 0);
+
+ return handled;
+}
+
+static int __kprobes
+kvm_perf_event_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (!atomic_read(&active_events))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ kvm_x86_pmu_handle_irq(regs);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block kvm_perf_event_nmi_notifier = {
+ .notifier_call = kvm_perf_event_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static int kvm_add_event(struct perf_event *event)
+{
+ int i, max_count;
+ unsigned long flags;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int ret = -1;
+
+ local_irq_save(flags);
+ max_count = X86_PMC_IDX_MAX;
+
+ if (cpuc->n_events >= max_count) {
+ local_irq_restore(flags);
+ return -ENOSPC;
+ }
+ for (i = 0; i < max_count; i++) {
+ if (cpuc->event_list[i] == NULL) {
+ cpuc->event_list[i] = event;
+ cpuc->n_events++;
+ ret = 0;
+ break;
+ }
+ }
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int kvm_del_event(struct perf_event *event)
+{
+ int i, max_count;
+ unsigned long flags;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int ret = -1;
+
+ local_irq_save(flags);
+ max_count = X86_PMC_IDX_MAX;
+ for (i = 0; i < max_count; i++) {
+ if (cpuc->event_list[i] == event) {
+ cpuc->event_list[i] = NULL;
+ cpuc->n_events--;
+ ret = 0;
+ break;
+ }
+ }
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int kvm_pmu_enable(struct perf_event *event)
+{
+ int ret;
+ struct guest_perf_shadow *shadow = event->guest_perf_shadow;
+
+ if (kvm_add_event(event))
+ return -1;
+
+ ret = kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_ENABLE, shadow->id);
+ return ret;
+}
+
+static void kvm_pmu_disable(struct perf_event *event)
+{
+ struct guest_perf_shadow *shadow = event->guest_perf_shadow;
+ kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_DISABLE, shadow->id);
+ local64_set(&event->count, shadow->counter.count);
+ kvm_del_event(event);
+}
+
+static void kvm_pmu_read(struct perf_event *event)
+{
+ int ret;
+ struct guest_perf_shadow *shadow = event->guest_perf_shadow;
+ ret = kvm_hypercall2(KVM_PERF_OP, KVM_PERF_OP_READ, shadow->id);
+ if (!ret)
+ local64_set(&event->count, shadow->counter.count);
+ return;
+}
+
+static void kvm_pmu_unthrottle(struct perf_event *event)
+{
+ return;
+}
+
+static const struct pmu kvm_pmu = {
+ .enable = kvm_pmu_enable,
+ .disable = kvm_pmu_disable,
+ .start = kvm_pmu_enable,
+ .stop = kvm_pmu_disable,
+ .read = kvm_pmu_read,
+ .unthrottle = kvm_pmu_unthrottle,
+};
+
+static int kvm_default_x86_handle_irq(struct pt_regs *regs)
+{
+ return 1;
+}
+
+int __init kvm_init_hw_perf_events(void)
+{
+ if (!kvm_para_available())
+ return -1;
+
+ x86_pmu.handle_irq = kvm_default_x86_handle_irq;
+
+ pr_cont("KVM PARA PMU driver.\n");
+ register_die_notifier(&kvm_perf_event_nmi_notifier);
+
+ return 0;
+}
+
+static __u64 kvm_get_pte_phys(void *virt_addr)
+{
+ __u64 pte_phys;
+
+#ifdef CONFIG_HIGHPTE
+ struct page *page;
+ unsigned long dst = (unsigned long) virt_addr;
+
+ page = kmap_atomic_to_page(virt_addr);
+ pte_phys = page_to_pfn(page);
+ pte_phys <<= PAGE_SHIFT;
+ pte_phys += (dst & ~(PAGE_MASK));
+#else
+ pte_phys = (unsigned long)__pa(virt_addr);
+#endif
+ return pte_phys;
+}
+
+static int __kvm_hw_perf_event_init(struct perf_event *event)
+{
+ int err;
+ unsigned long result;
+ __u64 param_addr;
+ struct guest_perf_shadow *shadow = NULL;
+ struct guest_perf_event_param guest_param;
+ struct guest_perf_attr *attr = NULL;
+
+ err = 0;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
+ if (!shadow) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ shadow->id = get_new_perf_event_id();
+ event->guest_perf_shadow = shadow;
+
+ if (!atomic_inc_not_zero(&active_events)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_events) == 0) {
+ if (!kvm_reserve_pmc_hardware())
+ err = -EBUSY;
+ }
+ if (!err)
+ atomic_inc(&active_events);
+ mutex_unlock(&pmc_reserve_mutex);
+ if (err)
+ goto out;
+ }
+
+ event->destroy = kvm_hw_perf_event_destroy;
+ attr->type = event->attr.type;
+ attr->config = event->attr.config;
+ attr->sample_period = event->attr.sample_period;
+ attr->read_format = event->attr.read_format;
+ attr->flags = event->attr.flags;
+ attr->bp_type = event->attr.bp_type;
+ attr->bp_addr = event->attr.bp_addr;
+ attr->bp_len = event->attr.bp_len;
+
+ guest_param.id = shadow->id;
+ guest_param.attr_addr = kvm_get_pte_phys(attr);
+ guest_param.guest_event_addr = kvm_get_pte_phys(&shadow->counter);
+ param_addr = kvm_get_pte_phys(&guest_param);
+ result = kvm_hypercall3(KVM_PERF_OP, KVM_PERF_OP_OPEN,
+ (unsigned long) param_addr, param_addr >> 32);
+
+ if (result)
+ err = result;
+
+out:
+ if (err && shadow) {
+ kfree(shadow);
+ event->guest_perf_shadow = NULL;
+ }
+ kfree(attr);
+
+ return err;
+}
+
+const struct pmu *kvm_hw_perf_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_PERF))
+ return ERR_PTR(-ENOSYS);
+
+ err = __kvm_hw_perf_event_init(event);
+ if (err)
+ return ERR_PTR(err);
+
+ return &kvm_pmu;
+}
+
+static int kvm_hw_perf_enable(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!kvm_para_available())
+ return -1;
+
+ if (cpuc->enabled)
+ return 0;
+
+ if (cpuc->n_added)
+ cpuc->n_added = 0;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ return 0;
+}
+
+static int kvm_hw_perf_disable(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!kvm_para_available())
+ return -1;
+
+ if (!cpuc->enabled)
+ return 0;
+
+ cpuc->n_added = 0;
+ cpuc->enabled = 0;
+ barrier();
+
+ return 0;
+}
+
+#endif
+
--- linux-2.6_tip0620/Documentation/kvm/cpuid.txt 2010-06-21 15:19:26.199999849 +0800
+++ linux-2.6_tip0620perfkvm/Documentation/kvm/cpuid.txt 2010-06-21 15:21:39.312999849 +0800
@@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP ||
KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
|| || 0x4b564d00 and 0x4b564d01
------------------------------------------------------------------------------
+KVM_FEATURE_PV_PERF || 4 || kvm paravirt perf event
+ || || available
+------------------------------------------------------------------------------
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
|| || per-cpu warps are expected in
|| || kvmclock.


2010-06-22 08:25:21

by Jes Sorensen

Subject: Re: [PATCH V2 4/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

On 06/21/10 11:31, Zhang, Yanmin wrote:
> @@ -583,10 +584,20 @@ static void x86_pmu_disable_all(void)
> }
> }
>
> +#ifdef CONFIG_KVM_PERF
> +static int kvm_hw_perf_enable(void);
> +static int kvm_hw_perf_disable(void);
> +#endif

Please put these prototypes into a header ... and create dummy stubs for
them when CONFIG_KVM_PERF is not set.

> void hw_perf_disable(void)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>
> +#ifdef CONFIG_KVM_PERF
> + if (!kvm_hw_perf_disable())
> + return;
> +#endif

If you stub them out we can avoid all the ugly #ifdefs

> @@ -810,6 +821,11 @@ void hw_perf_enable(void)
> struct hw_perf_event *hwc;
> int i, added = cpuc->n_added;
>
> +#ifdef CONFIG_KVM_PERF
> + if (!kvm_hw_perf_enable())
> + return;
> +#endif

and here....

> @@ -1317,6 +1334,11 @@ void __init init_hw_perf_events(void)
>
> pr_info("Performance Events: ");
>
> +#ifdef CONFIG_KVM_PERF
> + if (!kvm_init_hw_perf_events())
> + return;
> +#endif

and again here :)
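
Something along these lines would do it (header name and stub return values are just a suggestion; a nonzero return keeps the callers on the existing bare-metal path, and the real definitions in perf_event_kvm.c would drop their static qualifier to match):

	/* e.g. arch/x86/include/asm/perf_kvm.h */
	#ifdef CONFIG_KVM_PERF
	int kvm_hw_perf_enable(void);
	int kvm_hw_perf_disable(void);
	int kvm_init_hw_perf_events(void);
	#else
	static inline int kvm_hw_perf_enable(void) { return -1; }
	static inline int kvm_hw_perf_disable(void) { return -1; }
	static inline int kvm_init_hw_perf_events(void) { return -1; }
	#endif

Then the callers can do the check unconditionally.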

Cheers,
Jes

2010-06-22 09:10:26

by Yanmin Zhang

Subject: Re: [PATCH V2 4/5] Para virt interface of perf to support kvm guest os statistics collection in guest os

On Tue, 2010-06-22 at 10:24 +0200, Jes Sorensen wrote:
> On 06/21/10 11:31, Zhang, Yanmin wrote:
> > @@ -583,10 +584,20 @@ static void x86_pmu_disable_all(void)
> > }
> > }
> >
> > +#ifdef CONFIG_KVM_PERF
> > +static int kvm_hw_perf_enable(void);
> > +static int kvm_hw_perf_disable(void);
> > +#endif
>
> Please put these prototypes into a header ... and create dummy stubs for
> them when CONFIG_KVM_PERF is not set.
Ok. I just didn't want to touch too much of the generic perf code.

>
> > void hw_perf_disable(void)
> > {
> > struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> >
> > +#ifdef CONFIG_KVM_PERF
> > + if (!kvm_hw_perf_disable())
> > + return;
> > +#endif
>
> If you stub them out we can avoid all the ugly #ifdefs
Ok.

>
> > @@ -810,6 +821,11 @@ void hw_perf_enable(void)
> > struct hw_perf_event *hwc;
> > int i, added = cpuc->n_added;
> >
> > +#ifdef CONFIG_KVM_PERF
> > + if (!kvm_hw_perf_enable())
> > + return;
> > +#endif
>
> and here....
Ok.

>
> > @@ -1317,6 +1334,11 @@ void __init init_hw_perf_events(void)
> >
> > pr_info("Performance Events: ");
> >
> > +#ifdef CONFIG_KVM_PERF
> > + if (!kvm_init_hw_perf_events())
> > + return;
> > +#endif
>
> and again here :)
Ok. Peter is working on a couple of patches to support multiple PMUs. His patches
change the pmu definition, and we might move some of this code into the callbacks.
That will make things much clearer.

Yanmin