2010-06-21 09:31:26

by Yanmin Zhang

[permalink] [raw]
Subject: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

The 2nd patch is to change the definition of perf_event to facilitate
perf attr copy when a hypercall happens.

Signed-off-by: Zhang Yanmin <[email protected]>

---

--- linux-2.6_tip0620/include/linux/perf_event.h 2010-06-21 15:19:52.821999849 +0800
+++ linux-2.6_tip0620perfkvm/include/linux/perf_event.h 2010-06-21 16:53:49.283999849 +0800
@@ -188,7 +188,10 @@ struct perf_event_attr {
__u64 sample_type;
__u64 read_format;

- __u64 disabled : 1, /* off by default */
+ union {
+ __u64 flags;
+ struct {
+ __u64 disabled : 1, /* off by default */
inherit : 1, /* children inherit it */
pinned : 1, /* must always be on PMU */
exclusive : 1, /* only group on PMU */
@@ -217,6 +220,8 @@ struct perf_event_attr {
mmap_data : 1, /* non-exec mmap data */

__reserved_1 : 46;
+ };
+ };

union {
__u32 wakeup_events; /* wakeup every n events */
@@ -465,12 +470,6 @@ enum perf_callchain_context {
# include <asm/local64.h>
#endif

-struct perf_guest_info_callbacks {
- int (*is_in_guest) (void);
- int (*is_user_mode) (void);
- unsigned long (*get_guest_ip) (void);
-};
-
#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif
@@ -753,6 +752,20 @@ struct perf_event {

perf_overflow_handler_t overflow_handler;

+ /*
+ * pointers used by kvm perf paravirt interface.
+ *
+ * 1) Used in host kernel and points to host_perf_shadow which
+ * has information about guest perf_event
+ */
+ void *host_perf_shadow;
+ /*
+ * 2) Used in guest kernel and points to guest_perf_shadow which
+ * is used as a communication area with host kernel. Host kernel
+ * copies overflow data to it when an event overflows.
+ */
+ void *guest_perf_shadow;
+
#ifdef CONFIG_EVENT_TRACING
struct ftrace_event_call *tp_event;
struct event_filter *filter;
@@ -838,6 +851,16 @@ struct perf_output_handle {
int sample;
};

+struct perf_guest_info_callbacks {
+ /* Support collect guest statistics from host side */
+ int (*is_in_guest) (void);
+ int (*is_user_mode) (void);
+ unsigned long (*get_guest_ip) (void);
+
+ /* Support paravirt interface */
+ void (*copy_event_to_shadow) (struct perf_event *event, int overflows);
+};
+
#ifdef CONFIG_PERF_EVENTS

/*
@@ -871,6 +894,10 @@ perf_event_create_kernel_counter(struct
perf_overflow_handler_t callback);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
+extern void perf_event_output(struct perf_event *event, int nmi,
+ struct perf_sample_data *data, struct pt_regs *regs);
+void perf_event_attach(struct perf_event *event);
+void perf_event_detach(struct perf_event *event);

struct perf_sample_data {
u64 type;
@@ -1023,6 +1050,14 @@ perf_event_task_sched_in(struct task_str
static inline void
perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next) { }
+
+static inline void
+perf_event_output(struct perf_event *event, int nmi,
+ struct perf_sample_data *data, struct pt_regs *regs) { }
+
+static inline void perf_event_attach(struct perf_event *event) { }
+static inline void perf_event_detach(struct perf_event *event) { }
+
static inline void
perf_event_task_tick(struct task_struct *task) { }
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
--- linux-2.6_tip0620/kernel/watchdog.c 2010-06-21 15:20:48.517999849 +0800
+++ linux-2.6_tip0620perfkvm/kernel/watchdog.c 2010-06-21 15:21:39.315999849 +0800
@@ -197,8 +197,6 @@ static struct perf_event_attr wd_hw_attr
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
.size = sizeof(struct perf_event_attr),
- .pinned = 1,
- .disabled = 1,
};

/* Callback function for perf event subsystem */
@@ -361,6 +359,8 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period();
+ wd_attr->pinned = 1;
+ wd_attr->disabled = 1;
event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
--- linux-2.6_tip0620/kernel/perf_event.c 2010-06-21 15:20:49.013999849 +0800
+++ linux-2.6_tip0620perfkvm/kernel/perf_event.c 2010-06-21 16:52:35.432999849 +0800
@@ -32,6 +32,7 @@
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/kvm_para.h>

#include <asm/irq_regs.h>

@@ -747,6 +748,7 @@ static int group_can_go_on(struct perf_e
*/
if (event->attr.exclusive && cpuctx->active_oncpu)
return 0;
+
/*
* Otherwise, try to add it if all previous groups were able
* to go on.
@@ -1613,6 +1615,7 @@ void perf_event_task_tick(struct task_st
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
int rotate = 0;
+ int adjust_freq = 1;

if (!atomic_read(&nr_events))
return;
@@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
rotate = 1;

- perf_ctx_adjust_freq(&cpuctx->ctx);
- if (ctx)
- perf_ctx_adjust_freq(ctx);
+#ifdef CONFIG_KVM_PERF
+ if (kvm_para_available()) {
+ /*
+ * perf_ctx_adjust_freq causes lots of pmu->read which would
+ * trigger too many vmexit to host kernel. We disable it
+ * under para virt situation
+ */
+ adjust_freq = 0;
+ }
+#endif
+
+ if (adjust_freq) {
+ perf_ctx_adjust_freq(&cpuctx->ctx);
+ if (ctx)
+ perf_ctx_adjust_freq(ctx);
+ }

if (!rotate)
return;
@@ -3434,7 +3450,7 @@ void perf_prepare_sample(struct perf_eve
}
}

-static void perf_event_output(struct perf_event *event, int nmi,
+void perf_event_output(struct perf_event *event, int nmi,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -5261,6 +5277,47 @@ perf_event_create_kernel_counter(struct
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

+void perf_event_attach(struct perf_event *event)
+{
+ struct perf_event_context *old_ctx, *new_ctx;
+
+ old_ctx = event->ctx;
+ new_ctx = find_get_context(current->pid, -1);
+ if (old_ctx != new_ctx) {
+ if (old_ctx) {
+ /* Delete from old ctx before joining new ctx */
+ mutex_lock(&old_ctx->mutex);
+ raw_spin_lock(&old_ctx->lock);
+ list_del_event(event, old_ctx);
+ raw_spin_unlock(&old_ctx->lock);
+ mutex_unlock(&old_ctx->mutex);
+ put_ctx(old_ctx);
+ }
+
+ mutex_lock(&new_ctx->mutex);
+ raw_spin_lock(&new_ctx->lock);
+ list_add_event(event, new_ctx);
+ event->ctx = new_ctx;
+ raw_spin_unlock(&new_ctx->lock);
+ mutex_unlock(&new_ctx->mutex);
+ } else
+ put_ctx(new_ctx);
+
+ perf_event_enable(event);
+}
+EXPORT_SYMBOL_GPL(perf_event_attach);
+
+void perf_event_detach(struct perf_event *event)
+{
+ /*
+ * Just disable the event and don't del it from
+ * ctx->event_list in case there is a race condition
+ * with perf_event_read_value
+ */
+ perf_event_disable(event);
+}
+EXPORT_SYMBOL_GPL(perf_event_detach);
+
/*
* inherit a event from parent task to child task:
*/


2010-06-21 12:00:47

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
> The 2nd patch is to change the definition of perf_event to facilitate
> perf attr copy when a hypercall happens.
>
> Signed-off-by: Zhang Yanmin<[email protected]>
>
> ---
>
> --- linux-2.6_tip0620/include/linux/perf_event.h 2010-06-21 15:19:52.821999849 +0800
> +++ linux-2.6_tip0620perfkvm/include/linux/perf_event.h 2010-06-21 16:53:49.283999849 +0800
> @@ -188,7 +188,10 @@ struct perf_event_attr {
> __u64 sample_type;
> __u64 read_format;
>
>

Assuming these flags are available to the guest?

> - __u64 disabled : 1, /* off by default */
> + union {
> + __u64 flags;
> + struct {
> + __u64 disabled : 1, /* off by default */
> inherit : 1, /* children inherit it */
>

inherit is meaningless for a guest.

> pinned : 1, /* must always be on PMU */
>

We cannot allow a guest to pin a counter.

The other flags are also problematic. I'd like to see virt-specific
flags (probably we'll only need kernel/user and nested_hv for nested
virtualization).

Something that is worrying is that we don't expose group information.
perf will multiplex the events for us, but there will be a loss in accuracy.

> #ifdef CONFIG_HAVE_HW_BREAKPOINT
> #include<asm/hw_breakpoint.h>
> #endif
> @@ -753,6 +752,20 @@ struct perf_event {
>
> perf_overflow_handler_t overflow_handler;
>
> + /*
> + * pointers used by kvm perf paravirt interface.
> + *
> + * 1) Used in host kernel and points to host_perf_shadow which
> + * has information about guest perf_event
> + */
> + void *host_perf_shadow;
>

Can we have real types instead of void pointers?

> + /*
> + * 2) Used in guest kernel and points to guest_perf_shadow which
> + * is used as a communication area with host kernel. Host kernel
> + * copies overflow data to it when an event overflows.
> + */
> + void *guest_perf_shadow;
>

It's strange to see both guest and host parts in the same patch.
Splitting to separate patches will really help review.

> @@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
> if (ctx&& ctx->nr_events&& ctx->nr_events != ctx->nr_active)
> rotate = 1;
>
> - perf_ctx_adjust_freq(&cpuctx->ctx);
> - if (ctx)
> - perf_ctx_adjust_freq(ctx);
> +#ifdef CONFIG_KVM_PERF
> + if (kvm_para_available()) {
> + /*
> + * perf_ctx_adjust_freq causes lots of pmu->read which would
> + * trigger too many vmexit to host kernel. We disable it
> + * under para virt situation
> + */
> + adjust_freq = 0;
> + }
> +#endif
>

Perhaps we can have a batch read interface which will read many counters
at once. This would reduce the number of exits. Also adjust the
frequency less frequently.

> +
> + if (adjust_freq) {
> + perf_ctx_adjust_freq(&cpuctx->ctx);
> + if (ctx)
> + perf_ctx_adjust_freq(ctx);
> + }
>
>

--
error compiling committee.c: too many arguments to function

2010-06-22 02:08:17

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Mon, 2010-06-21 at 15:00 +0300, Avi Kivity wrote:
> On 06/21/2010 12:31 PM, Zhang, Yanmin wrote:
> > The 2nd patch is to change the definition of perf_event to facilitate
> > perf attr copy when a hypercall happens.
> >
> > Signed-off-by: Zhang Yanmin<[email protected]>
> >
> > ---
> >
> > --- linux-2.6_tip0620/include/linux/perf_event.h 2010-06-21 15:19:52.821999849 +0800
> > +++ linux-2.6_tip0620perfkvm/include/linux/perf_event.h 2010-06-21 16:53:49.283999849 +0800
> > @@ -188,7 +188,10 @@ struct perf_event_attr {
> > __u64 sample_type;
> > __u64 read_format;
> >
> >
>
> Assuming these flags are available to the guest?
These flags are used by generic perf codes. To match with host kernel, we wish
guest os also use the flags.

>
> > - __u64 disabled : 1, /* off by default */
> > + union {
> > + __u64 flags;
> > + struct {
> > + __u64 disabled : 1, /* off by default */
> > inherit : 1, /* children inherit it */
> >
>
> inherit is meaningless for a guest.
Right. host kernel will reset it to 0 before create perf_event for guest os.
Here we couldn't delete the flag as it's used by perf generic codes.
I need separate the patch a little better. All definitions in include/linux/perf_event.h
are mostly perf generic code related. I'm very careful to change it.

>
> > pinned : 1, /* must always be on PMU */
> >
>
> We cannot allow a guest to pin a counter.
Ok. I will reset it in function kvm_pv_perf_op_open.

>
> The other flags are also problematic. I'd like to see virt-specific
> flags (probably we'll only need kernel/user and nested_hv for nested
> virtualization).
How about to add more comments around struct guest_perf_attr->flags that
guest os developers should take a look at include/linux/perf_event.h?
BTW, I will reset more flags to 0 in kvm_pv_perf_op_open.

>
> Something that is worrying is that we don't expose group information.
> perf will multiplex the events for us, but there will be a loss in accuracy.
>
> > #ifdef CONFIG_HAVE_HW_BREAKPOINT
> > #include<asm/hw_breakpoint.h>
> > #endif
> > @@ -753,6 +752,20 @@ struct perf_event {
> >
> > perf_overflow_handler_t overflow_handler;
> >
> > + /*
> > + * pointers used by kvm perf paravirt interface.
> > + *
> > + * 1) Used in host kernel and points to host_perf_shadow which
> > + * has information about guest perf_event
> > + */
> > + void *host_perf_shadow;
> >
>
> Can we have real types instead of void pointers?
I just want perf generic codes have less dependency on KVM codes.

>
> > + /*
> > + * 2) Used in guest kernel and points to guest_perf_shadow which
> > + * is used as a communication area with host kernel. Host kernel
> > + * copies overflow data to it when an event overflows.
> > + */
> > + void *guest_perf_shadow;
> >
>
> It's strange to see both guest and host parts in the same patch.
> Splitting to separate patches will really help review.
It's a little hard to split the patches if they change the same file. Perhaps
I could add more statements before the patch when I send it out.

>
> > @@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
> > if (ctx&& ctx->nr_events&& ctx->nr_events != ctx->nr_active)
> > rotate = 1;
> >
> > - perf_ctx_adjust_freq(&cpuctx->ctx);
> > - if (ctx)
> > - perf_ctx_adjust_freq(ctx);
> > +#ifdef CONFIG_KVM_PERF
> > + if (kvm_para_available()) {
> > + /*
> > + * perf_ctx_adjust_freq causes lots of pmu->read which would
> > + * trigger too many vmexit to host kernel. We disable it
> > + * under para virt situation
> > + */
> > + adjust_freq = 0;
> > + }
> > +#endif
> >
>
> Perhaps we can have a batch read interface which will read many counters
> at once.
It's a good idea. But that will touch many perf generic codes which causes it's hard
to maintain or follow future changes.

> This would reduce the number of exits. Also adjust the
> frequency less frequently.
Here it depends on process scheduler frequency, CONFIG_HZ.

>
> > +
> > + if (adjust_freq) {
> > + perf_ctx_adjust_freq(&cpuctx->ctx);
> > + if (ctx)
> > + perf_ctx_adjust_freq(ctx);
> > + }
> >
> >
>

2010-06-22 09:12:25

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/22/2010 05:08 AM, Zhang, Yanmin wrote:
>
>> Something that is worrying is that we don't expose group information.
>> perf will multiplex the events for us, but there will be a loss in accuracy.
>>
>>
>>> #ifdef CONFIG_HAVE_HW_BREAKPOINT
>>> #include<asm/hw_breakpoint.h>
>>> #endif
>>> @@ -753,6 +752,20 @@ struct perf_event {
>>>
>>> perf_overflow_handler_t overflow_handler;
>>>
>>> + /*
>>> + * pointers used by kvm perf paravirt interface.
>>> + *
>>> + * 1) Used in host kernel and points to host_perf_shadow which
>>> + * has information about guest perf_event
>>> + */
>>> + void *host_perf_shadow;
>>>
>>>
>> Can we have real types instead of void pointers?
>>
> I just want perf generic codes have less dependency on KVM codes.
>

One way to do that and retain type safety is to have

struct perf_client {
struct perf_client_ops *ops;
...
}

The client (kvm) can do

struct kvm_perf_client {
struct perf_client pc;
// kvm specific stuff
};

the callbacks receive struct perf_client and use container_of to reach
the kvm_perf_client that contains it.

>>> + /*
>>> + * 2) Used in guest kernel and points to guest_perf_shadow which
>>> + * is used as a communication area with host kernel. Host kernel
>>> + * copies overflow data to it when an event overflows.
>>> + */
>>> + void *guest_perf_shadow;
>>>
>>>
>> It's strange to see both guest and host parts in the same patch.
>> Splitting to separate patches will really help review.
>>
> It's a little hard to split the patches if they change the same file. Perhaps
> I could add more statements before the patch when I send it out.
>

With git, it's easy (once you're used to it):

# go back one commit:
git reset HEAD^
# selectively add bits:
git add -p
# commit first patch
git commit -s
# selectively add bits:
git add -p
# commit second patch
git commit -s


>>> @@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
>>> if (ctx&& ctx->nr_events&& ctx->nr_events != ctx->nr_active)
>>> rotate = 1;
>>>
>>> - perf_ctx_adjust_freq(&cpuctx->ctx);
>>> - if (ctx)
>>> - perf_ctx_adjust_freq(ctx);
>>> +#ifdef CONFIG_KVM_PERF
>>> + if (kvm_para_available()) {
>>> + /*
>>> + * perf_ctx_adjust_freq causes lots of pmu->read which would
>>> + * trigger too many vmexit to host kernel. We disable it
>>> + * under para virt situation
>>> + */
>>> + adjust_freq = 0;
>>> + }
>>> +#endif
>>>
>>>
>> Perhaps we can have a batch read interface which will read many counters
>> at once.
>>
> It's a good idea. But that will touch many perf generic codes which causes it's hard
> to maintain or follow future changes.
>

I'm talking about the guest/host interface. So you have one vmexit and
many host perf calls.

--
error compiling committee.c: too many arguments to function

2010-06-22 09:25:35

by Yanmin Zhang

[permalink] [raw]
Subject: Re: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On Tue, 2010-06-22 at 12:12 +0300, Avi Kivity wrote:
> On 06/22/2010 05:08 AM, Zhang, Yanmin wrote:
> >
> >> Something that is worrying is that we don't expose group information.
> >> perf will multiplex the events for us, but there will be a loss in accuracy.
> >>
> >>
> >>> #ifdef CONFIG_HAVE_HW_BREAKPOINT
> >>> #include<asm/hw_breakpoint.h>
> >>> #endif
> >>> @@ -753,6 +752,20 @@ struct perf_event {
> >>>
> >>> perf_overflow_handler_t overflow_handler;
> >>>
> >>> + /*
> >>> + * pointers used by kvm perf paravirt interface.
> >>> + *
> >>> + * 1) Used in host kernel and points to host_perf_shadow which
> >>> + * has information about guest perf_event
> >>> + */
> >>> + void *host_perf_shadow;
> >>>
> >>>
> >> Can we have real types instead of void pointers?
> >>
> > I just want perf generic codes have less dependency on KVM codes.
> >
>
> One way to do that and retain type safety is to have
>
> struct perf_client {
> struct perf_client_ops *ops;
> ...
> }
>
> The client (kvm) can do
>
> struct kvm_perf_client {
> struct perf_client pc;
> // kvm specific stuff
> };
>
> the callbacks receive struct perf_client and use container_of to reach
> the kvm_perf_client that contains it.
Let me double check it.

>
> >>> + /*
> >>> + * 2) Used in guest kernel and points to guest_perf_shadow which
> >>> + * is used as a communication area with host kernel. Host kernel
> >>> + * copies overflow data to it when an event overflows.
> >>> + */
> >>> + void *guest_perf_shadow;
> >>>
> >>>
> >> It's strange to see both guest and host parts in the same patch.
> >> Splitting to separate patches will really help review.
> >>
> > It's a little hard to split the patches if they change the same file. Perhaps
> > I could add more statements before the patch when I send it out.
> >
>
> With git, it's easy (once you're used to it):
>
> # go back one commit:
> git reset HEAD^
> # selectively add bits:
> git add -p
> # commit first patch
> git commit -s
> # selectively add bits:
> git add -p
> # commit second patch
> git commit -s
Thanks for your teaching.

>
>
> >>> @@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
> >>> if (ctx&& ctx->nr_events&& ctx->nr_events != ctx->nr_active)
> >>> rotate = 1;
> >>>
> >>> - perf_ctx_adjust_freq(&cpuctx->ctx);
> >>> - if (ctx)
> >>> - perf_ctx_adjust_freq(ctx);
> >>> +#ifdef CONFIG_KVM_PERF
> >>> + if (kvm_para_available()) {
> >>> + /*
> >>> + * perf_ctx_adjust_freq causes lots of pmu->read which would
> >>> + * trigger too many vmexit to host kernel. We disable it
> >>> + * under para virt situation
> >>> + */
> >>> + adjust_freq = 0;
> >>> + }
> >>> +#endif
> >>>
> >>>
> >> Perhaps we can have a batch read interface which will read many counters
> >> at once.
> >>
> > It's a good idea. But that will touch many perf generic codes which causes it's hard
> > to maintain or follow future changes.
> >
>
> I'm talking about the guest/host interface. So you have one vmexit and
> many host perf calls.
I understood what you were speaking. I mean, perf generic codes operate perf_event
one by one. At low layer, we just know one perf_event before calling hypercall to
vmexit to host kernel.


2010-06-22 09:41:38

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH V2 2/5] ara virt interface of perf to support kvm guest os statistics collection in guest os

On 06/22/2010 12:25 PM, Zhang, Yanmin wrote:
>
>> I'm talking about the guest/host interface. So you have one vmexit and
>> many host perf calls.
>>
> I understood what you were speaking. I mean, perf generic codes operate perf_event
> one by one. At low layer, we just know one perf_event before calling hypercall to
> vmexit to host kernel.
>

Ah.

We might fix that by having the perf ops work on guest memory, and add a
commit that transfers them to the host. But this is complicated and can
be left for later.

--
error compiling committee.c: too many arguments to function