Below is the kernel patch to enable perf to collect guest OS statistics.
Joerg,
Would you like to add support for SVM? I don't know the exact point at which
to trigger the NMI on the host with SVM.
See the code below for VMX:
+ kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
+ kvm_after_handle_nmi(&vmx->vcpu);
Signed-off-by: Zhang Yanmin <[email protected]>
---
diff -Nraup --exclude=tools linux-2.6_tip0413/arch/x86/include/asm/perf_event.h linux-2.6_tip0413_perfkvm/arch/x86/include/asm/perf_event.h
--- linux-2.6_tip0413/arch/x86/include/asm/perf_event.h 2010-04-14 11:11:03.992966568 +0800
+++ linux-2.6_tip0413_perfkvm/arch/x86/include/asm/perf_event.h 2010-04-14 11:13:17.261881591 +0800
@@ -135,17 +135,10 @@ extern void perf_events_lapic_init(void)
*/
#define PERF_EFLAGS_EXACT (1UL << 3)
-#define perf_misc_flags(regs) \
-({ int misc = 0; \
- if (user_mode(regs)) \
- misc |= PERF_RECORD_MISC_USER; \
- else \
- misc |= PERF_RECORD_MISC_KERNEL; \
- if (regs->flags & PERF_EFLAGS_EXACT) \
- misc |= PERF_RECORD_MISC_EXACT; \
- misc; })
-
-#define perf_instruction_pointer(regs) ((regs)->ip)
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
#else
static inline void init_hw_perf_events(void) { }
diff -Nraup --exclude=tools linux-2.6_tip0413/arch/x86/kernel/cpu/perf_event.c linux-2.6_tip0413_perfkvm/arch/x86/kernel/cpu/perf_event.c
--- linux-2.6_tip0413/arch/x86/kernel/cpu/perf_event.c 2010-04-14 11:11:04.825028810 +0800
+++ linux-2.6_tip0413_perfkvm/arch/x86/kernel/cpu/perf_event.c 2010-04-14 17:02:12.198063684 +0800
@@ -1720,6 +1720,11 @@ struct perf_callchain_entry *perf_callch
{
struct perf_callchain_entry *entry;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return NULL;
+ }
+
if (in_nmi())
entry = &__get_cpu_var(pmc_nmi_entry);
else
@@ -1743,3 +1748,30 @@ void perf_arch_fetch_caller_regs(struct
regs->cs = __KERNEL_CS;
local_save_flags(regs->flags);
}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ unsigned long ip;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+ ip = perf_guest_cbs->get_guest_ip();
+ else
+ ip = instruction_pointer(regs);
+ return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ int misc = 0;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ misc |= perf_guest_cbs->is_user_mode() ?
+ PERF_RECORD_MISC_GUEST_USER :
+ PERF_RECORD_MISC_GUEST_KERNEL;
+ } else
+ misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
+ PERF_RECORD_MISC_KERNEL;
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ misc |= PERF_RECORD_MISC_EXACT;
+
+ return misc;
+}
+
diff -Nraup --exclude=tools linux-2.6_tip0413/arch/x86/kvm/vmx.c linux-2.6_tip0413_perfkvm/arch/x86/kvm/vmx.c
--- linux-2.6_tip0413/arch/x86/kvm/vmx.c 2010-04-14 11:11:04.353024541 +0800
+++ linux-2.6_tip0413_perfkvm/arch/x86/kvm/vmx.c 2010-04-15 10:28:39.516891050 +0800
@@ -3654,8 +3654,11 @@ static void vmx_complete_interrupts(stru
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK))
+ (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
+ kvm_after_handle_nmi(&vmx->vcpu);
+ }
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
diff -Nraup --exclude=tools linux-2.6_tip0413/arch/x86/kvm/x86.c linux-2.6_tip0413_perfkvm/arch/x86/kvm/x86.c
--- linux-2.6_tip0413/arch/x86/kvm/x86.c 2010-04-14 11:11:04.341042024 +0800
+++ linux-2.6_tip0413_perfkvm/arch/x86/kvm/x86.c 2010-04-15 17:16:41.340064784 +0800
@@ -40,6 +40,7 @@
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/perf_event.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
@@ -3765,6 +3766,47 @@ static void kvm_timer_init(void)
}
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+ return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ int user_mode = 3;
+ if (percpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+ return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ unsigned long ip = 0;
+ if (percpu_read(current_vcpu))
+ ip = kvm_rip_read(percpu_read(current_vcpu));
+ return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -3801,6 +3843,8 @@ int kvm_arch_init(void *opaque)
kvm_timer_init();
+ perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
return 0;
out:
@@ -3809,6 +3853,8 @@ out:
void kvm_arch_exit(void)
{
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
diff -Nraup --exclude=tools linux-2.6_tip0413/arch/x86/kvm/x86.h linux-2.6_tip0413_perfkvm/arch/x86/kvm/x86.h
--- linux-2.6_tip0413/arch/x86/kvm/x86.h 2010-04-14 11:11:04.328996790 +0800
+++ linux-2.6_tip0413_perfkvm/arch/x86/kvm/x86.h 2010-04-15 10:27:57.116972433 +0800
@@ -65,4 +65,7 @@ static inline int is_paging(struct kvm_v
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+
#endif
diff -Nraup --exclude=tools linux-2.6_tip0413/include/linux/perf_event.h linux-2.6_tip0413_perfkvm/include/linux/perf_event.h
--- linux-2.6_tip0413/include/linux/perf_event.h 2010-04-14 11:11:16.922212684 +0800
+++ linux-2.6_tip0413_perfkvm/include/linux/perf_event.h 2010-04-14 11:34:33.478072738 +0800
@@ -288,11 +288,13 @@ struct perf_event_mmap_page {
__u64 data_tail; /* user-space written tail */
};
-#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_RECORD_MISC_KERNEL (1 << 0)
#define PERF_RECORD_MISC_USER (2 << 0)
#define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER (5 << 0)
#define PERF_RECORD_MISC_EXACT (1 << 14)
/*
@@ -446,6 +448,12 @@ enum perf_callchain_context {
# include <asm/perf_event.h>
#endif
+struct perf_guest_info_callbacks {
+ int (*is_in_guest) (void);
+ int (*is_user_mode) (void);
+ unsigned long (*get_guest_ip) (void);
+};
+
#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif
@@ -920,6 +928,12 @@ static inline void perf_event_mmap(struc
__perf_event_mmap(vma);
}
+extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern int perf_register_guest_info_callbacks(
+ struct perf_guest_info_callbacks *);
+extern int perf_unregister_guest_info_callbacks(
+ struct perf_guest_info_callbacks *);
+
extern void perf_event_comm(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
@@ -989,6 +1003,11 @@ perf_sw_event(u32 event_id, u64 nr, int
static inline void
perf_bp_event(struct perf_event *event, void *data) { }
+static inline int perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
+static inline int perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
+
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
static inline void perf_event_comm(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
diff -Nraup --exclude=tools linux-2.6_tip0413/kernel/perf_event.c linux-2.6_tip0413_perfkvm/kernel/perf_event.c
--- linux-2.6_tip0413/kernel/perf_event.c 2010-04-14 11:12:04.090770764 +0800
+++ linux-2.6_tip0413_perfkvm/kernel/perf_event.c 2010-04-14 11:13:17.265859229 +0800
@@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct
/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = cbs;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+/*
* Output
*/
static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
@@ -3748,7 +3769,7 @@ void __perf_event_mmap(struct vm_area_st
.event_id = {
.header = {
.type = PERF_RECORD_MMAP,
- .misc = 0,
+ .misc = PERF_RECORD_MISC_USER,
/* .size */
},
/* .pid */
Hi Yanmin,
On Fri, Apr 16, 2010 at 03:34:35PM +0800, Zhang, Yanmin wrote:
> Below is the kernel patch to enable perf to collect guest os statistics.
>
> Joerg,
>
> Would you like to add support on svm? I don't know the exact point to trigger
> NMI to host with svm.
Yes I will do that, thanks for all the work you have already done :-) Do
we have a branch for that work somewhere? Probably in the -tip tree?
Joerg
On 04/16/2010 10:34 AM, Zhang, Yanmin wrote:
> Below is the kernel patch to enable perf to collect guest os statistics.
>
> Joerg,
>
> Would you like to add support on svm? I don't know the exact point to trigger
> NMI to host with svm.
>
> See below code with vmx:
>
> + kvm_before_handle_nmi(&vmx->vcpu);
> asm("int $2");
> + kvm_after_handle_nmi(&vmx->vcpu);
>
> Signed-off-by: Zhang Yanmin<[email protected]>
>
Can you please split it further?
Patch 1 introduces perf_register_guest_info_callbacks() and related.
Ingo can merge this into a branch in tip.git.
Patch 2 is just the kvm bits, I'll apply that after merging the branch
with patch 1.
Patch 3 adds the tools/perf changes.
This way perf development can continue on tip.git, and kvm development
can continue on kvm.git, without the code bases diverging and requiring
a merge later.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
* Avi Kivity <[email protected]> wrote:
> On 04/16/2010 10:34 AM, Zhang, Yanmin wrote:
> >Below is the kernel patch to enable perf to collect guest os statistics.
> >
> >Joerg,
> >
> >Would you like to add support on svm? I don't know the exact point to trigger
> >NMI to host with svm.
> >
> >See below code with vmx:
> >
> >+ kvm_before_handle_nmi(&vmx->vcpu);
> > asm("int $2");
> >+ kvm_after_handle_nmi(&vmx->vcpu);
> >
> >Signed-off-by: Zhang Yanmin<[email protected]>
>
> Can you please split it further?
>
> Patch 1 introduces perf_register_guest_info_callbacks() and related. Ingo
> can merge this into a branch in tip.git. Patch 2 is just the kvm bits, I'll
> apply that after merging the branch with patch 1. Patch 3 adds the
> tools/perf changes.
>
> This way perf development can continue on tip.git, and kvm development can
> continue on kvm.git, without the code bases diverging and requiring a merge
> later.
I'd like to pull the KVM bits from you into perf - so that there's a testable
form of the changes. We can do that via a branch that has 1-2 changes, plus
minimal conflicts down the line, right?
Ingo
On 04/17/2010 10:13 PM, Ingo Molnar wrote:
> * Avi Kivity<[email protected]> wrote:
>
>
>> On 04/16/2010 10:34 AM, Zhang, Yanmin wrote:
>>
>>> Below is the kernel patch to enable perf to collect guest os statistics.
>>>
>>> Joerg,
>>>
>>> Would you like to add support on svm? I don't know the exact point to trigger
>>> NMI to host with svm.
>>>
>>> See below code with vmx:
>>>
>>> + kvm_before_handle_nmi(&vmx->vcpu);
>>> asm("int $2");
>>> + kvm_after_handle_nmi(&vmx->vcpu);
>>>
>>> Signed-off-by: Zhang Yanmin<[email protected]>
>>>
>> Can you please split it further?
>>
>> Patch 1 introduces perf_register_guest_info_callbacks() and related. Ingo
>> can merge this into a branch in tip.git. Patch 2 is just the kvm bits, I'll
>> apply that after merging the branch with patch 1. Patch 3 adds the
>> tools/perf changes.
>>
>> This way perf development can continue on tip.git, and kvm development can
>> continue on kvm.git, without the code bases diverging and requiring a merge
>> later.
>>
> I'd like to pull the KVM bits from you into perf - so that there's a testable
> form of the changes. We can do that via a branch that has 1-2 changes, plus
> minimal conflicts down the line, right?
>
We can try doing this (currently we don't, but this is simple enough
that we could). I'd still like 1-2 in two patches.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
* Avi Kivity <[email protected]> wrote:
> On 04/17/2010 10:13 PM, Ingo Molnar wrote:
> >* Avi Kivity<[email protected]> wrote:
> >
> >>On 04/16/2010 10:34 AM, Zhang, Yanmin wrote:
> >>>Below is the kernel patch to enable perf to collect guest os statistics.
> >>>
> >>>Joerg,
> >>>
> >>>Would you like to add support on svm? I don't know the exact point to trigger
> >>>NMI to host with svm.
> >>>
> >>>See below code with vmx:
> >>>
> >>>+ kvm_before_handle_nmi(&vmx->vcpu);
> >>> asm("int $2");
> >>>+ kvm_after_handle_nmi(&vmx->vcpu);
> >>>
> >>>Signed-off-by: Zhang Yanmin<[email protected]>
> >>Can you please split it further?
> >>
> >>Patch 1 introduces perf_register_guest_info_callbacks() and related. Ingo
> >>can merge this into a branch in tip.git. Patch 2 is just the kvm bits, I'll
> >>apply that after merging the branch with patch 1. Patch 3 adds the
> >>tools/perf changes.
> >>
> >>This way perf development can continue on tip.git, and kvm development can
> >>continue on kvm.git, without the code bases diverging and requiring a merge
> >>later.
> >I'd like to pull the KVM bits from you into perf - so that there's a testable
> >form of the changes. We can do that via a branch that has 1-2 changes, plus
> >minimal conflicts down the line, right?
>
> We can try doing this (currently we don't, but this is simple enough that we
> could).
Thanks.
> [...] I'd still like 1-2 in two patches.
Sure.
Ingo