Subject: Re: [RFC] perf_events: how to add Intel LBR support
From: Stephane Eranian
To: Peter Zijlstra
Cc: linux-kernel@vger.kernel.org, mingo@elte.hu, paulus@samba.org,
    davem@davemloft.net, fweisbec@gmail.com, robert.richter@amd.com,
    perfmon2-devel@lists.sf.net, eranian@gmail.com
Date: Mon, 22 Feb 2010 15:07:38 +0100
In-Reply-To: <1266531936.2903.58.camel@laptop>
References: <1266142321.5273.409.camel@laptop> <1266531936.2903.58.camel@laptop>

Hi,

On Thu, Feb 18, 2010 at 11:25 PM, Peter Zijlstra wrote:
> On Sun, 2010-02-14 at 11:12 +0100, Peter Zijlstra wrote:
>>
>> Dealing with context switches is also going to be tricky, where we have
>> to save and 'restore' LBR stacks for per-task counters.
>
> OK, so I poked at the LBR hardware a bit, sadly the TOS really doesn't
> count beyond the few bits it requires :-(

The TOS is also a read-only MSR.

> I had hopes it would, since that would make it easier to share the LBR,
> simply take a TOS snapshot when you schedule the counter in, and never
> roll back further for that particular counter.
>
> As it stands we'll have to wipe the full LBR state every time we 'touch'
> it, which makes it less useful for cpu-bound counters.

Yes, you need to clean it up each time you snapshot it and each time you
restore it. The patch does not seem to handle LBR context switches.
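Roughly, I would expect something like the following on the context switch
path (a sketch only: the function names are made up, the lbr_* fields are
the ones from your patch, and since the TOS is read-only the "restore" can
only be a wipe):

static void intel_pmu_lbr_swap_out(struct cpu_hw_events *cpuc,
                                   struct perf_event *event)
{
        /*
         * Snapshot the LBR stack for the outgoing per-task event,
         * so pending entries are not lost across the switch.
         */
        intel_pmu_read_lbr(cpuc, event);
}

static void intel_pmu_lbr_swap_in(struct cpu_hw_events *cpuc)
{
        int i;

        /*
         * The LBR MSRs cannot be restored to the incoming task's
         * previous content (TOS is read-only), so wipe the stale
         * entries of the outgoing task and collect from scratch.
         */
        for (i = 0; i < x86_pmu.lbr_nr; i++) {
                wrmsrl(x86_pmu.lbr_from + i, 0);
                if (x86_pmu.lbr_to)
                        wrmsrl(x86_pmu.lbr_to + i, 0);
        }
        cpuc->lbr_entries = 0;
}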
> Also, not all hw (core and pentium-m) supports the freeze_lbrs_on_pmi
> bit, what we could do for those is stick an unconditional LBR disable
> very early in the NMI path and simply roll back the stack until we hit a
> branch into the NMI vector, that should leave a few usable LBR entries.

You need to be consistent across the CPUs. If a CPU does not provide
freeze_on_pmi, then I would simply not support it as a first approach.
Same thing if the LBR is less than 4-deep. I don't think you'll get
anything useful out of it.

> For AMD and P6 there is only a single LBR record, AMD seems to freeze
> the thing on #DB traps but the PMI isn't qualified as one afaict,
> rendering the single entry useless (didn't look at the P6 details).
>
> hackery below..

The patch does not address the configuration options available on Intel
Nehalem/Westmere, i.e., LBR_SELECT (see Vol 3a table 16-9). We can handle
the priv level separately, as it can be derived from the event's exclude_*
fields. But if you want to allow multiple events in a group to use
PERF_SAMPLE_LBR, then you need to ensure LBR_SELECT is set to the same
value for all of them, priv levels included. Furthermore, LBR_SELECT is
shared between HT threads. We need to either add another field in
perf_event_attr or encode this in the config field, though that is ugly
because it is unrelated to the event but rather to the sample_type.
(I sketch one possible encoding at the end of this mail.)

The patch is also missing the sampling part, i.e., the dump of the LBR
(in sequential order) into the sampling buffer; see the sketch just below.

I would also pick a better name than PERF_SAMPLE_LBR. LBR is an Intel
thing. Maybe PERF_SAMPLE_TAKEN_BRANCH.
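For that dump, a minimal sketch (untested; modeled on how
PERF_SAMPLE_CALLCHAIN is written out, and assuming it is called from the
overflow handler between perf_output_begin()/perf_output_end()):

static void intel_pmu_output_lbr(struct perf_output_handle *handle,
                                 struct cpu_hw_events *cpuc)
{
        u64 nr = cpuc->lbr_entries;
        int i;

        /* leading count, as for PERF_SAMPLE_CALLCHAIN */
        perf_output_put(handle, nr);

        /* entries in sequential order, as filled by intel_pmu_read_lbr() */
        for (i = 0; i < cpuc->lbr_entries; i++)
                perf_output_put(handle, cpuc->lbr_stack[i]);
}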
> ---
>  arch/x86/include/asm/perf_event.h |   24 +++
>  arch/x86/kernel/cpu/perf_event.c  |  233 +++++++++++++++++++++++++++++++++---
>  arch/x86/kernel/traps.c           |    3
>  include/linux/perf_event.h        |    7 -
>  4 files changed, 251 insertions(+), 16 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/cpu/perf_event.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/perf_event.c
> +++ linux-2.6/arch/x86/kernel/cpu/perf_event.c
> @@ -104,6 +104,10 @@ struct amd_nb {
>        struct event_constraint event_constraints[X86_PMC_IDX_MAX];
>  };
>
> +struct lbr_entry {
> +       u64 from, to, flags;
> +};
> +
>  struct cpu_hw_events {
>        struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
>        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> @@ -117,6 +121,10 @@ struct cpu_hw_events {
>        u64                     tags[X86_PMC_IDX_MAX];
>        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
>        struct amd_nb           *amd_nb;
> +
> +       int                     lbr_users;
> +       int                     lbr_entries;
> +       struct lbr_entry        lbr_stack[16];
>  };
>
>  #define __EVENT_CONSTRAINT(c, n, m, w) {\
> @@ -187,6 +195,19 @@ struct x86_pmu {
>        void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
>                                                 struct perf_event *event);
>        struct event_constraint *event_constraints;
> +
> +       unsigned long   lbr_tos;
> +       unsigned long   lbr_from, lbr_to;
> +       int             lbr_nr;
> +       int             lbr_ctl;
> +       int             lbr_format;
> +};
> +
> +enum {
> +       LBR_FORMAT_32           = 0x00,
> +       LBR_FORMAT_LIP          = 0x01,
> +       LBR_FORMAT_EIP          = 0x02,
> +       LBR_FORMAT_EIP_FLAGS    = 0x03,
>  };
>
>  static struct x86_pmu x86_pmu __read_mostly;
> @@ -1203,6 +1224,52 @@ static void intel_pmu_disable_bts(void)
>        update_debugctlmsr(debugctlmsr);
>  }
>
> +static void __intel_pmu_enable_lbr(void)
> +{
> +       u64 debugctl;
> +
> +       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +       debugctl |= x86_pmu.lbr_ctl;
> +       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +}
> +
> +static void intel_pmu_enable_lbr(void)
> +{
> +       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +
> +       if (!x86_pmu.lbr_nr)
> +               return;
> +
> +       if (!cpuc->lbr_users)
> +               __intel_pmu_enable_lbr();
> +
> +       cpuc->lbr_users++;
> +}
> +
> +static void __intel_pmu_disable_lbr(void)
> +{
> +       u64 debugctl;
> +
> +       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +       debugctl &= ~x86_pmu.lbr_ctl;
> +       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +}
> +
> +static void intel_pmu_disable_lbr(void)
> +{
> +       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +
> +       if (!x86_pmu.lbr_nr)
> +               return;
> +
> +       cpuc->lbr_users--;
> +
> +       BUG_ON(cpuc->lbr_users < 0);
> +
> +       if (!cpuc->lbr_users)
> +               __intel_pmu_disable_lbr();
> +}
> +
>  static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
>  {
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> @@ -1402,6 +1469,9 @@ void hw_perf_disable(void)
>        cpuc->enabled = 0;
>        barrier();
>
> +       if (cpuc->lbr_users)
> +               __intel_pmu_disable_lbr();
> +
>        x86_pmu.disable_all();
>  }
>
> @@ -1703,6 +1773,10 @@ void hw_perf_enable(void)
>        barrier();
>
>        x86_pmu.enable_all();
> +
> +       // XXX
> +       if (cpuc->lbr_users == 1)
> +               __intel_pmu_enable_lbr();
>  }
>
>  static inline u64 intel_pmu_get_status(void)
> @@ -2094,7 +2168,6 @@ static void intel_pmu_drain_pebs_core(st
>        struct perf_event_header header;
>        struct perf_sample_data data;
>        struct pt_regs regs;
> -       u64
>
>        if (!event || !ds || !x86_pmu.pebs)
>                return;
> @@ -2114,7 +2187,7 @@ static void intel_pmu_drain_pebs_core(st
>
>        perf_prepare_sample(&header, &data, event, &regs);
>
> -       event.hw.interrupts += (top - at);
> +       event->hw.interrupts += (top - at);
>        atomic64_add((top - at) * event->hw.last_period, &event->count);
>
>        if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
> @@ -2188,6 +2261,84 @@ static void intel_pmu_drain_pebs_nhm(str
>        }
>  }
>
> +static inline u64 intel_pmu_lbr_tos(void)
> +{
> +       u64 tos;
> +
> +       rdmsrl(x86_pmu.lbr_tos, tos);
> +       return tos;
> +}
> +
> +static void
> +intel_pmu_read_lbr_32(struct cpu_hw_events *cpuc, struct perf_event *event)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       unsigned long mask = x86_pmu.lbr_nr - 1;
> +       u64 tos = intel_pmu_lbr_tos();
> +       int i;
> +
> +       for (i = 0; tos > hwc->lbr_tos && i < x86_pmu.lbr_nr; i++, tos--) {
> +               unsigned long lbr_idx = (tos - i) & mask;
> +               union {
> +                       struct {
> +                               u32 from;
> +                               u32 to;
> +                       };
> +                       u64     lbr;
> +               } msr_lastbranch;
> +
> +               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
> +
> +               cpuc->lbr_stack[i].from  = msr_lastbranch.from;
> +               cpuc->lbr_stack[i].to    = msr_lastbranch.to;
> +               cpuc->lbr_stack[i].flags = 0;
> +       }
> +       cpuc->lbr_entries = i;
> +}
> +
> +#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
> +
> +/*
> + * Due to lack of segmentation in Linux the effective address (offset)
> + * is the same as the linear address, allowing us to merge the LIP and EIP
> + * LBR formats.
> + */
> +static void
> +intel_pmu_read_lbr_64(struct cpu_hw_events *cpuc, struct perf_event *event)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       unsigned long mask = x86_pmu.lbr_nr - 1;
> +       u64 tos = intel_pmu_lbr_tos();
> +       int i;
> +
> +       for (i = 0; tos > hwc->lbr_tos && i < x86_pmu.lbr_nr; i++, tos--) {
> +               unsigned long lbr_idx = (tos - i) & mask;
> +               u64 from, to, flags = 0;
> +
> +               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
> +               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
> +
> +               if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
> +                       flags = !!(from & LBR_FROM_FLAG_MISPRED);
> +                       from = (u64)((((s64)from) << 1) >> 1);
> +               }
> +
> +               cpuc->lbr_stack[i].from  = from;
> +               cpuc->lbr_stack[i].to    = to;
> +               cpuc->lbr_stack[i].flags = flags;
> +       }
> +       cpuc->lbr_entries = i;
> +}
> +
> +static void
> +intel_pmu_read_lbr(struct cpu_hw_events *cpuc, struct perf_event *event)
> +{
> +       if (x86_pmu.lbr_format == LBR_FORMAT_32)
> +               intel_pmu_read_lbr_32(cpuc, event);
> +       else
> +               intel_pmu_read_lbr_64(cpuc, event);
> +}
> +
>  static void x86_pmu_stop(struct perf_event *event)
>  {
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> @@ -2456,11 +2607,26 @@ perf_event_nmi_handler(struct notifier_b
>         * If the first NMI handles both, the latter will be empty and daze
>         * the CPU.
>         */
> +       trace_printk("LBR TOS: %Ld\n", intel_pmu_lbr_tos());
>        x86_pmu.handle_irq(regs);
>
>        return NOTIFY_STOP;
>  }
>
> +static __read_mostly struct notifier_block perf_event_nmi_notifier = {
> +       .notifier_call          = perf_event_nmi_handler,
> +       .next                   = NULL,
> +       .priority               = 1
> +};
> +
> +void perf_nmi_exit(void)
> +{
> +       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +
> +       if (cpuc->lbr_users)
> +               __intel_pmu_enable_lbr();
> +}
> +
>  static struct event_constraint unconstrained;  /* can schedule */
>  static struct event_constraint null_constraint; /* can't schedule */
>  static struct event_constraint bts_constraint =
> @@ -2761,12 +2927,6 @@ undo:
>        return ret;
>  }
>
> -static __read_mostly struct notifier_block perf_event_nmi_notifier = {
> -       .notifier_call          = perf_event_nmi_handler,
> -       .next                   = NULL,
> -       .priority               = 1
> -};
> -
>  static __initconst struct x86_pmu p6_pmu = {
>        .name                   = "p6",
>        .handle_irq             = x86_pmu_handle_irq,
> @@ -2793,7 +2953,7 @@ static __initconst struct x86_pmu p6_pmu
>        .event_bits             = 32,
>        .event_mask             = (1ULL << 32) - 1,
>        .get_event_constraints  = intel_get_event_constraints,
> -       .event_constraints      = intel_p6_event_constraints
> +       .event_constraints      = intel_p6_event_constraints,
>  };
>
>  static __initconst struct x86_pmu core_pmu = {
> @@ -2873,18 +3033,26 @@ static __init int p6_pmu_init(void)
>        case 7:
>        case 8:
>        case 11: /* Pentium III */
> +               x86_pmu = p6_pmu;
> +
> +               break;
>        case 9:
> -       case 13:
> -               /* Pentium M */
> +       case 13: /* Pentium M */
> +               x86_pmu = p6_pmu;
> +
> +               x86_pmu.lbr_nr = 8;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR;
> +               x86_pmu.lbr_from = 0x40;
> +
>                break;
> +
>        default:
>                pr_cont("unsupported p6 CPU model %d ",
>                        boot_cpu_data.x86_model);
>                return -ENODEV;
>        }
>
> -       x86_pmu = p6_pmu;
> -
>        return 0;
>  }
>
> @@ -2925,6 +3093,9 @@ static __init int intel_pmu_init(void)
>        x86_pmu.event_bits              = eax.split.bit_width;
>        x86_pmu.event_mask              = (1ULL << eax.split.bit_width) - 1;
>
> +       rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
> +       x86_pmu.lbr_format = capabilities & 0x1f;
> +
>        /*
>         * Quirk: v2 perfmon does not report fixed-purpose events, so
>         * assume at least 3 events:
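Note that `capabilities` does not appear to be declared anywhere in this
hunk. Also, IA32_PERF_CAPABILITIES only exists when CPUID.1:ECX.PDCM is
set, so I would guard the read, e.g. (sketch):

        u64 capabilities;

        if (boot_cpu_has(X86_FEATURE_PDCM)) {
                rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
                x86_pmu.lbr_format = capabilities & 0x1f;
        }

Otherwise CPUs without PDCM will #GP on the rdmsrl().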
> @@ -2973,6 +3144,10 @@ no_datastore:
>         */
>        switch (boot_cpu_data.x86_model) {
>        case 14: /* 65 nm core solo/duo, "Yonah" */
> +               x86_pmu.lbr_nr = 8;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR;
> +               x86_pmu.lbr_from = 0x40;
>                pr_cont("Core events, ");
>                break;
>
> @@ -2980,6 +3155,13 @@ no_datastore:
>        case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
>        case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
>        case 29: /* six-core 45 nm xeon "Dunnington" */
> +               x86_pmu.lbr_nr = 4;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR |
> +                                 X86_DEBUGCTL_FREEZE_LBRS_ON_PMI;
> +               x86_pmu.lbr_from = 0x40;
> +               x86_pmu.lbr_to = 0x60;
> +
>                memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
>                       sizeof(hw_cache_event_ids));
>
> @@ -2989,13 +3171,28 @@ no_datastore:
>
>        case 26: /* 45 nm nehalem, "Bloomfield" */
>        case 30: /* 45 nm nehalem, "Lynnfield" */
> +               x86_pmu.lbr_nr = 16;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR |
> +                                 X86_DEBUGCTL_FREEZE_LBRS_ON_PMI;
> +               x86_pmu.lbr_from = 0x680;
> +               x86_pmu.lbr_to = 0x6c0;
> +
>                memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
>                       sizeof(hw_cache_event_ids));
>
>                x86_pmu.event_constraints = intel_nehalem_event_constraints;
>                pr_cont("Nehalem/Corei7 events, ");
>                break;
> -       case 28:
> +
> +       case 28: /* Atom */
> +               x86_pmu.lbr_nr = 8;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR |
> +                                 X86_DEBUGCTL_FREEZE_LBRS_ON_PMI;
> +               x86_pmu.lbr_from = 0x40;
> +               x86_pmu.lbr_to = 0x60;
> +
>                memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
>                       sizeof(hw_cache_event_ids));
>
> @@ -3005,12 +3202,20 @@ no_datastore:
>
>        case 37: /* 32 nm nehalem, "Clarkdale" */
>        case 44: /* 32 nm nehalem, "Gulftown" */
> +               x86_pmu.lbr_nr = 16;
> +               x86_pmu.lbr_tos = 0x01c9;
> +               x86_pmu.lbr_ctl = X86_DEBUGCTL_LBR |
> +                                 X86_DEBUGCTL_FREEZE_LBRS_ON_PMI;
> +               x86_pmu.lbr_from = 0x680;
> +               x86_pmu.lbr_to = 0x6c0;
> +
>                memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
>                       sizeof(hw_cache_event_ids));
>
>                x86_pmu.event_constraints = intel_westmere_event_constraints;
>                pr_cont("Westmere events, ");
>                break;
> +
>        default:
>                /*
>                 * default constraints for v2 and up
> Index: linux-2.6/arch/x86/include/asm/perf_event.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/perf_event.h
> +++ linux-2.6/arch/x86/include/asm/perf_event.h
> @@ -1,6 +1,8 @@
>  #ifndef _ASM_X86_PERF_EVENT_H
>  #define _ASM_X86_PERF_EVENT_H
>
> +#include <asm/msr.h>
> +
>  /*
>   * Performance event hw details:
>   */
> @@ -122,11 +124,31 @@ union cpuid10_edx {
>  extern void init_hw_perf_events(void);
>  extern void perf_events_lapic_init(void);
>
> +#define X86_DEBUGCTL_LBR               (1 << 0)
> +#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI        (1 << 11)
> +
> +static __always_inline void perf_nmi_enter(void)
> +{
> +       u64 debugctl;
> +
> +       /*
> +        * Unconditionally disable LBR so as to minimally pollute the LBR stack.
> +        * XXX: paravirt will screw us over massively
> +        */
> +       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +       debugctl &= ~X86_DEBUGCTL_LBR;
> +       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +}
> +
> +extern void perf_nmi_exit(void);
> +
>  #define PERF_EVENT_INDEX_OFFSET                        0
>
>  #else
>  static inline void init_hw_perf_events(void)           { }
> -static inline void perf_events_lapic_init(void)        { }
> +static inline void perf_events_lapic_init(void)                { }
> +static inline void perf_nmi_enter(void)                        { }
> +static inline void perf_nmi_exit(void)                 { }
>  #endif
>
>  #endif /* _ASM_X86_PERF_EVENT_H */
> Index: linux-2.6/arch/x86/kernel/traps.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/traps.c
> +++ linux-2.6/arch/x86/kernel/traps.c
> @@ -45,6 +45,7 @@
>  #endif
>
>  #include <asm/kmemcheck.h>
> +#include <asm/perf_event.h>
>  #include <asm/stacktrace.h>
>  #include <asm/processor.h>
>  #include <asm/debugreg.h>
> @@ -442,6 +443,7 @@ static notrace __kprobes void default_do
>  dotraplinkage notrace __kprobes void
>  do_nmi(struct pt_regs *regs, long error_code)
>  {
> +       perf_nmi_enter();
>        nmi_enter();
>
>        inc_irq_stat(__nmi_count);
> @@ -450,6 +452,7 @@ do_nmi(struct pt_regs *regs, long error_
>                default_do_nmi(regs);
>
>        nmi_exit();
> +       perf_nmi_exit();
>  }
>
>  void stop_nmi(void)
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -125,8 +125,9 @@ enum perf_event_sample_format {
>        PERF_SAMPLE_PERIOD                      = 1U << 8,
>        PERF_SAMPLE_STREAM_ID                   = 1U << 9,
>        PERF_SAMPLE_RAW                         = 1U << 10,
> +       PERF_SAMPLE_LBR                         = 1U << 11,
>
> -       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
> +       PERF_SAMPLE_MAX = 1U << 12,             /* non-ABI */
>  };
>
>  /*
> @@ -396,6 +397,9 @@ enum perf_event_type {
>         *      { u64                   nr,
>         *        u64                   ips[nr];  } && PERF_SAMPLE_CALLCHAIN
>         *
> +        *      { u64                   nr;
> +        *        struct lbr_format     lbr[nr];  } && PERF_SAMPLE_LBR
> +        *
>         *      #
>         *      # The RAW record below is opaque data wrt the ABI
>         *      #
> @@ -483,6 +487,7 @@ struct hw_perf_event {
>                        int             idx;
>                        int             last_cpu;
>                        int             pebs;
> +                       u64             lbr_tos;
>                };
>                struct { /* software */
>                        s64             remaining;
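As for the LBR_SELECT encoding I mentioned above, one possible direction
(purely hypothetical names, just to make the idea concrete) would be a
dedicated perf_event_attr field, e.g. a __u64 branch_sample_type, rather
than overloading attr::config:

/*
 * Sketch only: model-independent taken-branch filter flags, mapped by
 * the x86 code onto LBR_SELECT. The kernel would have to reject groups
 * whose members request conflicting filters, since LBR_SELECT is shared
 * between HT threads. All names are made up.
 */
enum perf_branch_sample_type {
        PERF_SAMPLE_BRANCH_USER         = 1U << 0, /* user branches */
        PERF_SAMPLE_BRANCH_KERNEL       = 1U << 1, /* kernel branches */
        PERF_SAMPLE_BRANCH_ANY_CALL     = 1U << 2, /* any call branch */
        PERF_SAMPLE_BRANCH_ANY_RETURN   = 1U << 3, /* near returns */
        PERF_SAMPLE_BRANCH_IND_CALL     = 1U << 4, /* indirect calls */
};

The priv-level bits could then default to the event's exclude_user and
exclude_kernel settings when none are specified.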