From: Stephane Eranian
To: "Yan, Zheng"
Cc: LKML, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Andi Kleen
Subject: Re: [PATCH 03/14] perf, x86: use context switch callback to flush LBR stack
Date: Wed, 5 Feb 2014 17:34:08 +0100
In-Reply-To: <1388728091-18564-4-git-send-email-zheng.z.yan@intel.com>
References: <1388728091-18564-1-git-send-email-zheng.z.yan@intel.com>
 <1388728091-18564-4-git-send-email-zheng.z.yan@intel.com>

On Fri, Jan 3, 2014 at 6:48 AM, Yan, Zheng wrote:
> Enable the pmu context switch callback when LBR is used. Use the
> callback to flush LBR stack when task is scheduled in. This allows
> us to move code that flushes LBR stack from perf core to perf x86.
>
You forgot to remove perf_event_context.nr_branch_stack, which does not
appear to be needed anymore; see the suggested cleanup hunk after the
quoted patch below.

> Signed-off-by: Yan, Zheng
> ---
>  arch/x86/kernel/cpu/perf_event.c           |  7 ---
>  arch/x86/kernel/cpu/perf_event.h           |  2 -
>  arch/x86/kernel/cpu/perf_event_intel.c     | 14 +----
>  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 32 ++++++++-----
>  include/linux/perf_event.h                 |  5 ---
>  kernel/events/core.c                       | 72 ------------------------
>  6 files changed, 21 insertions(+), 111 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 6703d17..69e2095 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -1852,12 +1852,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
>                  x86_pmu.sched_task(ctx, sched_in);
>  }
>
> -static void x86_pmu_flush_branch_stack(void)
> -{
> -        if (x86_pmu.flush_branch_stack)
> -                x86_pmu.flush_branch_stack();
> -}
> -
>  void perf_check_microcode(void)
>  {
>          if (x86_pmu.check_microcode)
> @@ -1884,7 +1878,6 @@ static struct pmu pmu = {
>          .commit_txn             = x86_pmu_commit_txn,
>
>          .event_idx              = x86_pmu_event_idx,
> -        .flush_branch_stack     = x86_pmu_flush_branch_stack,
>          .sched_task             = x86_pmu_sched_task,
>  };
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 3fdb751..80b8e83 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -150,7 +150,6 @@ struct cpu_hw_events {
>           * Intel LBR bits
>           */
>          int                             lbr_users;
> -        void                            *lbr_context;
>          struct perf_branch_stack        lbr_stack;
>          struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
>          struct er_account               *lbr_sel;
> @@ -416,7 +415,6 @@ struct x86_pmu {
>          void            (*cpu_dead)(int cpu);
>
>          void            (*check_microcode)(void);
> -        void            (*flush_branch_stack)(void);
>          void            (*sched_task)(struct perf_event_context *ctx,
>                                        bool sched_in);
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 0fa4f24..4325bae 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2038,18 +2038,6 @@ static void intel_pmu_cpu_dying(int cpu)
>          fini_debug_store_on_cpu(cpu);
>  }
>
> -static void intel_pmu_flush_branch_stack(void)
> -{
> -        /*
> -         * Intel LBR does not tag entries with the
> -         * PID of the current task, then we need to
> -         * flush it on ctxsw
> -         * For now, we simply reset it
> -         */
> -        if (x86_pmu.lbr_nr)
> -                intel_pmu_lbr_reset();
> -}
> -
>  PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
>
>  PMU_FORMAT_ATTR(ldlat, "config1:0-15");
> @@ -2101,7 +2089,7 @@ static __initconst const struct x86_pmu intel_pmu = {
>          .cpu_starting           = intel_pmu_cpu_starting,
>          .cpu_dying              = intel_pmu_cpu_dying,
>          .guest_get_msrs         = intel_guest_get_msrs,
> -        .flush_branch_stack     = intel_pmu_flush_branch_stack,
> +        .sched_task             = intel_pmu_lbr_sched_task,
>  };
>
>  static __init void intel_clovertown_quirk(void)
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 1ae2ec5..7ff2a99 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -177,24 +177,32 @@ void intel_pmu_lbr_reset(void)
>                  intel_pmu_lbr_reset_64();
>  }
>
> -void intel_pmu_lbr_enable(struct perf_event *event)
> +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
>  {
> -        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> -
>          if (!x86_pmu.lbr_nr)
>                  return;
>
>          /*
> -         * Reset the LBR stack if we changed task context to
> -         * avoid data leaks.
> +         * It is necessary to flush the stack on context switch. This happens
> +         * when the branch stack does not tag its entries with the pid of the
> +         * current task.
>           */
> -        if (event->ctx->task && cpuc->lbr_context != event->ctx) {
> +        if (sched_in)
>                  intel_pmu_lbr_reset();
> -                cpuc->lbr_context = event->ctx;
> -        }
> +}
> +
> +void intel_pmu_lbr_enable(struct perf_event *event)
> +{
> +        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +
> +        if (!x86_pmu.lbr_nr)
> +                return;
> +
>          cpuc->br_sel = event->hw.branch_reg.reg;
>
>          cpuc->lbr_users++;
> +        if (cpuc->lbr_users == 1)
> +                perf_sched_cb_enable(event->ctx->pmu);
>  }
>
>  void intel_pmu_lbr_disable(struct perf_event *event)
> @@ -207,10 +215,10 @@ void intel_pmu_lbr_disable(struct perf_event *event)
>          cpuc->lbr_users--;
>          WARN_ON_ONCE(cpuc->lbr_users < 0);
>
> -        if (cpuc->enabled && !cpuc->lbr_users) {
> -                __intel_pmu_lbr_disable();
> -                /* avoid stale pointer */
> -                cpuc->lbr_context = NULL;
> +        if (!cpuc->lbr_users) {
> +                perf_sched_cb_disable(event->ctx->pmu);
> +                if (cpuc->enabled)
> +                        __intel_pmu_lbr_disable();
>          }
>  }
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 6a3e603..96cb88b 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -248,11 +248,6 @@ struct pmu {
>          int (*event_idx)                (struct perf_event *event); /*optional */
>
>          /*
> -         * flush branch stack on context-switches (needed in cpu-wide mode)
> -         */
> -        void (*flush_branch_stack)      (void);
> -
> -        /*
>           * PMU callback for context-switches. optional
>           */
>          void (*sched_task)              (struct perf_event_context *ctx,
>                                           bool sched_in);
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index d110a23..aba4d6d 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -140,7 +140,6 @@ enum event_type_t {
>   */
>  struct static_key_deferred perf_sched_events __read_mostly;
>  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
>  static DEFINE_PER_CPU(int, perf_sched_cb_usages);
>
>  static atomic_t nr_mmap_events __read_mostly;
> @@ -2566,65 +2565,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>          perf_pmu_rotate_start(ctx->pmu);
>  }
>
> -/*
> - * When sampling the branck stack in system-wide, it may be necessary
> - * to flush the stack on context switch. This happens when the branch
> - * stack does not tag its entries with the pid of the current task.
> - * Otherwise it becomes impossible to associate a branch entry with a
> - * task. This ambiguity is more likely to appear when the branch stack
> - * supports priv level filtering and the user sets it to monitor only
> - * at the user level (which could be a useful measurement in system-wide
> - * mode). In that case, the risk is high of having a branch stack with
> - * branch from multiple tasks. Flushing may mean dropping the existing
> - * entries or stashing them somewhere in the PMU specific code layer.
> - *
> - * This function provides the context switch callback to the lower code
> - * layer. It is invoked ONLY when there is at least one system-wide context
> - * with at least one active event using taken branch sampling.
> - */
> -static void perf_branch_stack_sched_in(struct task_struct *prev,
> -                                       struct task_struct *task)
> -{
> -        struct perf_cpu_context *cpuctx;
> -        struct pmu *pmu;
> -        unsigned long flags;
> -
> -        /* no need to flush branch stack if not changing task */
> -        if (prev == task)
> -                return;
> -
> -        local_irq_save(flags);
> -
> -        rcu_read_lock();
> -
> -        list_for_each_entry_rcu(pmu, &pmus, entry) {
> -                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
> -
> -                /*
> -                 * check if the context has at least one
> -                 * event using PERF_SAMPLE_BRANCH_STACK
> -                 */
> -                if (cpuctx->ctx.nr_branch_stack > 0
> -                    && pmu->flush_branch_stack) {
> -
> -                        pmu = cpuctx->ctx.pmu;
> -
> -                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> -
> -                        perf_pmu_disable(pmu);
> -
> -                        pmu->flush_branch_stack();
> -
> -                        perf_pmu_enable(pmu);
> -
> -                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> -                }
> -        }
> -
> -        rcu_read_unlock();
> -
> -        local_irq_restore(flags);
> -}
>
>  /*
>   * Called from scheduler to add the events of the current task
> @@ -2658,10 +2598,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
>          if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
>                  perf_cgroup_sched_in(prev, task);
>
> -        /* check for system-wide branch_stack events */
> -        if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
> -                perf_branch_stack_sched_in(prev, task);
> -
>          if (__get_cpu_var(perf_sched_cb_usages))
>                  perf_pmu_sched_task(prev, task, true);
>  }
> @@ -3226,10 +3162,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
>          if (event->parent)
>                  return;
>
> -        if (has_branch_stack(event)) {
> -                if (!(event->attach_state & PERF_ATTACH_TASK))
> -                        atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
> -        }
>          if (is_cgroup_event(event))
>                  atomic_dec(&per_cpu(perf_cgroup_events, cpu));
>  }
> @@ -6655,10 +6587,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
>          if (event->parent)
>                  return;
>
> -        if (has_branch_stack(event)) {
> -                if (!(event->attach_state & PERF_ATTACH_TASK))
> -                        atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
> -        }
>          if (is_cgroup_event(event))
>                  atomic_inc(&per_cpu(perf_cgroup_events, cpu));
>  }
> --
> 1.8.4.2
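
For the nr_branch_stack cleanup, something along these lines on top of
this patch should be enough. This is an untested sketch from memory:
the hunk locations are approximate, and it assumes list_add_event() and
list_del_event() are the only remaining users of the field once this
series is applied.

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ struct perf_event_context {
         int                             nr_cgroups;      /* cgroup evts */
-        int                             nr_branch_stack; /* branch_stack evt count */
diff --git a/kernel/events/core.c b/kernel/events/core.c
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx)
-        if (has_branch_stack(event))
-                ctx->nr_branch_stack++;
@@ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx)
-        if (has_branch_stack(event))
-                ctx->nr_branch_stack--;

With that, nothing in the core counts branch stack events per context
anymore, which is the point of moving the flush down into the x86 code.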