Date: Tue, 26 May 2015 02:37:52 -0700
From: Stephane Eranian
To: Peter Zijlstra
Cc: Ingo Molnar, Vince Weaver, Jiri Olsa, "Liang, Kan", LKML
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Fri, May 22, 2015 at 6:29 AM, Peter Zijlstra wrote:
> The (SNB/IVB/HSW) HT bug only affects events that can be programmed
> onto GP counters, therefore we should only limit the number of GP
> counters that can be used per cpu -- iow we should not constrain the
> FP counters.
>
> Furthermore, we should only enforce such a limit when there are in fact
> exclusive events being scheduled on either sibling.
>
> Reported-by: Vince Weaver
> Signed-off-by: Peter Zijlstra (Intel)
> ---
>  arch/x86/kernel/cpu/perf_event.c              | 36 +++++++++++++++++++++-----
>  arch/x86/kernel/cpu/perf_event.h              | 11 +++++--
>  arch/x86/kernel/cpu/perf_event_intel.c        | 30 +++++++--------------
>  arch/x86/kernel/cpu/perf_event_intel_uncore.c |  2 -
>  4 files changed, 49 insertions(+), 30 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -611,6 +611,7 @@ struct sched_state {
>  	int	event;		/* event index */
>  	int	counter;	/* counter index */
>  	int	unassigned;	/* number of events to be assigned left */
> +	int	nr_gp;		/* number of GP counters used */
>  	unsigned long	used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
>  };
>
> @@ -620,9 +621,10 @@ struct sched_state {
>  struct perf_sched {
>  	int			max_weight;
>  	int			max_events;
> +	int			max_gp;
> +	int			saved_states;
>  	struct event_constraint	**constraints;
>  	struct sched_state	state;
> -	int			saved_states;
>  	struct sched_state	saved[SCHED_STATES_MAX];
>  };
>
> @@ -630,13 +632,14 @@ struct perf_sched {
>   * Initialize interator that runs through all events and counters.
>   */
>  static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
> -			    int num, int wmin, int wmax)
> +			    int num, int wmin, int wmax, int gpmax)
>  {
>  	int idx;
>
>  	memset(sched, 0, sizeof(*sched));
>  	sched->max_events	= num;
>  	sched->max_weight	= wmax;
> +	sched->max_gp		= gpmax;
>  	sched->constraints	= constraints;
>
>  	for (idx = 0; idx < num; idx++) {
> @@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(st
>  			goto done;
>  		}
>  	}
> +
>  	/* Grab the first unused counter starting with idx */
>  	idx = sched->state.counter;
>  	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> -		if (!__test_and_set_bit(idx, sched->state.used))
> +		if (!__test_and_set_bit(idx, sched->state.used)) {
> +			if (sched->state.nr_gp++ >= sched->max_gp)
> +				return false;
> +
>  			goto done;
> +		}
>  	}
>
>  	return false;
> @@ -757,11 +765,11 @@ static bool perf_sched_next_event(struct
>  /*
>   * Assign a counter for each event.
>   */
>  int perf_assign_events(struct event_constraint **constraints, int n,
> -			int wmin, int wmax, int *assign)
> +			int wmin, int wmax, int gpmax, int *assign)
>  {
>  	struct perf_sched sched;
>
> -	perf_sched_init(&sched, constraints, n, wmin, wmax);
> +	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
>
>  	do {
>  		if (!perf_sched_find_counter(&sched))
> @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_ev
>
>  	/* slow path */
>  	if (i != n) {
> +		int gpmax = x86_pmu.num_counters;
> +
> +		/*
> +		 * Do not allow scheduling of more than half the available
> +		 * generic counters.
> +		 *
> +		 * This helps avoid counter starvation of sibling thread by
> +		 * ensuring at most half the counters cannot be in exclusive
> +		 * mode. There is no designated counters for the limits. Any
> +		 * N/2 counters can be used. This helps with events with
> +		 * specific counter constraints.
> +		 */
> +		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> +		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> +			gpmax /= 2;
> +

What I don't like about this part is that it is a hack to work around a
bug on a limited set of Intel CPUs, yet it sits in the middle of generic
x86 code. I understand it is inoperative on the AMD PMU and is not used
by Intel uncore, and on KNC or P6 you will not have
is_ht_workaround_enabled() either. Could this be made an x86_pmu
callback, e.g. x86_pmu.counter_limit()?
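
Something along these lines, perhaps (a completely untested sketch; the
counter_limit member and the intel_counter_limit() helper are made-up
names, just to illustrate the idea):

	/* in struct x86_pmu (perf_event.h): */
	int		(*counter_limit)(struct cpu_hw_events *cpuc);

	/* Intel-only implementation, installed only on affected PMUs: */
	static int intel_counter_limit(struct cpu_hw_events *cpuc)
	{
		/* halve the GP counters only while a sibling has exclusive events */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			return x86_pmu.num_counters / 2;

		return x86_pmu.num_counters;
	}

	/* x86_schedule_events() then stays PMU-agnostic: */
	int gpmax = x86_pmu.counter_limit ? x86_pmu.counter_limit(cpuc)
					  : x86_pmu.num_counters;

That way AMD, KNC, P6, uncore, etc. never see the workaround at all.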
>  		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> -					     wmax, assign);
> +					     wmax, gpmax, assign);
>  	}
>
>  	/*
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -74,6 +74,7 @@ struct event_constraint {
>  #define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
>  #define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
>  #define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
> +#define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
>
>
>  struct amd_nb {
> @@ -134,8 +135,6 @@ enum intel_excl_state_type {
>  struct intel_excl_states {
>  	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
>  	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
> -	int  num_alloc_cntrs;/* #counters allocated */
> -	int  max_alloc_cntrs;/* max #counters allowed */
>  	bool sched_started; /* true if scheduling has started */
>  };
>
> @@ -144,6 +143,11 @@ struct intel_excl_cntrs {
>
>  	struct intel_excl_states states[2];
>
> +	union {
> +		u16	has_exclusive[2];
> +		u32	exclusive_present;
> +	};
> +
>  	int		refcnt;		/* per-core: #HT threads */
>  	unsigned	core_id;	/* per-core: core id */
>  };
> @@ -176,6 +180,7 @@ struct cpu_hw_events {
>  	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
>  	struct event_constraint	*event_constraint[X86_PMC_IDX_MAX];
>
> +	int			n_excl; /* the number of exclusive events */
>
>  	unsigned int		group_flag;
>  	int			is_fake;
> @@ -719,7 +724,7 @@ static inline void __x86_pmu_enable_even
>  void x86_pmu_enable_all(int added);
>
>  int perf_assign_events(struct event_constraint **constraints, int n,
> -			int wmin, int wmax, int *assign);
> +			int wmin, int wmax, int gpmax, int *assign);
>  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
>
>  void x86_pmu_stop(struct perf_event *event, int flags);
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
>  	xl = &excl_cntrs->states[tid];
>
>  	xl->sched_started = true;
> -	xl->num_alloc_cntrs = 0;
>  	/*
>  	 * lock shared state until we are done scheduling
>  	 * in stop_event_scheduling()
> @@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw
>  	 * across HT threads
>  	 */
>  	is_excl = c->flags & PERF_X86_EVENT_EXCL;
> +	if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
> +		event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
> +		if (!cpuc->n_excl++)
> +			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
> +	}
>
>  	/*
>  	 * xl = state of current HT
> @@ -2008,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw
>  	xl = &excl_cntrs->states[tid];
>  	xlo = &excl_cntrs->states[o_tid];
>
> -	/*
> -	 * do not allow scheduling of more than max_alloc_cntrs
> -	 * which is set to half the available generic counters.
> -	 * this helps avoid counter starvation of sibling thread
> -	 * by ensuring at most half the counters cannot be in
> -	 * exclusive mode. There is not designated counters for the
> -	 * limits. Any N/2 counters can be used. This helps with
> -	 * events with specifix counter constraints
> -	 */
> -	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
> -		return &emptyconstraint;
> -
>  	cx = c;
>
>  	/*
> @@ -2150,6 +2142,11 @@ static void intel_put_excl_constraints(s
>
>  	xl = &excl_cntrs->states[tid];
>  	xlo = &excl_cntrs->states[o_tid];
> +	if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
> +		hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
> +		if (!--cpuc->n_excl)
> +			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
> +	}
>
>  	/*
>  	 * put_constraint may be called from x86_schedule_events()
> @@ -2632,8 +2629,6 @@ static void intel_pmu_cpu_starting(int c
>  		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
>
>  	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
> -		int h = x86_pmu.num_counters >> 1;
> -
>  		for_each_cpu(i, topology_thread_cpumask(cpu)) {
>  			struct intel_excl_cntrs *c;
>
> @@ -2647,11 +2642,6 @@ static void intel_pmu_cpu_starting(int c
>  		}
>  		cpuc->excl_cntrs->core_id = core_id;
>  		cpuc->excl_cntrs->refcnt++;
> -		/*
> -		 * set hard limit to half the number of generic counters
> -		 */
> -		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
> -		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
>  	}
>  }
>
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
>  	/* slow path */
>  	if (i != n)
>  		ret = perf_assign_events(box->event_constraint, n,
> -					 wmin, wmax, assign);
> +					 wmin, wmax, n, assign);
>
>  	if (!assign || ret) {
>  		for (i = 0; i < n; i++)
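
One more question, just to check my reading of the new union in
intel_excl_cntrs: since excl_cntrs is shared by the two siblings of a
core, the two u16 has_exclusive[] entries overlay the single u32
exclusive_present, so when either thread sets its has_exclusive[tid]
with WRITE_ONCE(), the READ_ONCE(exclusive_present) in
x86_schedule_events() becomes non-zero for both threads, correct? If
so, a comment might help the next reader, something like (suggested
comment, not in the patch):

	union {
		u16	has_exclusive[2];	/* one word per HT sibling */
		u32	exclusive_present;	/* != 0 iff either sibling has EXCL events */
	};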