2015-05-22 13:33:34

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

The (SNB/IVB/HSW) HT bug only affects events that can be programmed
onto GP counters, therefore we should only limit the number of GP
counters that can be used per cpu -- iow we should not constrain the
FP counters.

Furthermore, we should only enforce such a limit when there are in fact
exclusive events being scheduled on either sibling.

Reported-by: Vince Weaver <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 36 +++++++++++++++++++++-----
arch/x86/kernel/cpu/perf_event.h | 11 +++++--
arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++--------------
arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -
4 files changed, 49 insertions(+), 30 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -611,6 +611,7 @@ struct sched_state {
int event; /* event index */
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
+ int nr_gp; /* number of GP counters used */
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

@@ -620,9 +621,10 @@ struct sched_state {
struct perf_sched {
int max_weight;
int max_events;
+ int max_gp;
+ int saved_states;
struct event_constraint **constraints;
struct sched_state state;
- int saved_states;
struct sched_state saved[SCHED_STATES_MAX];
};

@@ -630,13 +632,14 @@ struct perf_sched {
* Initialize interator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
- int num, int wmin, int wmax)
+ int num, int wmin, int wmax, int gpmax)
{
int idx;

memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
+ sched->max_gp = gpmax;
sched->constraints = constraints;

for (idx = 0; idx < num; idx++) {
@@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(st
goto done;
}
}
+
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
- if (!__test_and_set_bit(idx, sched->state.used))
+ if (!__test_and_set_bit(idx, sched->state.used)) {
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
goto done;
+ }
}

return false;
@@ -757,11 +765,11 @@ static bool perf_sched_next_event(struct
* Assign a counter for each event.
*/
int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int *assign)
+ int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;

- perf_sched_init(&sched, constraints, n, wmin, wmax);
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

do {
if (!perf_sched_find_counter(&sched))
@@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_ev

/* slow path */
if (i != n) {
+ int gpmax = x86_pmu.num_counters;
+
+ /*
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;
+
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
- wmax, assign);
+ wmax, gpmax, assign);
}

/*
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -74,6 +74,7 @@ struct event_constraint {
#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */


struct amd_nb {
@@ -134,8 +135,6 @@ enum intel_excl_state_type {
struct intel_excl_states {
enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
enum intel_excl_state_type state[X86_PMC_IDX_MAX];
- int num_alloc_cntrs;/* #counters allocated */
- int max_alloc_cntrs;/* max #counters allowed */
bool sched_started; /* true if scheduling has started */
};

@@ -144,6 +143,11 @@ struct intel_excl_cntrs {

struct intel_excl_states states[2];

+ union {
+ u16 has_exclusive[2];
+ u32 exclusive_present;
+ };
+
int refcnt; /* per-core: #HT threads */
unsigned core_id; /* per-core: core id */
};
@@ -176,6 +180,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
struct event_constraint *event_constraint[X86_PMC_IDX_MAX];

+ int n_excl; /* the number of exclusive events */

unsigned int group_flag;
int is_fake;
@@ -719,7 +724,7 @@ static inline void __x86_pmu_enable_even
void x86_pmu_enable_all(int added);

int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int *assign);
+ int wmin, int wmax, int gpmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);

void x86_pmu_stop(struct perf_event *event, int flags);
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
xl = &excl_cntrs->states[tid];

xl->sched_started = true;
- xl->num_alloc_cntrs = 0;
/*
* lock shared state until we are done scheduling
* in stop_event_scheduling()
@@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw
* across HT threads
*/
is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+ event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }

/*
* xl = state of current HT
@@ -2008,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw
xl = &excl_cntrs->states[tid];
xlo = &excl_cntrs->states[o_tid];

- /*
- * do not allow scheduling of more than max_alloc_cntrs
- * which is set to half the available generic counters.
- * this helps avoid counter starvation of sibling thread
- * by ensuring at most half the counters cannot be in
- * exclusive mode. There is not designated counters for the
- * limits. Any N/2 counters can be used. This helps with
- * events with specifix counter constraints
- */
- if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
- return &emptyconstraint;
-
cx = c;

/*
@@ -2150,6 +2142,11 @@ static void intel_put_excl_constraints(s

xl = &excl_cntrs->states[tid];
xlo = &excl_cntrs->states[o_tid];
+ if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+ hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+ if (!--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+ }

/*
* put_constraint may be called from x86_schedule_events()
@@ -2632,8 +2629,6 @@ static void intel_pmu_cpu_starting(int c
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];

if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
- int h = x86_pmu.num_counters >> 1;
-
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_excl_cntrs *c;

@@ -2647,11 +2642,6 @@ static void intel_pmu_cpu_starting(int c
}
cpuc->excl_cntrs->core_id = core_id;
cpuc->excl_cntrs->refcnt++;
- /*
- * set hard limit to half the number of generic counters
- */
- cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
- cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
}
}

--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
/* slow path */
if (i != n)
ret = perf_assign_events(box->event_constraint, n,
- wmin, wmax, assign);
+ wmin, wmax, n, assign);

if (!assign || ret) {
for (i = 0; i < n; i++)


2015-05-22 13:42:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Fri, May 22, 2015 at 03:29:07PM +0200, Peter Zijlstra wrote:
> @@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(st
> goto done;
> }
> }
> +
> /* Grab the first unused counter starting with idx */
> idx = sched->state.counter;
> for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> + if (!__test_and_set_bit(idx, sched->state.used)) {
> + if (sched->state.nr_gp++ >= sched->max_gp)
> + return false;

Note that the check is placed inside the success path of the GP
allocation, instead of at the point where a GP placement is merely
attempted, which is what we had before.

> +
> goto done;
> + }
> }
>
> return false;




> @@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw
> * across HT threads
> */
> is_excl = c->flags & PERF_X86_EVENT_EXCL;
> + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
> + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
> + if (!cpuc->n_excl++)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
> + }
>
> /*
> * xl = state of current HT

And that is what keeps repeated get_event_constraints() calls from ever
increasing our n_excl count.

2015-05-26 13:27:12

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Fri, May 22, 2015 at 6:29 AM, Peter Zijlstra <[email protected]> wrote:
> The (SNB/IVB/HSW) HT bug only affects events that can be programmed
> onto GP counters, therefore we should only limit the number of GP
> counters that can be used per cpu -- iow we should not constrain the
> FP counters.
>
> Furthermore, we should only enfore such a limit when there are in fact
> exclusive events being scheduled on either sibling.
>
> Reported-by: Vince Weaver <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 36 +++++++++++++++++++++-----
> arch/x86/kernel/cpu/perf_event.h | 11 +++++--
> arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++--------------
> arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -
> 4 files changed, 49 insertions(+), 30 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -611,6 +611,7 @@ struct sched_state {
> int event; /* event index */
> int counter; /* counter index */
> int unassigned; /* number of events to be assigned left */
> + int nr_gp; /* number of GP counters used */
> unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> };
>
> @@ -620,9 +621,10 @@ struct sched_state {
> struct perf_sched {
> int max_weight;
> int max_events;
> + int max_gp;
> + int saved_states;
> struct event_constraint **constraints;
> struct sched_state state;
> - int saved_states;
> struct sched_state saved[SCHED_STATES_MAX];
> };
>
> @@ -630,13 +632,14 @@ struct perf_sched {
> * Initialize interator that runs through all events and counters.
> */
> static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
> - int num, int wmin, int wmax)
> + int num, int wmin, int wmax, int gpmax)
> {
> int idx;
>
> memset(sched, 0, sizeof(*sched));
> sched->max_events = num;
> sched->max_weight = wmax;
> + sched->max_gp = gpmax;
> sched->constraints = constraints;
>
> for (idx = 0; idx < num; idx++) {
> @@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(st
> goto done;
> }
> }
> +
> /* Grab the first unused counter starting with idx */
> idx = sched->state.counter;
> for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> - if (!__test_and_set_bit(idx, sched->state.used))
> + if (!__test_and_set_bit(idx, sched->state.used)) {
> + if (sched->state.nr_gp++ >= sched->max_gp)
> + return false;
> +
> goto done;
> + }
> }
>
> return false;
> @@ -757,11 +765,11 @@ static bool perf_sched_next_event(struct
> * Assign a counter for each event.
> */
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign)
> + int wmin, int wmax, int gpmax, int *assign)
> {
> struct perf_sched sched;
>
> - perf_sched_init(&sched, constraints, n, wmin, wmax);
> + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
>
> do {
> if (!perf_sched_find_counter(&sched))
> @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_ev
>
> /* slow path */
> if (i != n) {
> + int gpmax = x86_pmu.num_counters;
> +
> + /*
> + * Do not allow scheduling of more than half the available
> + * generic counters.
> + *
> + * This helps avoid counter starvation of sibling thread by
> + * ensuring at most half the counters cannot be in exclusive
> + * mode. There is no designated counters for the limits. Any
> + * N/2 counters can be used. This helps with events with
> + * specific counter constraints.
> + */
> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> + gpmax /= 2;
> +
What I don't like about this part is that this is a hack to work around a bug
on some limited Intel CPUs and yet it is in the middle of generic x86 code.
I understand it will be inoperative on AMD PMU and is not used by Intel
uncore. On KNC or P6, you will not have is_ht_workaround_enabled().
Could this be made an x86_pmu callback, e.g. x86_pmu.counter_limit()?


> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> - wmax, assign);
> + wmax, gpmax, assign);
> }
>
> /*
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -74,6 +74,7 @@ struct event_constraint {
> #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
> #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
> #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
> +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */
>
>
> struct amd_nb {
> @@ -134,8 +135,6 @@ enum intel_excl_state_type {
> struct intel_excl_states {
> enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
> enum intel_excl_state_type state[X86_PMC_IDX_MAX];
> - int num_alloc_cntrs;/* #counters allocated */
> - int max_alloc_cntrs;/* max #counters allowed */
> bool sched_started; /* true if scheduling has started */
> };
>
> @@ -144,6 +143,11 @@ struct intel_excl_cntrs {
>
> struct intel_excl_states states[2];
>
> + union {
> + u16 has_exclusive[2];
> + u32 exclusive_present;
> + };
> +
> int refcnt; /* per-core: #HT threads */
> unsigned core_id; /* per-core: core id */
> };
> @@ -176,6 +180,7 @@ struct cpu_hw_events {
> struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
> struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
>
> + int n_excl; /* the number of exclusive events */
>
> unsigned int group_flag;
> int is_fake;
> @@ -719,7 +724,7 @@ static inline void __x86_pmu_enable_even
> void x86_pmu_enable_all(int added);
>
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign);
> + int wmin, int wmax, int gpmax, int *assign);
> int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
>
> void x86_pmu_stop(struct perf_event *event, int flags);
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
> xl = &excl_cntrs->states[tid];
>
> xl->sched_started = true;
> - xl->num_alloc_cntrs = 0;
> /*
> * lock shared state until we are done scheduling
> * in stop_event_scheduling()
> @@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw
> * across HT threads
> */
> is_excl = c->flags & PERF_X86_EVENT_EXCL;
> + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
> + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
> + if (!cpuc->n_excl++)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
> + }
>
> /*
> * xl = state of current HT
> @@ -2008,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
>
> - /*
> - * do not allow scheduling of more than max_alloc_cntrs
> - * which is set to half the available generic counters.
> - * this helps avoid counter starvation of sibling thread
> - * by ensuring at most half the counters cannot be in
> - * exclusive mode. There is not designated counters for the
> - * limits. Any N/2 counters can be used. This helps with
> - * events with specifix counter constraints
> - */
> - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
> - return &emptyconstraint;
> -
> cx = c;
>
> /*
> @@ -2150,6 +2142,11 @@ static void intel_put_excl_constraints(s
>
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
> + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
> + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
> + if (!--cpuc->n_excl)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
> + }
>
> /*
> * put_constraint may be called from x86_schedule_events()
> @@ -2632,8 +2629,6 @@ static void intel_pmu_cpu_starting(int c
> cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
>
> if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
> - int h = x86_pmu.num_counters >> 1;
> -
> for_each_cpu(i, topology_thread_cpumask(cpu)) {
> struct intel_excl_cntrs *c;
>
> @@ -2647,11 +2642,6 @@ static void intel_pmu_cpu_starting(int c
> }
> cpuc->excl_cntrs->core_id = core_id;
> cpuc->excl_cntrs->refcnt++;
> - /*
> - * set hard limit to half the number of generic counters
> - */
> - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
> - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
> }
> }
>
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
> /* slow path */
> if (i != n)
> ret = perf_assign_events(box->event_constraint, n,
> - wmin, wmax, assign);
> + wmin, wmax, n, assign);
>
> if (!assign || ret) {
> for (i = 0; i < n; i++)
>
>

2015-05-26 10:17:12

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

Please trim your email.

On Tue, May 26, 2015 at 02:37:52AM -0700, Stephane Eranian wrote:
> > @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_ev
> >
> > /* slow path */
> > if (i != n) {
> > + int gpmax = x86_pmu.num_counters;
> > +
> > + /*
> > + * Do not allow scheduling of more than half the available
> > + * generic counters.
> > + *
> > + * This helps avoid counter starvation of sibling thread by
> > + * ensuring at most half the counters cannot be in exclusive
> > + * mode. There is no designated counters for the limits. Any
> > + * N/2 counters can be used. This helps with events with
> > + * specific counter constraints.
> > + */
> > + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> > + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> > + gpmax /= 2;
> > +
> What I don't like about this part is that this is a hack to work around a bug
> on some limited Intel CPUs and yet it is in the middle of generic x86 code.
> I understand it will be inoperative on AMD PMU and is not used by Intel
> uncore. On KNC or P6, you will not have is_ht_workaround_enabled().
> Could this be made a x86_pmu callback? x86_pmu.counter_limit()?

It'll be slower though. You get an indirect function call in there.

But sure we can clean that up later if you like; there's other things
needing to be fixed here first.

I'm going to overhaul the whole get/put constraints stuff first.

2015-05-26 13:40:59

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Tue, May 26, 2015 at 3:15 AM, Peter Zijlstra <[email protected]> wrote:
> Please trim your email.
>
> On Tue, May 26, 2015 at 02:37:52AM -0700, Stephane Eranian wrote:
>> > @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_ev
>> >
>> > /* slow path */
>> > if (i != n) {
>> > + int gpmax = x86_pmu.num_counters;
>> > +
>> > + /*
>> > + * Do not allow scheduling of more than half the available
>> > + * generic counters.
>> > + *
>> > + * This helps avoid counter starvation of sibling thread by
>> > + * ensuring at most half the counters cannot be in exclusive
>> > + * mode. There is no designated counters for the limits. Any
>> > + * N/2 counters can be used. This helps with events with
>> > + * specific counter constraints.
>> > + */
>> > + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
>> > + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
>> > + gpmax /= 2;
>> > +
>> What I don't like about this part is that this is a hack to work around a bug
>> on some limited Intel CPUs and yet it is in the middle of generic x86 code.
>> I understand it will be inoperative on AMD PMU and is not used by Intel
>> uncore. On KNC or P6, you will not have is_ht_workaround_enabled().
>> Could this be made a x86_pmu callback? x86_pmu.counter_limit()?
>
> It'll be slower though. You get an indirect function call in there.
>
> But sure we can clean that up later if you like; there's other things
> needing to be fixed here first.
>
> I'm going to overhaul the whole get/put constraints stuff first.

Ok, I think it would be good to balance the number of get/put calls. It
would avoid the confusion. Is that what you are thinking about?

2015-05-26 13:20:19

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Tue, May 26, 2015 at 04:47:11AM -0700, Stephane Eranian wrote:
> > I'm going to overhaul the whole get/put constraints stuff first.
>
> Ok, I think it would be good to balance to number of get/put. It would
> avoid the confusion. Is that what you are thinking about?

Yes, and remove the few associated modifications to events.

2015-05-26 16:07:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

On Tue, May 26, 2015 at 03:19:50PM +0200, Peter Zijlstra wrote:
> On Tue, May 26, 2015 at 04:47:11AM -0700, Stephane Eranian wrote:
> > > I'm going to overhaul the whole get/put constraints stuff first.
> >
> > Ok, I think it would be good to balance to number of get/put. It would
> > avoid the confusion. Is that what you are thinking about?
>
> Yes, and remove the few associated modifications to events.

I have the below (in 4 patches); compile tested only so far.

---
perf_event.c | 51 +++++++++++++++++++++++++--------------------------
perf_event.h | 4 ++--
perf_event_intel.c | 40 ++++------------------------------------
3 files changed, 31 insertions(+), 64 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -810,9 +810,15 @@ int x86_schedule_events(struct cpu_hw_ev
x86_pmu.start_scheduling(cpuc);

for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
- cpuc->event_constraint[i] = NULL;
- c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
- cpuc->event_constraint[i] = c;
+ /*
+ * Only call get_event_constraints() once!
+ */
+ c = cpuc->event_constraint[i];
+ if (!c) {
+ e = cpuc->event_list[i];
+ c = x86_pmu.get_event_constraints(cpuc, i, e);
+ cpuc->event_constraint[i] = c;
+ }

wmin = min(wmin, c->weight);
wmax = max(wmax, c->weight);
@@ -875,27 +881,23 @@ int x86_schedule_events(struct cpu_hw_ev
* validate an event group (assign == NULL)
*/
if (!unsched && assign) {
- for (i = 0; i < n; i++) {
- e = cpuc->event_list[i];
- e->hw.flags |= PERF_X86_EVENT_COMMITTED;
- if (x86_pmu.commit_scheduling)
+ if (x86_pmu.commit_scheduling) {
+ for (i = 0; i < n; i++) {
x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+ }
}
- } else {
- for (i = 0; i < n; i++) {
- e = cpuc->event_list[i];
- /*
- * do not put_constraint() on comitted events,
- * because they are good to go
- */
- if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
- continue;
+ } else if (x86_pmu.put_event_constraints) {
+ /* x86_pmu_add() will not yet have updated n_events */
+ i = cpuc->n_events;
+
+ /* x86_pmu_commit_txn() relies on n_txn */
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ i -= cpuc->n_txn;

- /*
- * release events that failed scheduling
- */
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, e);
+ for (; i < n; i++) {
+ e = cpuc->event_list[i];
+ /* release events that failed scheduling */
+ x86_pmu.put_event_constraints(cpuc, e);
}
}

@@ -923,6 +925,7 @@ static int collect_events(struct cpu_hw_
if (n >= max_count)
return -EINVAL;
cpuc->event_list[n] = leader;
+ cpuc->event_constraint[n] = NULL;
n++;
}
if (!dogrp)
@@ -937,6 +940,7 @@ static int collect_events(struct cpu_hw_
return -EINVAL;

cpuc->event_list[n] = event;
+ cpuc->event_constraint[n] = NULL;
n++;
}
return n;
@@ -1295,11 +1299,6 @@ static void x86_pmu_del(struct perf_even
int i;

/*
- * event is descheduled
- */
- event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
- /*
* If we're called during a txn, we don't need to do anything.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -68,13 +68,13 @@ struct event_constraint {
#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
+
#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */
+
#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */
#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */

--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1955,14 +1955,6 @@ __intel_shared_reg_get_constraints(struc
unsigned long flags;
int idx = reg->idx;

- /*
- * reg->alloc can be set due to existing state, so for fake cpuc we
- * need to ignore this, otherwise we might fail to allocate proper fake
- * state for this extra reg constraint. Also see the comment below.
- */
- if (reg->alloc && !cpuc->is_fake)
- return NULL; /* call x86_get_event_constraint() */
-
again:
era = &cpuc->shared_regs->regs[idx];
/*
@@ -1986,14 +1978,6 @@ __intel_shared_reg_get_constraints(struc
if (!cpuc->is_fake) {
if (idx != reg->idx)
intel_fixup_er(event, idx);
-
- /*
- * x86_schedule_events() can call get_event_constraints()
- * multiple times on events in the case of incremental
- * scheduling(). reg->alloc ensures we only do the ER
- * allocation once.
- */
- reg->alloc = 1;
}

/* lock in msr value */
@@ -2026,24 +2010,12 @@ __intel_shared_reg_put_constraints(struc
{
struct er_account *era;

- /*
- * Only put constraint if extra reg was actually allocated. Also takes
- * care of event which do not use an extra shared reg.
- *
- * Also, if this is a fake cpuc we shouldn't touch any event state
- * (reg->alloc) and we don't care about leaving inconsistent cpuc state
- * either since it'll be thrown out.
- */
- if (!reg->alloc || cpuc->is_fake)
- return;
+ WARN_ON_ONCE(cpuc->is_fake);

era = &cpuc->shared_regs->regs[reg->idx];

/* one fewer user */
atomic_dec(&era->ref);
-
- /* allocate again next time */
- reg->alloc = 0;
}

static struct event_constraint *
@@ -2261,8 +2233,7 @@ intel_get_excl_constraints(struct cpu_hw
* across HT threads
*/
is_excl = c->flags & PERF_X86_EVENT_EXCL;
- if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
- event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (is_excl) {
if (!cpuc->n_excl++)
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
}
@@ -2350,11 +2321,8 @@ static void intel_put_excl_constraints(s
if (WARN_ON_ONCE(!excl_cntrs))
return;

- if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
- hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
- if (!--cpuc->n_excl)
- WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
- }
+ if ((hwc->flags & PERF_X86_EVENT_EXCL) && !--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);

/*
* If event was actually assigned, then mark the counter state as

2015-05-26 23:33:45

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] perf/x86: Improve HT workaround GP counter constraint

Peter Zijlstra <[email protected]> writes:
> + */
> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&

Could this function also check if at least one leaking event is
scheduled somewhere? (e.g. from a global count)

-Andi

--
[email protected] -- Speaking for myself only