The (SNB/IVB/HSW) HT bug only affects events that can be programmed
onto GP counters, therefore we should only limit the number of GP
counters that can be used per cpu -- iow we should not constrain the
FP counters.
This patch changes the code such that we only count GP counters.
Reported-by: Vince Weaver <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++
arch/x86/kernel/cpu/perf_event.h | 2 --
arch/x86/kernel/cpu/perf_event_intel.c | 20 --------------------
3 files changed, 15 insertions(+), 22 deletions(-)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -611,6 +611,7 @@ struct sched_state {
int event; /* event index */
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
+ int nr_gp_counters; /* number of GP counters used */
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};
@@ -696,6 +697,20 @@ static bool __perf_sched_find_counter(st
goto done;
}
}
+
+ /*
+ * Do not allow scheduling of more than half the available generic
+ * counters.
+ *
+ * This helps avoid counter starvation of sibling thread by ensuring at
+ * most half the counters cannot be in exclusive mode. There is no
+ * designated counters for the limits. Any N/2 counters can be used.
+ * This helps with events with specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() &&
+ sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
+ return false;
+
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -134,8 +134,6 @@ enum intel_excl_state_type {
struct intel_excl_states {
enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
enum intel_excl_state_type state[X86_PMC_IDX_MAX];
- int num_alloc_cntrs;/* #counters allocated */
- int max_alloc_cntrs;/* max #counters allowed */
bool sched_started; /* true if scheduling has started */
};
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
xl = &excl_cntrs->states[tid];
xl->sched_started = true;
- xl->num_alloc_cntrs = 0;
/*
* lock shared state until we are done scheduling
* in stop_event_scheduling()
@@ -2008,18 +2007,6 @@ intel_get_excl_constraints(struct cpu_hw
xl = &excl_cntrs->states[tid];
xlo = &excl_cntrs->states[o_tid];
- /*
- * do not allow scheduling of more than max_alloc_cntrs
- * which is set to half the available generic counters.
- * this helps avoid counter starvation of sibling thread
- * by ensuring at most half the counters cannot be in
- * exclusive mode. There is not designated counters for the
- * limits. Any N/2 counters can be used. This helps with
- * events with specifix counter constraints
- */
- if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
- return &emptyconstraint;
-
cx = c;
/*
@@ -2632,8 +2619,6 @@ static void intel_pmu_cpu_starting(int c
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
- int h = x86_pmu.num_counters >> 1;
-
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_excl_cntrs *c;
@@ -2647,11 +2632,6 @@ static void intel_pmu_cpu_starting(int c
}
cpuc->excl_cntrs->core_id = core_id;
cpuc->excl_cntrs->refcnt++;
- /*
- * set hard limit to half the number of generic counters
- */
- cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
- cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
}
}
On Thu, May 21, 2015 at 4:17 AM, Peter Zijlstra <[email protected]> wrote:
>
> The (SNB/IVB/HSW) HT bug only affects events that can be programmed
> onto GP counters, therefore we should only limit the number of GP
> counters that can be used per cpu -- iow we should not constrain the
> FP counters.
>
> This patch changes the code such that we only count GP counters.
>
> Reported-by: Vince Weaver <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++
> arch/x86/kernel/cpu/perf_event.h | 2 --
> arch/x86/kernel/cpu/perf_event_intel.c | 20 --------------------
> 3 files changed, 15 insertions(+), 22 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -611,6 +611,7 @@ struct sched_state {
> int event; /* event index */
> int counter; /* counter index */
> int unassigned; /* number of events to be assigned left */
> + int nr_gp_counters; /* number of GP counters used */
> unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> };
>
> @@ -696,6 +697,20 @@ static bool __perf_sched_find_counter(st
> goto done;
> }
> }
> +
> + /*
> + * Do not allow scheduling of more than half the available generic
> + * counters.
> + *
> + * This helps avoid counter starvation of sibling thread by ensuring at
> + * most half the counters cannot be in exclusive mode. There is no
> + * designated counters for the limits. Any N/2 counters can be used.
> + * This helps with events with specific counter constraints.
> + */
> + if (is_ht_workaround_enabled() &&
> + sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
> + return false;
> +
Has to be > and not >=, otherwise:
$ perf stat -a -C 0 -e r81d0,r81d0 -I 1000 sleep 100
gives you 50% multiplexing when it should be 0%, based on my testing.
> /* Grab the first unused counter starting with idx */
> idx = sched->state.counter;
> for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -134,8 +134,6 @@ enum intel_excl_state_type {
> struct intel_excl_states {
> enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
> enum intel_excl_state_type state[X86_PMC_IDX_MAX];
> - int num_alloc_cntrs;/* #counters allocated */
> - int max_alloc_cntrs;/* max #counters allowed */
> bool sched_started; /* true if scheduling has started */
> };
>
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
> xl = &excl_cntrs->states[tid];
>
> xl->sched_started = true;
> - xl->num_alloc_cntrs = 0;
> /*
> * lock shared state until we are done scheduling
> * in stop_event_scheduling()
> @@ -2008,18 +2007,6 @@ intel_get_excl_constraints(struct cpu_hw
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
>
> - /*
> - * do not allow scheduling of more than max_alloc_cntrs
> - * which is set to half the available generic counters.
> - * this helps avoid counter starvation of sibling thread
> - * by ensuring at most half the counters cannot be in
> - * exclusive mode. There is not designated counters for the
> - * limits. Any N/2 counters can be used. This helps with
> - * events with specifix counter constraints
> - */
> - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
> - return &emptyconstraint;
> -
> cx = c;
>
> /*
> @@ -2632,8 +2619,6 @@ static void intel_pmu_cpu_starting(int c
> cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
>
> if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
> - int h = x86_pmu.num_counters >> 1;
> -
> for_each_cpu(i, topology_thread_cpumask(cpu)) {
> struct intel_excl_cntrs *c;
>
> @@ -2647,11 +2632,6 @@ static void intel_pmu_cpu_starting(int c
> }
> cpuc->excl_cntrs->core_id = core_id;
> cpuc->excl_cntrs->refcnt++;
> - /*
> - * set hard limit to half the number of generic counters
> - */
> - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
> - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
> }
> }
>
>
>
On Fri, May 22, 2015 at 03:04:45AM -0700, Stephane Eranian wrote:
> > + if (is_ht_workaround_enabled() &&
> > + sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
> > + return false;
> > +
>
> Has to be > and not >= otherwise:
but it's a post-increment, so we'll return 0, 1, 2, ... With > we'll match
after 3 GP events.
I'll agree it's not working right, though.
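For illustration, a minimal user-space sketch (not kernel code) of the two
comparison variants being debated, assuming a 4-counter PMU so that
gpmax = x86_pmu.num_counters / 2 = 2:

#include <stdio.h>

/* Count how many GP events the post-increment check admits. */
static int events_admitted(int gpmax, int use_ge)
{
	int nr = 0, admitted = 0, i;

	for (i = 0; i < 8; i++) {
		int cur = nr++;		/* post-increment, as in the patch */

		if (use_ge ? (cur >= gpmax) : (cur > gpmax))
			break;		/* __perf_sched_find_counter() would return false */
		admitted++;
	}
	return admitted;
}

int main(void)
{
	int gpmax = 4 / 2;	/* x86_pmu.num_counters / 2 on a 4-counter PMU */

	printf("nr++ >= gpmax admits %d events\n", events_admitted(gpmax, 1)); /* 2 */
	printf("nr++ >  gpmax admits %d events\n", events_admitted(gpmax, 0)); /* 3 */
	return 0;
}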
FWIW, I currently have the below, which also isn't working right.
It should not enforce the limit when there are no exclusive events being
scheduled.
It also doesn't break uncore scheduling.
---
arch/x86/kernel/cpu/perf_event.c | 31 ++++++++++++++++++++++----
arch/x86/kernel/cpu/perf_event.h | 10 +++++---
arch/x86/kernel/cpu/perf_event_intel.c | 28 ++++++-----------------
arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -
4 files changed, 43 insertions(+), 28 deletions(-)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -611,6 +611,7 @@ struct sched_state {
int event; /* event index */
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
+ int nr_gp; /* number of GP counters used */
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};
@@ -620,6 +621,7 @@ struct sched_state {
struct perf_sched {
int max_weight;
int max_events;
+ int max_gp;
struct event_constraint **constraints;
struct sched_state state;
int saved_states;
@@ -630,13 +632,14 @@ struct perf_sched {
* Initialize interator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
- int num, int wmin, int wmax)
+ int num, int wmin, int wmax, int gpmax)
{
int idx;
memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
+ sched->max_gp = gpmax;
sched->constraints = constraints;
for (idx = 0; idx < num; idx++) {
@@ -696,6 +699,10 @@ static bool __perf_sched_find_counter(st
goto done;
}
}
+
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
@@ -757,11 +764,11 @@ static bool perf_sched_next_event(struct
* Assign a counter for each event.
*/
int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int *assign)
+ int wmin, int wmax, int gpmax, int *assign)
{
struct perf_sched sched;
- perf_sched_init(&sched, constraints, n, wmin, wmax);
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
do {
if (!perf_sched_find_counter(&sched))
@@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
/* slow path */
if (i != n) {
+ int gpmax = x86_pmu.num_counters / 2;
+
+ /*
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;
+
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
- wmax, assign);
+ wmax, gpmax, assign);
}
/*
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -134,8 +134,6 @@ enum intel_excl_state_type {
struct intel_excl_states {
enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
enum intel_excl_state_type state[X86_PMC_IDX_MAX];
- int num_alloc_cntrs;/* #counters allocated */
- int max_alloc_cntrs;/* max #counters allowed */
bool sched_started; /* true if scheduling has started */
};
@@ -144,6 +142,11 @@ struct intel_excl_cntrs {
struct intel_excl_states states[2];
+ union {
+ u16 has_exclusive[2];
+ u32 exclusive_present;
+ };
+
int refcnt; /* per-core: #HT threads */
unsigned core_id; /* per-core: core id */
};
@@ -176,6 +179,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+ int n_excl; /* the number of exclusive events */
unsigned int group_flag;
int is_fake;
@@ -719,7 +723,7 @@ static inline void __x86_pmu_enable_even
void x86_pmu_enable_all(int added);
int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int *assign);
+ int wmin, int wmax, int gpmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
void x86_pmu_stop(struct perf_event *event, int flags);
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
xl = &excl_cntrs->states[tid];
xl->sched_started = true;
- xl->num_alloc_cntrs = 0;
/*
* lock shared state until we are done scheduling
* in stop_event_scheduling()
@@ -2000,6 +1999,10 @@ intel_get_excl_constraints(struct cpu_hw
* across HT threads
*/
is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl) {
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }
/*
* xl = state of current HT
@@ -2008,18 +2011,6 @@ intel_get_excl_constraints(struct cpu_hw
xl = &excl_cntrs->states[tid];
xlo = &excl_cntrs->states[o_tid];
- /*
- * do not allow scheduling of more than max_alloc_cntrs
- * which is set to half the available generic counters.
- * this helps avoid counter starvation of sibling thread
- * by ensuring at most half the counters cannot be in
- * exclusive mode. There is not designated counters for the
- * limits. Any N/2 counters can be used. This helps with
- * events with specifix counter constraints
- */
- if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
- return &emptyconstraint;
-
cx = c;
/*
@@ -2150,6 +2141,10 @@ static void intel_put_excl_constraints(s
xl = &excl_cntrs->states[tid];
xlo = &excl_cntrs->states[o_tid];
+ if (hwc->flags & PERF_X86_EVENT_EXCL) {
+ if (!--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+ }
/*
* put_constraint may be called from x86_schedule_events()
@@ -2632,8 +2627,6 @@ static void intel_pmu_cpu_starting(int c
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
- int h = x86_pmu.num_counters >> 1;
-
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_excl_cntrs *c;
@@ -2647,11 +2640,6 @@ static void intel_pmu_cpu_starting(int c
}
cpuc->excl_cntrs->core_id = core_id;
cpuc->excl_cntrs->refcnt++;
- /*
- * set hard limit to half the number of generic counters
- */
- cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
- cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
}
}
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
/* slow path */
if (i != n)
ret = perf_assign_events(box->event_constraint, n,
- wmin, wmax, assign);
+ wmin, wmax, n, assign);
if (!assign || ret) {
for (i = 0; i < n; i++)
On Fri, May 22, 2015 at 4:21 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 03:04:45AM -0700, Stephane Eranian wrote:
>> > + if (is_ht_workaround_enabled() &&
>> > + sched->state.nr_gp_counters++ >= x86_pmu.num_counters / 2)
>> > + return false;
>> > +
>>
>> Has to be > and not >= otherwise:
>
> but its a post inc, so we'll return: 0, 1, 2, ... With > we'll match
> after 3 gp events.
>
> I'll agree its not working right though.
>
> FWIW, I currently have the below; which also isn't working right.
>
> It should not enforce the limit when there's no exclusive events being
> scheduled.
>
> It also doesn't break uncore scheduling.
>
> ---
> arch/x86/kernel/cpu/perf_event.c | 31 ++++++++++++++++++++++----
> arch/x86/kernel/cpu/perf_event.h | 10 +++++---
> arch/x86/kernel/cpu/perf_event_intel.c | 28 ++++++-----------------
> arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -
> 4 files changed, 43 insertions(+), 28 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -611,6 +611,7 @@ struct sched_state {
> int event; /* event index */
> int counter; /* counter index */
> int unassigned; /* number of events to be assigned left */
> + int nr_gp; /* number of GP counters used */
> unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> };
>
> @@ -620,6 +621,7 @@ struct sched_state {
> struct perf_sched {
> int max_weight;
> int max_events;
> + int max_gp;
> struct event_constraint **constraints;
> struct sched_state state;
> int saved_states;
> @@ -630,13 +632,14 @@ struct perf_sched {
> * Initialize interator that runs through all events and counters.
> */
> static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
> - int num, int wmin, int wmax)
> + int num, int wmin, int wmax, int gpmax)
> {
> int idx;
>
> memset(sched, 0, sizeof(*sched));
> sched->max_events = num;
> sched->max_weight = wmax;
> + sched->max_gp = gpmax;
> sched->constraints = constraints;
>
> for (idx = 0; idx < num; idx++) {
> @@ -696,6 +699,10 @@ static bool __perf_sched_find_counter(st
> goto done;
> }
> }
> +
> + if (sched->state.nr_gp++ >= sched->max_gp)
> + return false;
> +
> /* Grab the first unused counter starting with idx */
> idx = sched->state.counter;
> for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
> @@ -757,11 +764,11 @@ static bool perf_sched_next_event(struct
> * Assign a counter for each event.
> */
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign)
> + int wmin, int wmax, int gpmax, int *assign)
> {
> struct perf_sched sched;
>
> - perf_sched_init(&sched, constraints, n, wmin, wmax);
> + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
>
> do {
> if (!perf_sched_find_counter(&sched))
> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>
> /* slow path */
> if (i != n) {
> + int gpmax = x86_pmu.num_counters / 2;
> +
> + /*
> + * Do not allow scheduling of more than half the available
> + * generic counters.
> + *
> + * This helps avoid counter starvation of sibling thread by
> + * ensuring at most half the counters cannot be in exclusive
> + * mode. There is no designated counters for the limits. Any
> + * N/2 counters can be used. This helps with events with
> + * specific counter constraints.
> + */
> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> + gpmax /= 2;
> +
That's num_counters / 4!
I think you meant: int gpmax = x86_pmu.num_counters;
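In other words, the hunk presumably ends up looking roughly like this (an
illustrative sketch along the lines of Stephane's suggestion, not the
re-posted fix):

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Start from the full set of generic counters and only
		 * halve it when the HT workaround is active and this core
		 * actually has exclusive events scheduled.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}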
> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> - wmax, assign);
> + wmax, gpmax, assign);
> }
>
> /*
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -134,8 +134,6 @@ enum intel_excl_state_type {
> struct intel_excl_states {
> enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
> enum intel_excl_state_type state[X86_PMC_IDX_MAX];
> - int num_alloc_cntrs;/* #counters allocated */
> - int max_alloc_cntrs;/* max #counters allowed */
> bool sched_started; /* true if scheduling has started */
> };
>
> @@ -144,6 +142,11 @@ struct intel_excl_cntrs {
>
> struct intel_excl_states states[2];
>
> + union {
> + u16 has_exclusive[2];
> + u32 exclusive_present;
> + };
> +
> int refcnt; /* per-core: #HT threads */
> unsigned core_id; /* per-core: core id */
> };
> @@ -176,6 +179,7 @@ struct cpu_hw_events {
> struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
> struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
>
> + int n_excl; /* the number of exclusive events */
>
> unsigned int group_flag;
> int is_fake;
> @@ -719,7 +723,7 @@ static inline void __x86_pmu_enable_even
> void x86_pmu_enable_all(int added);
>
> int perf_assign_events(struct event_constraint **constraints, int n,
> - int wmin, int wmax, int *assign);
> + int wmin, int wmax, int gpmax, int *assign);
> int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
>
> void x86_pmu_stop(struct perf_event *event, int flags);
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_eve
> xl = &excl_cntrs->states[tid];
>
> xl->sched_started = true;
> - xl->num_alloc_cntrs = 0;
> /*
> * lock shared state until we are done scheduling
> * in stop_event_scheduling()
> @@ -2000,6 +1999,10 @@ intel_get_excl_constraints(struct cpu_hw
> * across HT threads
> */
> is_excl = c->flags & PERF_X86_EVENT_EXCL;
> + if (is_excl) {
> + if (!cpuc->n_excl++)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
> + }
>
> /*
> * xl = state of current HT
> @@ -2008,18 +2011,6 @@ intel_get_excl_constraints(struct cpu_hw
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
>
> - /*
> - * do not allow scheduling of more than max_alloc_cntrs
> - * which is set to half the available generic counters.
> - * this helps avoid counter starvation of sibling thread
> - * by ensuring at most half the counters cannot be in
> - * exclusive mode. There is not designated counters for the
> - * limits. Any N/2 counters can be used. This helps with
> - * events with specifix counter constraints
> - */
> - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
> - return &emptyconstraint;
> -
> cx = c;
>
> /*
> @@ -2150,6 +2141,10 @@ static void intel_put_excl_constraints(s
>
> xl = &excl_cntrs->states[tid];
> xlo = &excl_cntrs->states[o_tid];
> + if (hwc->flags & PERF_X86_EVENT_EXCL) {
> + if (!--cpuc->n_excl)
> + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
> + }
>
> /*
> * put_constraint may be called from x86_schedule_events()
> @@ -2632,8 +2627,6 @@ static void intel_pmu_cpu_starting(int c
> cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
>
> if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
> - int h = x86_pmu.num_counters >> 1;
> -
> for_each_cpu(i, topology_thread_cpumask(cpu)) {
> struct intel_excl_cntrs *c;
>
> @@ -2647,11 +2640,6 @@ static void intel_pmu_cpu_starting(int c
> }
> cpuc->excl_cntrs->core_id = core_id;
> cpuc->excl_cntrs->refcnt++;
> - /*
> - * set hard limit to half the number of generic counters
> - */
> - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
> - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
> }
> }
>
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -394,7 +394,7 @@ static int uncore_assign_events(struct i
> /* slow path */
> if (i != n)
> ret = perf_assign_events(box->event_constraint, n,
> - wmin, wmax, assign);
> + wmin, wmax, n, assign);
>
> if (!assign || ret) {
> for (i = 0; i < n; i++)
On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>
> /* slow path */
> if (i != n) {
> + int gpmax = x86_pmu.num_counters / 2;
> +
> + /*
> + * Do not allow scheduling of more than half the available
> + * generic counters.
> + *
> + * This helps avoid counter starvation of sibling thread by
> + * ensuring at most half the counters cannot be in exclusive
> + * mode. There is no designated counters for the limits. Any
> + * N/2 counters can be used. This helps with events with
> + * specific counter constraints.
> + */
> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> + gpmax /= 2;
> +
> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> - wmax, assign);
> + wmax, gpmax, assign);
> }
>
Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
>> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>>
>> /* slow path */
>> if (i != n) {
>> + int gpmax = x86_pmu.num_counters / 2;
>> +
>> + /*
>> + * Do not allow scheduling of more than half the available
>> + * generic counters.
>> + *
>> + * This helps avoid counter starvation of sibling thread by
>> + * ensuring at most half the counters cannot be in exclusive
>> + * mode. There is no designated counters for the limits. Any
>> + * N/2 counters can be used. This helps with events with
>> + * specific counter constraints.
>> + */
>> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
>> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
>> + gpmax /= 2;
>> +
>> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
>> - wmax, assign);
>> + wmax, gpmax, assign);
>> }
>>
>
> Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
Yes, that's what I said. The other problem is that, with no watchdog, measuring
a non-corrupting event is still multiplexing when more than 2 instances
are passed:
$ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
I get 50% scheduling, only 2 out of 4 events scheduled at any time.
There is nothing running on the sibling thread, so it should let me run with 4
instances as per your patch.
On Fri, May 22, 2015 at 05:35:14AM -0700, Stephane Eranian wrote:
> On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
> >> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
> >>
> >> /* slow path */
> >> if (i != n) {
> >> + int gpmax = x86_pmu.num_counters / 2;
> >> +
> >> + /*
> >> + * Do not allow scheduling of more than half the available
> >> + * generic counters.
> >> + *
> >> + * This helps avoid counter starvation of sibling thread by
> >> + * ensuring at most half the counters cannot be in exclusive
> >> + * mode. There is no designated counters for the limits. Any
> >> + * N/2 counters can be used. This helps with events with
> >> + * specific counter constraints.
> >> + */
> >> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> >> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> >> + gpmax /= 2;
> >> +
> >> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> >> - wmax, assign);
> >> + wmax, gpmax, assign);
> >> }
> >>
> >
> > Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
>
> Yes, that's what I said. Other problem is, with no watchdog, measuring
> a non-corrupting event is still multiplexing when more than 2 instances
> are passed:
> $ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
>
> I get 50% scheduling, only 2 out of 4 events scheduled at any time.
>
> There is nothing running on the sibling thread, so it should let me run with 4
> instances as per your patch.
Ah, I limited it to n/2 if either of the siblings has an exclusive event
on.
In any case, there are at least two more bugs in that patch that I know
of, one of which I've fixed; the other I know how to fix.
Will re-post in a bit once I've got it tested.
On Fri, May 22, 2015 at 5:53 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 05:35:14AM -0700, Stephane Eranian wrote:
>> On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
>> >> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>> >>
>> >> /* slow path */
>> >> if (i != n) {
>> >> + int gpmax = x86_pmu.num_counters / 2;
>> >> +
>> >> + /*
>> >> + * Do not allow scheduling of more than half the available
>> >> + * generic counters.
>> >> + *
>> >> + * This helps avoid counter starvation of sibling thread by
>> >> + * ensuring at most half the counters cannot be in exclusive
>> >> + * mode. There is no designated counters for the limits. Any
>> >> + * N/2 counters can be used. This helps with events with
>> >> + * specific counter constraints.
>> >> + */
>> >> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
>> >> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
>> >> + gpmax /= 2;
>> >> +
>> >> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
>> >> - wmax, assign);
>> >> + wmax, gpmax, assign);
>> >> }
>> >>
>> >
>> > Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
>>
>> Yes, that's what I said. Other problem is, with no watchdog, measuring
>> a non-corrupting event is still multiplexing when more than 2 instances
>> are passed:
>> $ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
>>
>> I get 50% scheduling, only 2 out of 4 events scheduled at any time.
>>
>> There is nothing running on the sibling thread, so it should let me run with 4
>> instances as per your patch.
>
> Ah, I limited it to n/2 if either of the siblings has an exclusive event
> on.
>
But in my test case above, there was no exclusive event at all on either
sibling and yet it limited the non-excl to 2.
> In any case, there's at least two more bugs in that patch that I know
> of, one of which I've fixed, the other I know how to fix.
>
> Will re-post in a bit once I got it tested.
On Fri, May 22, 2015 at 05:55:32AM -0700, Stephane Eranian wrote:
> On Fri, May 22, 2015 at 5:53 AM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, May 22, 2015 at 05:35:14AM -0700, Stephane Eranian wrote:
> >> On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
> >> > On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
> >> >> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
> >> >>
> >> >> /* slow path */
> >> >> if (i != n) {
> >> >> + int gpmax = x86_pmu.num_counters / 2;
> >> >> +
> >> >> + /*
> >> >> + * Do not allow scheduling of more than half the available
> >> >> + * generic counters.
> >> >> + *
> >> >> + * This helps avoid counter starvation of sibling thread by
> >> >> + * ensuring at most half the counters cannot be in exclusive
> >> >> + * mode. There is no designated counters for the limits. Any
> >> >> + * N/2 counters can be used. This helps with events with
> >> >> + * specific counter constraints.
> >> >> + */
> >> >> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
> >> >> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
> >> >> + gpmax /= 2;
> >> >> +
> >> >> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
> >> >> - wmax, assign);
> >> >> + wmax, gpmax, assign);
> >> >> }
> >> >>
> >> >
> >> > Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
> >>
> >> Yes, that's what I said. Other problem is, with no watchdog, measuring
> >> a non-corrupting event is still multiplexing when more than 2 instances
> >> are passed:
> >> $ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
> >>
> >> I get 50% scheduling, only 2 out of 4 events scheduled at any time.
> >>
> >> There is nothing running on the sibling thread, so it should let me run with 4
> >> instances as per your patch.
> >
> > Ah, I limited it to n/2 if either of the siblings has an exclusive event
> > on.
> >
> But in my test case above, there was no exclusive event at all on either
> sibling and yet it limited the non-excl to 2.
I bet you tested the exclusive events earlier :-) It's one of the bugs:
the n_excl accounting is leaking up. Once it's !0 it stays !0.
On Fri, May 22, 2015 at 5:59 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 05:55:32AM -0700, Stephane Eranian wrote:
>> On Fri, May 22, 2015 at 5:53 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Fri, May 22, 2015 at 05:35:14AM -0700, Stephane Eranian wrote:
>> >> On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
>> >> > On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
>> >> >> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>> >> >>
>> >> >> /* slow path */
>> >> >> if (i != n) {
>> >> >> + int gpmax = x86_pmu.num_counters / 2;
>> >> >> +
>> >> >> + /*
>> >> >> + * Do not allow scheduling of more than half the available
>> >> >> + * generic counters.
>> >> >> + *
>> >> >> + * This helps avoid counter starvation of sibling thread by
>> >> >> + * ensuring at most half the counters cannot be in exclusive
>> >> >> + * mode. There is no designated counters for the limits. Any
>> >> >> + * N/2 counters can be used. This helps with events with
>> >> >> + * specific counter constraints.
>> >> >> + */
>> >> >> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
>> >> >> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
>> >> >> + gpmax /= 2;
>> >> >> +
>> >> >> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
>> >> >> - wmax, assign);
>> >> >> + wmax, gpmax, assign);
>> >> >> }
>> >> >>
>> >> >
>> >> > Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
>> >>
>> >> Yes, that's what I said. Other problem is, with no watchdog, measuring
>> >> a non-corrupting event is still multiplexing when more than 2 instances
>> >> are passed:
>> >> $ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
>> >>
>> >> I get 50% scheduling, only 2 out of 4 events scheduled at any time.
>> >>
>> >> There is nothing running on the sibling thread, so it should let me run with 4
>> >> instances as per your patch.
>> >
>> > Ah, I limited it to n/2 if either of the siblings has an exclusive event
>> > on.
>> >
>> But in my test case above, there was no exclusive event at all on either
>> sibling and yet it limited the non-excl to 2.
>
> I bet you tested the exclusive events earlier :-) Its one of the bugs,
> the n_excl accounting is leaking up. Once !0 it stays !0.
So you're saying intel_put_excl_constraints() does not do the --n_excl?
Could it be that the flags are not showing PERF_X86_EVENT_EXCL?
It cannot be related to cpuc->is_fake, because you have it in both the ++
and -- functions.
On Fri, May 22, 2015 at 6:05 AM, Stephane Eranian <[email protected]> wrote:
> On Fri, May 22, 2015 at 5:59 AM, Peter Zijlstra <[email protected]> wrote:
>> On Fri, May 22, 2015 at 05:55:32AM -0700, Stephane Eranian wrote:
>>> On Fri, May 22, 2015 at 5:53 AM, Peter Zijlstra <[email protected]> wrote:
>>> > On Fri, May 22, 2015 at 05:35:14AM -0700, Stephane Eranian wrote:
>>> >> On Fri, May 22, 2015 at 4:28 AM, Peter Zijlstra <[email protected]> wrote:
>>> >> > On Fri, May 22, 2015 at 01:21:46PM +0200, Peter Zijlstra wrote:
>>> >> >> @@ -821,8 +828,24 @@ int x86_schedule_events(struct cpu_hw_ev
>>> >> >>
>>> >> >> /* slow path */
>>> >> >> if (i != n) {
>>> >> >> + int gpmax = x86_pmu.num_counters / 2;
>>> >> >> +
>>> >> >> + /*
>>> >> >> + * Do not allow scheduling of more than half the available
>>> >> >> + * generic counters.
>>> >> >> + *
>>> >> >> + * This helps avoid counter starvation of sibling thread by
>>> >> >> + * ensuring at most half the counters cannot be in exclusive
>>> >> >> + * mode. There is no designated counters for the limits. Any
>>> >> >> + * N/2 counters can be used. This helps with events with
>>> >> >> + * specific counter constraints.
>>> >> >> + */
>>> >> >> + if (is_ht_workaround_enabled() && !cpuc->is_fake &&
>>> >> >> + READ_ONCE(cpuc->excl_cntrs->exclusive_present))
>>> >> >> + gpmax /= 2;
>>> >> >> +
>>> >> >> unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
>>> >> >> - wmax, assign);
>>> >> >> + wmax, gpmax, assign);
>>> >> >> }
>>> >> >>
>>> >> >
>>> >> > Hmm, I divide by 2 twice.. no wonder it doesn't quite work as expected.
>>> >>
>>> >> Yes, that's what I said. Other problem is, with no watchdog, measuring
>>> >> a non-corrupting event is still multiplexing when more than 2 instances
>>> >> are passed:
>>> >> $ perf stat -a -C 0 -e r20cc,r20cc,r20cc,r20cc -I 1000 sleep 100
>>> >>
>>> >> I get 50% scheduling, only 2 out of 4 events scheduled at any time.
>>> >>
>>> >> There is nothing running on the sibling thread, so it should let me run with 4
>>> >> instances as per your patch.
>>> >
>>> > Ah, I limited it to n/2 if either of the siblings has an exclusive event
>>> > on.
>>> >
>>> But in my test case above, there was no exclusive event at all on either
>>> sibling and yet it limited the non-excl to 2.
>>
>> I bet you tested the exclusive events earlier :-) Its one of the bugs,
>> the n_excl accounting is leaking up. Once !0 it stays !0.
>
> So you're saying intel_put_excl_constraint() does not do the --n_excl?
> Could it be that the flags is not showing PERF_X86_EVENT_EXCL?
> Cannot be related to cpuc_fake because you have it in both the ++
> and -- functions.
One other thing I noticed is that the --n_excl needs to be protected by the
excl_cntrs->lock in put_excl_constraints().
On Fri, May 22, 2015 at 02:59:08PM +0200, Peter Zijlstra wrote:
>
> I bet you tested the exclusive events earlier :-) Its one of the bugs,
> the n_excl accounting is leaking up. Once !0 it stays !0.
Ha! Also caused by my first patch.
It looks to be generally working. Let me update the Changelogs on the
lot and send out the stack again.
On Fri, May 22, 2015 at 06:05:49AM -0700, Stephane Eranian wrote:
> > I bet you tested the exclusive events earlier :-) Its one of the bugs,
> > the n_excl accounting is leaking up. Once !0 it stays !0.
>
> So you're saying intel_put_excl_constraint() does not do the --n_excl?
No, it does, but we call get_event_constraints() every time we do
x86_schedule_events(), and put_event_constraints() only on
x86_pmu_del(). This means we call get() much more than we put(),
and therefore n_excl leaks up.
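One way to stop the leak is to account each event at most once, by tagging
it the first time its constraint is taken and clearing the tag in the put
path. A sketch only; the PERF_X86_EVENT_EXCL_ACCT flag name is assumed for
illustration, not taken from the posted patch:

	/* in intel_get_excl_constraints(): account the event only once */
	if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
		event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
		if (!cpuc->n_excl++)
			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
	}

	/* in intel_put_excl_constraints(): undo it exactly once */
	if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
		hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
		if (!--cpuc->n_excl)
			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
	}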
On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
>
> One other thing I noticed is that the --n_excl needs to be protected by the
> excl_cntrs->lock in put_excl_constraints().
Nah, it's strictly per CPU.
On Fri, May 22, 2015 at 6:25 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
>>
>> One other thing I noticed is that the --n_excl needs to be protected by the
>> excl_cntrs->lock in put_excl_constraints().
>
> Nah, its strictly per cpu.
No. The excl_cntrs struct is pointed to by cpuc, but it is shared between the
sibling HTs. Otherwise this would not work!
On Fri, May 22, 2015 at 06:29:47AM -0700, Stephane Eranian wrote:
> On Fri, May 22, 2015 at 6:25 AM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
> >>
> >> One other thing I noticed is that the --n_excl needs to be protected by the
> >> excl_cntrs->lock in put_excl_constraints().
> >
> > Nah, its strictly per cpu.
>
> No. the excl_cntrs struct is pointed to by cpuc but it is shared between the
> sibling HT. Otherwise this would not work!
n_excl is per cpuc; see the trickery with has_exclusive vs.
exclusive_present for how I avoid the lock.
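For reference, a standalone sketch (not kernel code) of that aliasing: each
sibling only ever stores to its own 16-bit slot, and a single 32-bit load
tells the scheduler whether either sibling has exclusive events on. The
kernel additionally uses WRITE_ONCE/READ_ONCE for the cross-thread accesses.

#include <stdio.h>
#include <stdint.h>

union excl_flags {
	uint16_t has_exclusive[2];	/* one slot per HT thread id */
	uint32_t exclusive_present;	/* non-zero if either slot is set */
};

int main(void)
{
	union excl_flags f = { .exclusive_present = 0 };

	f.has_exclusive[1] = 1;		/* tid 1 schedules an exclusive event */

	if (f.exclusive_present)	/* either tid sees it with one 32-bit read */
		printf("exclusive event present on this core\n");

	return 0;
}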
On Fri, May 22, 2015 at 6:36 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 06:29:47AM -0700, Stephane Eranian wrote:
>> On Fri, May 22, 2015 at 6:25 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
>> >>
>> >> One other thing I noticed is that the --n_excl needs to be protected by the
>> >> excl_cntrs->lock in put_excl_constraints().
>> >
>> > Nah, its strictly per cpu.
>>
>> No. the excl_cntrs struct is pointed to by cpuc but it is shared between the
>> sibling HT. Otherwise this would not work!
>
> n_excl is per cpuc, see the trickery with has_exclusive vs
> exclusive_present on how I avoid the lock.
Yes, but I believe you create a store-forwarding penalty with this.
You store 16 bits and you load 32 bits on the same cache line.
On Fri, May 22, 2015 at 06:40:49AM -0700, Stephane Eranian wrote:
> On Fri, May 22, 2015 at 6:36 AM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, May 22, 2015 at 06:29:47AM -0700, Stephane Eranian wrote:
> >> On Fri, May 22, 2015 at 6:25 AM, Peter Zijlstra <[email protected]> wrote:
> >> > On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
> >> >>
> >> >> One other thing I noticed is that the --n_excl needs to be protected by the
> >> >> excl_cntrs->lock in put_excl_constraints().
> >> >
> >> > Nah, its strictly per cpu.
> >>
> >> No. the excl_cntrs struct is pointed to by cpuc but it is shared between the
> >> sibling HT. Otherwise this would not work!
> >
> > n_excl is per cpuc, see the trickery with has_exclusive vs
> > exclusive_present on how I avoid the lock.
>
> Yes, but I believe you create a store forward penalty with this.
> You store 16bits and you load 32 bits on the same cache line.
The store and load are fairly well spaced -- the entire scheduling fast
path is in between.
And such a penalty is still cheap compared to locking, no?
* Peter Zijlstra <[email protected]> wrote:
> On Fri, May 22, 2015 at 06:40:49AM -0700, Stephane Eranian wrote:
> > On Fri, May 22, 2015 at 6:36 AM, Peter Zijlstra <[email protected]> wrote:
> > > On Fri, May 22, 2015 at 06:29:47AM -0700, Stephane Eranian wrote:
> > >> On Fri, May 22, 2015 at 6:25 AM, Peter Zijlstra <[email protected]> wrote:
> > >> > On Fri, May 22, 2015 at 06:07:00AM -0700, Stephane Eranian wrote:
> > >> >>
> > >> >> One other thing I noticed is that the --n_excl needs to be protected by the
> > >> >> excl_cntrs->lock in put_excl_constraints().
> > >> >
> > >> > Nah, its strictly per cpu.
> > >>
> > >> No. the excl_cntrs struct is pointed to by cpuc but it is shared between the
> > >> sibling HT. Otherwise this would not work!
> > >
> > > n_excl is per cpuc, see the trickery with has_exclusive vs
> > > exclusive_present on how I avoid the lock.
> >
> > Yes, but I believe you create a store forward penalty with this.
> > You store 16bits and you load 32 bits on the same cache line.
Same-cacheline access alone has no such penalty: it only occurs if the partial
access is to the same word.
> The store and load are fairly well spaced -- the entire scheduling
> fast path is in between.
>
> And such a penalty is still cheap compared to locking, no?
The 'penalty' is essentially just a delay in the execution of the
load, if the store has not finished yet: typically less than 10
cycles, around 3 cycles on recent uarchs.
So it should not be a big issue if there's indeed so much code between
them - probably it's not even causing any delay anywhere.
Thanks,
Ingo