2015-05-21 11:23:33

by Peter Zijlstra

Subject: [PATCH 03/10] perf/x86: Correct local vs remote sibling state

For some obscure reason the current code accounts the current SMT
thread's state on the remote thread and reads the remote's state on
the local SMT thread.

While internally consistent, and 'correct', it's pointless confusion we
can do without.

Flip them the right way around.
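
To illustrate the new direction (a simplified, self-contained sketch
with made-up names, not the kernel code): each thread now records what
it itself schedules in its own slot, and the constraint side consults
the sibling's slot:

enum excl_state { EXCL_UNUSED, EXCL_SHARED, EXCL_EXCLUSIVE };

struct excl_states {
        enum excl_state state[4];       /* one entry per counter */
};

static struct excl_states states[2];    /* one slot per SMT sibling */

/* update side: record what the *local* thread runs, in its own slot */
static void record_counter(int tid, int cntr, int is_excl)
{
        states[tid].state[cntr] = is_excl ? EXCL_EXCLUSIVE : EXCL_SHARED;
}

/* read side: consult the *sibling's* slot when testing a counter */
static int counter_usable(int tid, int cntr, int is_excl)
{
        enum excl_state sib = states[tid ^ 1].state[cntr];

        if (sib == EXCL_EXCLUSIVE)
                return 0;
        if (is_excl && sib == EXCL_SHARED)
                return 0;
        return 1;
}

The old code used the opposite convention: the update side wrote to
states[1 - tid] and the read side looked at states[tid]. The visible
behaviour is the same either way; only the bookkeeping direction
differs.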

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 69 +++++++++++++--------------------
1 file changed, 28 insertions(+), 41 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1903,9 +1903,8 @@ static void
intel_start_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */

/*
* nothing needed if in group validation mode
@@ -1919,7 +1918,6 @@ intel_start_scheduling(struct cpu_hw_eve
if (!excl_cntrs)
return;

- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];

xl->sched_started = true;
@@ -1932,18 +1930,17 @@ intel_start_scheduling(struct cpu_hw_eve
raw_spin_lock(&excl_cntrs->lock);

/*
- * save initial state of sibling thread
+ * Save a copy of our state to work on.
*/
- memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+ memcpy(xl->init_state, xl->state, sizeof(xl->init_state));
}

static void
intel_stop_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */

/*
* nothing needed if in group validation mode
@@ -1956,13 +1953,12 @@ intel_stop_scheduling(struct cpu_hw_even
if (!excl_cntrs)
return;

- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];

/*
- * make new sibling thread state visible
+ * Commit the working state.
*/
- memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+ memcpy(xl->state, xl->init_state, sizeof(xl->state));

xl->sched_started = false;
/*
@@ -1977,10 +1973,9 @@ intel_get_excl_constraints(struct cpu_hw
{
struct event_constraint *cx;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
- int is_excl, i;
+ struct intel_excl_states *xlo;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* alternate */
+ int is_excl, i;

/*
* validating a group does not require
@@ -1994,18 +1989,6 @@ intel_get_excl_constraints(struct cpu_hw
*/
if (!excl_cntrs)
return c;
- /*
- * event requires exclusive counter access
- * across HT threads
- */
- is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
- /*
- * xl = state of current HT
- * xlo = state of sibling HT
- */
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];

cx = c;

@@ -2049,6 +2032,17 @@ intel_get_excl_constraints(struct cpu_hw
*/

/*
+ * state of sibling HT
+ */
+ xlo = &excl_cntrs->states[tid ^ 1];
+
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ /*
* Modify static constraint with current dynamic
* state of thread
*
@@ -2062,14 +2056,14 @@ intel_get_excl_constraints(struct cpu_hw
* our corresponding counter cannot be used
* regardless of our event
*/
- if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
__clear_bit(i, cx->idxmsk);
/*
* if measuring an exclusive event, sibling
* measuring non-exclusive, then counter cannot
* be used
*/
- if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
__clear_bit(i, cx->idxmsk);
}

@@ -2119,10 +2113,9 @@ static void intel_put_excl_constraints(s
{
struct hw_perf_event *hwc = &event->hw;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xlo, *xl;
- unsigned long flags = 0; /* keep compiler happy */
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
+ struct intel_excl_states *xl;
+ unsigned long flags = 0; /* keep compiler happy */

/*
* nothing needed if in group validation mode
@@ -2136,7 +2129,6 @@ static void intel_put_excl_constraints(s
return;

xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];

/*
* put_constraint may be called from x86_schedule_events()
@@ -2151,7 +2143,7 @@ static void intel_put_excl_constraints(s
* counter state as unused now
*/
if (hwc->idx >= 0)
- xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+ xl->state[hwc->idx] = INTEL_EXCL_UNUSED;

if (!xl->sched_started)
raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
@@ -2190,16 +2182,12 @@ static void intel_commit_scheduling(stru
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
struct event_constraint *c = cpuc->event_constraint[idx];
- struct intel_excl_states *xlo, *xl;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
- int is_excl;

if (cpuc->is_fake || !c)
return;

- is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
return;

@@ -2209,15 +2197,14 @@ static void intel_commit_scheduling(stru
return;

xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];

WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));

if (cntr >= 0) {
- if (is_excl)
- xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ if (c->flags & PERF_X86_EVENT_EXCL)
+ xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
else
- xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+ xl->init_state[cntr] = INTEL_EXCL_SHARED;
}
}



2015-05-21 13:31:31

by Stephane Eranian

Subject: Re: [PATCH 03/10] perf/x86: Correct local vs remote sibling state

On Thu, May 21, 2015 at 4:17 AM, Peter Zijlstra <[email protected]> wrote:
> For some obscure reason the current code accounts the current SMT
> thread's state on the remote thread and reads the remote's state on
> the local SMT thread.
>
> While internally consistent, and 'correct', it's pointless confusion we
> can do without.
>
> Flip them the right way around.
>
So you are changing the logic here from:

* EXCLUSIVE: sibling counter measuring exclusive event
* SHARED: sibling counter measuring non-exclusive event
* UNUSED: sibling counter unused

to:

* EXCLUSIVE: current thread is using an exclusive event
* SHARED: current thread is using a non-exclusive event
* UNUSED: current thread is not using this counter

I am okay with this; I just need to make sure there were no
assumptions made about that. I will look.

> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event_intel.c | 69 +++++++++++++--------------------
> 1 file changed, 28 insertions(+), 41 deletions(-)
>
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1903,9 +1903,8 @@ static void
> intel_start_scheduling(struct cpu_hw_events *cpuc)
> {
> struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
> - struct intel_excl_states *xl, *xlo;
> + struct intel_excl_states *xl;
> int tid = cpuc->excl_thread_id;
> - int o_tid = 1 - tid; /* sibling thread */
>
> /*
> * nothing needed if in group validation mode
> @@ -1919,7 +1918,6 @@ intel_start_scheduling(struct cpu_hw_eve
> if (!excl_cntrs)
> return;
>
> - xlo = &excl_cntrs->states[o_tid];
> xl = &excl_cntrs->states[tid];
>
> xl->sched_started = true;
> @@ -1932,18 +1930,17 @@ intel_start_scheduling(struct cpu_hw_eve
> raw_spin_lock(&excl_cntrs->lock);
>
> /*
> - * save initial state of sibling thread
> + * Save a copy of our state to work on.
> */
> - memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
> + memcpy(xl->init_state, xl->state, sizeof(xl->init_state));
> }
>
> static void
> intel_stop_scheduling(struct cpu_hw_events *cpuc)
> {
> struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
> - struct intel_excl_states *xl, *xlo;
> + struct intel_excl_states *xl;
> int tid = cpuc->excl_thread_id;
> - int o_tid = 1 - tid; /* sibling thread */
>
> /*
> * nothing needed if in group validation mode
> @@ -1956,13 +1953,12 @@ intel_stop_scheduling(struct cpu_hw_even
> if (!excl_cntrs)
> return;
>
> - xlo = &excl_cntrs->states[o_tid];
> xl = &excl_cntrs->states[tid];
>
> /*
> - * make new sibling thread state visible
> + * Commit the working state.
> */
> - memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
> + memcpy(xl->state, xl->init_state, sizeof(xl->state));
>
> xl->sched_started = false;
> /*
> @@ -1977,10 +1973,9 @@ intel_get_excl_constraints(struct cpu_hw
> {
> struct event_constraint *cx;
> struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
> - struct intel_excl_states *xl, *xlo;
> - int is_excl, i;
> + struct intel_excl_states *xlo;
> int tid = cpuc->excl_thread_id;
> - int o_tid = 1 - tid; /* alternate */
> + int is_excl, i;
>
> /*
> * validating a group does not require
> @@ -1994,18 +1989,6 @@ intel_get_excl_constraints(struct cpu_hw
> */
> if (!excl_cntrs)
> return c;
> - /*
> - * event requires exclusive counter access
> - * across HT threads
> - */
> - is_excl = c->flags & PERF_X86_EVENT_EXCL;
> -
> - /*
> - * xl = state of current HT
> - * xlo = state of sibling HT
> - */
> - xl = &excl_cntrs->states[tid];
> - xlo = &excl_cntrs->states[o_tid];
>
> cx = c;
>
> @@ -2049,6 +2032,17 @@ intel_get_excl_constraints(struct cpu_hw
> */
>
> /*
> + * state of sibling HT
> + */
> + xlo = &excl_cntrs->states[tid ^ 1];
> +
> + /*
> + * event requires exclusive counter access
> + * across HT threads
> + */
> + is_excl = c->flags & PERF_X86_EVENT_EXCL;
> +
> + /*
> * Modify static constraint with current dynamic
> * state of thread
> *
> @@ -2062,14 +2056,14 @@ intel_get_excl_constraints(struct cpu_hw
> * our corresponding counter cannot be used
> * regardless of our event
> */
> - if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
> + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
> __clear_bit(i, cx->idxmsk);
> /*
> * if measuring an exclusive event, sibling
> * measuring non-exclusive, then counter cannot
> * be used
> */
> - if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
> + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
> __clear_bit(i, cx->idxmsk);
> }
>
> @@ -2119,10 +2113,9 @@ static void intel_put_excl_constraints(s
> {
> struct hw_perf_event *hwc = &event->hw;
> struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
> - struct intel_excl_states *xlo, *xl;
> - unsigned long flags = 0; /* keep compiler happy */
> int tid = cpuc->excl_thread_id;
> - int o_tid = 1 - tid;
> + struct intel_excl_states *xl;
> + unsigned long flags = 0; /* keep compiler happy */
>
> /*
> * nothing needed if in group validation mode
> @@ -2136,7 +2129,6 @@ static void intel_put_excl_constraints(s
> return;
>
> xl = &excl_cntrs->states[tid];
> - xlo = &excl_cntrs->states[o_tid];
>
> /*
> * put_constraint may be called from x86_schedule_events()
> @@ -2151,7 +2143,7 @@ static void intel_put_excl_constraints(s
> * counter state as unused now
> */
> if (hwc->idx >= 0)
> - xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
> + xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
>
> if (!xl->sched_started)
> raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
> @@ -2190,16 +2182,12 @@ static void intel_commit_scheduling(stru
> {
> struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
> struct event_constraint *c = cpuc->event_constraint[idx];
> - struct intel_excl_states *xlo, *xl;
> + struct intel_excl_states *xl;
> int tid = cpuc->excl_thread_id;
> - int o_tid = 1 - tid;
> - int is_excl;
>
> if (cpuc->is_fake || !c)
> return;
>
> - is_excl = c->flags & PERF_X86_EVENT_EXCL;
> -
> if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
> return;
>
> @@ -2209,15 +2197,14 @@ static void intel_commit_scheduling(stru
> return;
>
> xl = &excl_cntrs->states[tid];
> - xlo = &excl_cntrs->states[o_tid];
>
> WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
>
> if (cntr >= 0) {
> - if (is_excl)
> - xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
> + if (c->flags & PERF_X86_EVENT_EXCL)
> + xl->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
> else
> - xlo->init_state[cntr] = INTEL_EXCL_SHARED;
> + xl->init_state[cntr] = INTEL_EXCL_SHARED;
> }
> }
>
>
>

2015-05-21 14:10:40

by Peter Zijlstra

Subject: Re: [PATCH 03/10] perf/x86: Correct local vs remote sibling state

On Thu, May 21, 2015 at 06:31:25AM -0700, Stephane Eranian wrote:
> On Thu, May 21, 2015 at 4:17 AM, Peter Zijlstra <[email protected]> wrote:
> > For some obscure reason the current code accounts the current SMT
> > thread's state on the remote thread and reads the remote's state on
> > the local SMT thread.
> >
> > While internally consistent, and 'correct', it's pointless confusion we
> > can do without.
> >
> > Flip them the right way around.
> >
> So you are changing the logic here from:
>
> * EXCLUSIVE: sibling counter measuring exclusive event
> * SHARED : sibling counter measuring non-exclusive event
> * UNUSED : sibling counter unused
>
> to:
>
> * EXCLUSIVE: current thread is using an exclusive event
> * SHARED: current thread is using a non-exclusive event
> * UNUSED: current thread is not using this counter
>
> I am okay with this; I just need to make sure there were no
> assumptions made about that. I will look.

Right; and when we construct the constraint mask we look at the other
thread's state too. So I flipped things around on both the update and
the read side.

And that is really the only thing that matters: that you look at the
sibling thread's state when constructing that mask. And that's kept
invariant.
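
As a rough sketch of what "both sides" means after the flip
(hypothetical, simplified names; not the actual kernel code), the
update side now works on and commits its own slot, while the read side
keeps looking at the sibling's committed state:

#include <string.h>

struct excl_states {
        int state[4];           /* committed; what the sibling reads */
        int init_state[4];      /* scratch copy used while scheduling */
};

static struct excl_states states[2];    /* one slot per SMT sibling */

static void start_scheduling(int tid)
{
        /* take a private working copy of *our own* committed state */
        memcpy(states[tid].init_state, states[tid].state,
               sizeof(states[tid].init_state));
}

static void stop_scheduling(int tid)
{
        /* commit the working copy so the sibling can observe it */
        memcpy(states[tid].state, states[tid].init_state,
               sizeof(states[tid].state));
}

The constraint path still reads states[tid ^ 1].state, so each thread
always bases its counter mask on the sibling's committed usage; that
cross-thread view is the invariant that stays the same.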