2009-06-01 07:53:45

by Paul Mackerras

Subject: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task

This abstracts out the code for locking the context associated with a
task. Because the context might get transferred from one task to
another concurrently, we have to check after locking the context that
it is still the right context for the task and retry if not. This
was open-coded in find_get_context() and perf_counter_init_task().

This adds a further function for pinning the context for a task, i.e.
marking it so it can't be transferred to another task. This adds a
'pin_count' field to struct perf_counter_context to indicate that a
context is pinned, instead of the previous method of setting the
parent_gen count to all 1s. Pinning the context with a pin_count is
easier to undo and doesn't require saving the parent_gen value. This
also adds a perf_unpin_context() to undo the effect of
perf_pin_task_context() and changes perf_counter_init_task to use it.
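As a rough usage sketch (modelled on perf_counter_init_task() in the diff below; error handling and everything between pin and unpin elided), a caller that needs a task's context to stay put does:

	struct perf_counter_context *ctx;

	ctx = perf_pin_task_context(task);	/* bumps pin_count, takes a reference */
	if (ctx) {
		/*
		 * Between these two calls ctx can neither be swapped to
		 * another task nor freed, so its counter lists can be
		 * walked without it changing underneath us.
		 */
		perf_unpin_context(ctx);	/* drops pin_count and the reference */
	}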

Signed-off-by: Paul Mackerras <[email protected]>
---
include/linux/perf_counter.h | 1 +
kernel/perf_counter.c | 128 ++++++++++++++++++++++++------------------
2 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 519a41b..81ec79c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -543,6 +543,7 @@ struct perf_counter_context {
struct perf_counter_context *parent_ctx;
u64 parent_gen;
u64 generation;
+ int pin_count;
struct rcu_head rcu_head;
};

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 79c3f26..da8dfef 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -123,6 +123,69 @@ static void put_ctx(struct perf_counter_context *ctx)
}

/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *perf_lock_task_context(
+ struct task_struct *task, unsigned long *flags)
+{
+ struct perf_counter_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ get_ctx(ctx);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
* Add a counter from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
{
return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
&& ctx1->parent_gen == ctx2->parent_gen
- && ctx1->parent_gen != ~0ull;
+ && !ctx1->pin_count && !ctx2->pin_count;
}

/*
@@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
struct perf_counter_context *ctx;
struct perf_counter_context *parent_ctx;
struct task_struct *task;
+ unsigned long flags;
int err;

/*
@@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto errout;

- retry_lock:
- rcu_read_lock();
retry:
- ctx = rcu_dereference(task->perf_counter_ctxp);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
- /*
- * If this context is a clone of another, it might
- * get swapped for another underneath us by
- * perf_counter_task_sched_out, though the
- * rcu_read_lock() protects us from any context
- * getting freed. Lock the context and check if it
- * got swapped before we could get the lock, and retry
- * if so. If we locked the right context, then it
- * can't get swapped on us any more and we can
- * unclone it if necessary.
- * Once it's not a clone things will be stable.
- */
- spin_lock_irq(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
- spin_unlock_irq(&ctx->lock);
- goto retry;
- }
parent_ctx = ctx->parent_ctx;
if (parent_ctx) {
put_ctx(parent_ctx);
@@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
* this context won't get freed if the task exits.
*/
get_ctx(ctx);
- spin_unlock_irq(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
- rcu_read_unlock();

if (!ctx) {
ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
@@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
* the context they set.
*/
kfree(ctx);
- goto retry_lock;
+ goto retry;
}
get_task_struct(task);
}
@@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child)
struct perf_counter *counter;
struct task_struct *parent = current;
int inherited_all = 1;
- u64 cloned_gen;
int ret = 0;

child->perf_counter_ctxp = NULL;
@@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child)
get_task_struct(child);

/*
- * If the parent's context is a clone, temporarily set its
- * parent_gen to an impossible value (all 1s) so it won't get
- * swapped under us. The rcu_read_lock makes sure that
- * parent_ctx continues to exist even if it gets swapped to
- * another process and then freed while we are trying to get
- * its lock.
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
*/
- rcu_read_lock();
- retry:
- parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+ parent_ctx = perf_pin_task_context(parent);
+
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
* is if we exit, and since we're currently in the middle of
* a fork we can't be exiting at the same time.
*/
- spin_lock_irq(&parent_ctx->lock);
- if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
- spin_unlock_irq(&parent_ctx->lock);
- goto retry;
- }
- cloned_gen = parent_ctx->parent_gen;
- if (parent_ctx->parent_ctx)
- parent_ctx->parent_gen = ~0ull;
- spin_unlock_irq(&parent_ctx->lock);
- rcu_read_unlock();

/*
* Lock the parent list. No need to lock the child - not PID
@@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child)
cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
- child_ctx->parent_gen = cloned_gen;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
} else {
child_ctx->parent_ctx = parent_ctx;
child_ctx->parent_gen = parent_ctx->generation;
@@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child)

mutex_unlock(&parent_ctx->mutex);

- /*
- * Restore the clone status of the parent.
- */
- if (parent_ctx->parent_ctx) {
- spin_lock_irq(&parent_ctx->lock);
- if (parent_ctx->parent_ctx)
- parent_ctx->parent_gen = cloned_gen;
- spin_unlock_irq(&parent_ctx->lock);
- }
+ perf_unpin_context(parent_ctx);

return ret;
}
--
1.6.0.4


2009-06-01 07:53:34

by Paul Mackerras

Subject: [PATCH] perf_counter: Remove unused prev_state field

This removes the prev_state field of struct perf_counter since it is
now unused. It was only used by the cpu migration counter, which
doesn't use it any more.

Signed-off-by: Paul Mackerras <[email protected]>
---
include/linux/perf_counter.h | 1 -
kernel/perf_counter.c | 4 ----
2 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e630602..82ab958 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -427,7 +427,6 @@ struct perf_counter {
const struct pmu *pmu;

enum perf_counter_active_state state;
- enum perf_counter_active_state prev_state;
atomic64_t count;

/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c7a20fa..d6d2f9d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -570,7 +570,6 @@ group_sched_in(struct perf_counter *group_counter,
if (ret)
return ret < 0 ? ret : 0;

- group_counter->prev_state = group_counter->state;
if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
return -EAGAIN;

@@ -578,7 +577,6 @@ group_sched_in(struct perf_counter *group_counter,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
- counter->prev_state = counter->state;
if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
partial_group = counter;
goto group_error;
@@ -655,7 +653,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
struct perf_counter_context *ctx)
{
list_add_counter(counter, ctx);
- counter->prev_state = PERF_COUNTER_STATE_OFF;
counter->tstamp_enabled = ctx->time;
counter->tstamp_running = ctx->time;
counter->tstamp_stopped = ctx->time;
@@ -818,7 +815,6 @@ static void __perf_counter_enable(void *info)
ctx->is_active = 1;
update_context_time(ctx);

- counter->prev_state = counter->state;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;
--
1.6.0.4

2009-06-01 07:53:57

by Paul Mackerras

Subject: [PATCH] perf_counter: Fix cpu migration counter

This fixes the cpu migration software counter to count correctly even
when contexts get swapped from one task to another. Previously the
cpu migration counts reported by perf stat were bogus, ranging from
negative to several thousand for a single "lat_ctx 2 8 32" run. With
this patch the cpu migration count reported for "lat_ctx 2 8 32" is
almost always between 35 and 44.

This fixes the problem by adding a call into the perf_counter code
from set_task_cpu when tasks are migrated. This enables us to use the
generic swcounter code (with some modifications) for the cpu migration
counter.

This modifies the swcounter code to allow a NULL regs pointer to be
passed in to perf_swcounter_ctx_event() etc. The cpu migration
counter does this because there isn't necessarily a pt_regs struct for
the task available. In this case, the counter will not have interrupt
capability - but the migration counter didn't have interrupt
capability before, so this is no loss.
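In outline, the new counting path added by the diff below looks like this (a condensed restatement of the patch, not additional code):

	/* Called from set_task_cpu() whenever a task moves to another cpu. */
	void perf_counter_task_migration(struct task_struct *task, int cpu)
	{
		struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
		struct perf_counter_context *ctx;

		/* Count the event on the destination cpu's per-cpu context... */
		perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
					 PERF_COUNT_CPU_MIGRATIONS, 1, 1, NULL, 0);

		/* ...and on the task's own context, pinned so it can't be
		 * swapped or freed underneath us.  regs is NULL here, so the
		 * counter has no interrupt capability on this path. */
		ctx = perf_pin_task_context(task);
		if (ctx) {
			perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
						 PERF_COUNT_CPU_MIGRATIONS,
						 1, 1, NULL, 0);
			perf_unpin_context(ctx);
		}
	}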

Signed-off-by: Paul Mackerras <[email protected]>
---
include/linux/perf_counter.h | 4 ++
kernel/perf_counter.c | 74 ++++++++++++------------------------------
kernel/sched.c | 1 +
3 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81ec79c..e630602 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,

extern void perf_counter_comm(struct task_struct *tsk);

+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
#define MAX_STACK_DEPTH 255

struct perf_callchain_entry {
@@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len,

static inline void perf_counter_comm(struct task_struct *tsk) { }
static inline void perf_counter_init(void) { }
+static inline void perf_counter_task_migration(struct task_struct *task,
+ int cpu) { }
#endif

#endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff8b463..c7a20fa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2919,11 +2919,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
if (counter->hw_event.config != event_config)
return 0;

- if (counter->hw_event.exclude_user && user_mode(regs))
- return 0;
+ if (regs) {
+ if (counter->hw_event.exclude_user && user_mode(regs))
+ return 0;

- if (counter->hw_event.exclude_kernel && !user_mode(regs))
- return 0;
+ if (counter->hw_event.exclude_kernel && !user_mode(regs))
+ return 0;
+ }

return 1;
}
@@ -2932,7 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
int nmi, struct pt_regs *regs, u64 addr)
{
int neg = atomic64_add_negative(nr, &counter->hw.count);
- if (counter->hw.irq_period && !neg)
+ if (counter->hw.irq_period && !neg && regs)
perf_swcounter_overflow(counter, nmi, regs, addr);
}

@@ -3148,55 +3150,24 @@ static const struct pmu perf_ops_task_clock = {
/*
* Software counter: cpu migrations
*/
-
-static inline u64 get_cpu_migrations(struct perf_counter *counter)
-{
- struct task_struct *curr = counter->ctx->task;
-
- if (curr)
- return curr->se.nr_migrations;
- return cpu_nr_migrations(smp_processor_id());
-}
-
-static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
-{
- u64 prev, now;
- s64 delta;
-
- prev = atomic64_read(&counter->hw.prev_count);
- now = get_cpu_migrations(counter);
-
- atomic64_set(&counter->hw.prev_count, now);
-
- delta = now - prev;
-
- atomic64_add(delta, &counter->count);
-}
-
-static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+void perf_counter_task_migration(struct task_struct *task, int cpu)
{
- cpu_migrations_perf_counter_update(counter);
-}
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx;

-static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
-{
- if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
- atomic64_set(&counter->hw.prev_count,
- get_cpu_migrations(counter));
- return 0;
-}
+ perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);

-static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-{
- cpu_migrations_perf_counter_update(counter);
+ ctx = perf_pin_task_context(task);
+ if (ctx) {
+ perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ perf_unpin_context(ctx);
+ }
}

-static const struct pmu perf_ops_cpu_migrations = {
- .enable = cpu_migrations_perf_counter_enable,
- .disable = cpu_migrations_perf_counter_disable,
- .read = cpu_migrations_perf_counter_read,
-};
-
#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
@@ -3269,11 +3240,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_PAGE_FAULTS_MIN:
case PERF_COUNT_PAGE_FAULTS_MAJ:
case PERF_COUNT_CONTEXT_SWITCHES:
- pmu = &perf_ops_generic;
- break;
case PERF_COUNT_CPU_MIGRATIONS:
- if (!counter->hw_event.exclude_kernel)
- pmu = &perf_ops_cpu_migrations;
+ pmu = &perf_ops_generic;
break;
}

diff --git a/kernel/sched.c b/kernel/sched.c
index ad079f0..a55c192 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
#endif
+ perf_counter_task_migration(p, new_cpu);
}
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
--
1.6.0.4

2009-06-01 07:54:15

by Paul Mackerras

Subject: [PATCH] perf_counter: Allow software counters to count while task is not running

This changes perf_swcounter_match() so that per-task software counters
can count events that occur while their associated task is not
running. This will allow us to use the generic software counter code
for counting task migrations, which can occur while the task is
not scheduled in.

To do this, we have to distinguish between the situations where the
counter is inactive because its task has been scheduled out, and those
where the counter is inactive because it is part of a group that was
not able to go on the PMU. In the former case we want the counter to
count, but not in the latter case. If the context is active, we have
the latter case. If the context is inactive then we need to know
whether the counter was counting when the context was last active,
which we can determine by comparing its ->tstamp_stopped timestamp
with the context's timestamp.

This also folds three checks in perf_swcounter_match, checking
perf_event_raw(), perf_event_type() and perf_event_id() individually,
into a single 64-bit comparison on counter->hw_event.config, as
an optimization.
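A minimal sketch of the folded check, with the names used in the diff below:

	/*
	 * hw_event.config packs the raw bit, the event type and the event id
	 * into one 64-bit value, so a single compare replaces the separate
	 * perf_event_raw()/perf_event_type()/perf_event_id() checks.
	 */
	u64 event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;

	if (counter->hw_event.config != event_config)
		return 0;	/* raw counter, or a different type/id */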

Signed-off-by: Paul Mackerras <[email protected]>
---
kernel/perf_counter.c | 48 ++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index da8dfef..ff8b463 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2867,20 +2867,56 @@ static void perf_swcounter_overflow(struct perf_counter *counter,

}

+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+ int count;
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ return 1;
+
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+ return 0;
+
+ /*
+ * If the counter is inactive, it could be just because
+ * its task is scheduled out, or because it's in a group
+ * which could not go on the PMU. We want to count in
+ * the first case but not the second. If the context is
+ * currently active then an inactive software counter must
+ * be the second case. If it's not currently active then
+ * we need to know whether the counter was active when the
+ * context was last active, which we can determine by
+ * comparing counter->tstamp_stopped with ctx->time.
+ *
+ * We are within an RCU read-side critical section,
+ * which protects the existence of *ctx.
+ */
+ ctx = counter->ctx;
+ spin_lock_irqsave(&ctx->lock, flags);
+ count = 1;
+ /* Re-check state now we have the lock */
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+ counter->ctx->is_active ||
+ counter->tstamp_stopped < ctx->time)
+ count = 0;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ return count;
+}
+
static int perf_swcounter_match(struct perf_counter *counter,
enum perf_event_types type,
u32 event, struct pt_regs *regs)
{
- if (counter->state != PERF_COUNTER_STATE_ACTIVE)
- return 0;
+ u64 event_config;

- if (perf_event_raw(&counter->hw_event))
- return 0;
+ event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;

- if (perf_event_type(&counter->hw_event) != type)
+ if (!perf_swcounter_is_counting(counter))
return 0;

- if (perf_event_id(&counter->hw_event) != event)
+ if (counter->hw_event.config != event_config)
return 0;

if (counter->hw_event.exclude_user && user_mode(regs))
--
1.6.0.4

2009-06-01 07:55:48

by Paul Mackerras

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task

Oops, forgot to put sequence numbers on these patches. They are:

[PATCH 1/4] perf_counter: Provide functions for locking and pinning the
context for a task
[PATCH 2/4] perf_counter: Allow software counters to count while task is
not running
[PATCH 3/4] perf_counter: Fix cpu migration counter
[PATCH 4/4] perf_counter: Remove unused prev_state field

Paul.

2009-06-01 08:19:40

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Provide functions for locking and pinning the context for a task

Commit-ID: 25346b93ca079080c9cb23331db5c4f6404e8530
Gitweb: http://git.kernel.org/tip/25346b93ca079080c9cb23331db5c4f6404e8530
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:48:12 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 1 Jun 2009 10:04:05 +0200

perf_counter: Provide functions for locking and pinning the context for a task

This abstracts out the code for locking the context associated
with a task. Because the context might get transferred from
one task to another concurrently, we have to check after
locking the context that it is still the right context for the
task and retry if not. This was open-coded in
find_get_context() and perf_counter_init_task().

This adds a further function for pinning the context for a
task, i.e. marking it so it can't be transferred to another
task. This adds a 'pin_count' field to struct
perf_counter_context to indicate that a context is pinned,
instead of the previous method of setting the parent_gen count
to all 1s. Pinning the context with a pin_count is easier to
undo and doesn't require saving the parent_gen value. This
also adds a perf_unpin_context() to undo the effect of
perf_pin_task_context() and changes perf_counter_init_task to
use it.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 1 +
kernel/perf_counter.c | 128 ++++++++++++++++++++++++------------------
2 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 519a41b..81ec79c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -543,6 +543,7 @@ struct perf_counter_context {
struct perf_counter_context *parent_ctx;
u64 parent_gen;
u64 generation;
+ int pin_count;
struct rcu_head rcu_head;
};

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 79c3f26..da8dfef 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -123,6 +123,69 @@ static void put_ctx(struct perf_counter_context *ctx)
}

/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *perf_lock_task_context(
+ struct task_struct *task, unsigned long *flags)
+{
+ struct perf_counter_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ get_ctx(ctx);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
* Add a counter from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
{
return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
&& ctx1->parent_gen == ctx2->parent_gen
- && ctx1->parent_gen != ~0ull;
+ && !ctx1->pin_count && !ctx2->pin_count;
}

/*
@@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
struct perf_counter_context *ctx;
struct perf_counter_context *parent_ctx;
struct task_struct *task;
+ unsigned long flags;
int err;

/*
@@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto errout;

- retry_lock:
- rcu_read_lock();
retry:
- ctx = rcu_dereference(task->perf_counter_ctxp);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
- /*
- * If this context is a clone of another, it might
- * get swapped for another underneath us by
- * perf_counter_task_sched_out, though the
- * rcu_read_lock() protects us from any context
- * getting freed. Lock the context and check if it
- * got swapped before we could get the lock, and retry
- * if so. If we locked the right context, then it
- * can't get swapped on us any more and we can
- * unclone it if necessary.
- * Once it's not a clone things will be stable.
- */
- spin_lock_irq(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
- spin_unlock_irq(&ctx->lock);
- goto retry;
- }
parent_ctx = ctx->parent_ctx;
if (parent_ctx) {
put_ctx(parent_ctx);
@@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
* this context won't get freed if the task exits.
*/
get_ctx(ctx);
- spin_unlock_irq(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
- rcu_read_unlock();

if (!ctx) {
ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
@@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
* the context they set.
*/
kfree(ctx);
- goto retry_lock;
+ goto retry;
}
get_task_struct(task);
}
@@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child)
struct perf_counter *counter;
struct task_struct *parent = current;
int inherited_all = 1;
- u64 cloned_gen;
int ret = 0;

child->perf_counter_ctxp = NULL;
@@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child)
get_task_struct(child);

/*
- * If the parent's context is a clone, temporarily set its
- * parent_gen to an impossible value (all 1s) so it won't get
- * swapped under us. The rcu_read_lock makes sure that
- * parent_ctx continues to exist even if it gets swapped to
- * another process and then freed while we are trying to get
- * its lock.
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
*/
- rcu_read_lock();
- retry:
- parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+ parent_ctx = perf_pin_task_context(parent);
+
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
* is if we exit, and since we're currently in the middle of
* a fork we can't be exiting at the same time.
*/
- spin_lock_irq(&parent_ctx->lock);
- if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
- spin_unlock_irq(&parent_ctx->lock);
- goto retry;
- }
- cloned_gen = parent_ctx->parent_gen;
- if (parent_ctx->parent_ctx)
- parent_ctx->parent_gen = ~0ull;
- spin_unlock_irq(&parent_ctx->lock);
- rcu_read_unlock();

/*
* Lock the parent list. No need to lock the child - not PID
@@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child)
cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
- child_ctx->parent_gen = cloned_gen;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
} else {
child_ctx->parent_ctx = parent_ctx;
child_ctx->parent_gen = parent_ctx->generation;
@@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child)

mutex_unlock(&parent_ctx->mutex);

- /*
- * Restore the clone status of the parent.
- */
- if (parent_ctx->parent_ctx) {
- spin_lock_irq(&parent_ctx->lock);
- if (parent_ctx->parent_ctx)
- parent_ctx->parent_gen = cloned_gen;
- spin_unlock_irq(&parent_ctx->lock);
- }
+ perf_unpin_context(parent_ctx);

return ret;
}

2009-06-01 08:19:53

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Allow software counters to count while task is not running

Commit-ID: 880ca15adf2392770a68047e7a98e076ff4d21da
Gitweb: http://git.kernel.org/tip/880ca15adf2392770a68047e7a98e076ff4d21da
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:49:14 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 1 Jun 2009 10:04:06 +0200

perf_counter: Allow software counters to count while task is not running

This changes perf_swcounter_match() so that per-task software
counters can count events that occur while their associated
task is not running. This will allow us to use the generic
software counter code for counting task migrations, which can
occur while the task is not scheduled in.

To do this, we have to distinguish between the situations where
the counter is inactive because its task has been scheduled
out, and those where the counter is inactive because it is part
of a group that was not able to go on the PMU. In the former
case we want the counter to count, but not in the latter case.
If the context is active, we have the latter case. If the
context is inactive then we need to know whether the counter
was counting when the context was last active, which we can
determine by comparing its ->tstamp_stopped timestamp with the
context's timestamp.

This also folds three checks in perf_swcounter_match, checking
perf_event_raw(), perf_event_type() and perf_event_id()
individually, into a single 64-bit comparison on
counter->hw_event.config, as an optimization.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
kernel/perf_counter.c | 48 ++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index da8dfef..ff8b463 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2867,20 +2867,56 @@ static void perf_swcounter_overflow(struct perf_counter *counter,

}

+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+ int count;
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ return 1;
+
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+ return 0;
+
+ /*
+ * If the counter is inactive, it could be just because
+ * its task is scheduled out, or because it's in a group
+ * which could not go on the PMU. We want to count in
+ * the first case but not the second. If the context is
+ * currently active then an inactive software counter must
+ * be the second case. If it's not currently active then
+ * we need to know whether the counter was active when the
+ * context was last active, which we can determine by
+ * comparing counter->tstamp_stopped with ctx->time.
+ *
+ * We are within an RCU read-side critical section,
+ * which protects the existence of *ctx.
+ */
+ ctx = counter->ctx;
+ spin_lock_irqsave(&ctx->lock, flags);
+ count = 1;
+ /* Re-check state now we have the lock */
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+ counter->ctx->is_active ||
+ counter->tstamp_stopped < ctx->time)
+ count = 0;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ return count;
+}
+
static int perf_swcounter_match(struct perf_counter *counter,
enum perf_event_types type,
u32 event, struct pt_regs *regs)
{
- if (counter->state != PERF_COUNTER_STATE_ACTIVE)
- return 0;
+ u64 event_config;

- if (perf_event_raw(&counter->hw_event))
- return 0;
+ event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;

- if (perf_event_type(&counter->hw_event) != type)
+ if (!perf_swcounter_is_counting(counter))
return 0;

- if (perf_event_id(&counter->hw_event) != event)
+ if (counter->hw_event.config != event_config)
return 0;

if (counter->hw_event.exclude_user && user_mode(regs))

2009-06-01 08:20:15

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Fix cpu migration counter

Commit-ID: ed058219223216eabe14c0692d6e51f032043bb9
Gitweb: http://git.kernel.org/tip/ed058219223216eabe14c0692d6e51f032043bb9
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:52:30 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 1 Jun 2009 10:04:07 +0200

perf_counter: Fix cpu migration counter

This fixes the cpu migration software counter to count
correctly even when contexts get swapped from one task to
another. Previously the cpu migration counts reported by perf
stat were bogus, ranging from negative to several thousand for
a single "lat_ctx 2 8 32" run. With this patch the cpu
migration count reported for "lat_ctx 2 8 32" is almost always
between 35 and 44.

This fixes the problem by adding a call into the perf_counter
code from set_task_cpu when tasks are migrated. This enables
us to use the generic swcounter code (with some modifications)
for the cpu migration counter.

This modifies the swcounter code to allow a NULL regs pointer
to be passed in to perf_swcounter_ctx_event() etc. The cpu
migration counter does this because there isn't necessarily a
pt_regs struct for the task available. In this case, the
counter will not have interrupt capability - but the migration
counter didn't have interrupt capability before, so this is no
loss.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 4 ++
kernel/perf_counter.c | 74 ++++++++++++------------------------------
kernel/sched.c | 1 +
3 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81ec79c..e630602 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,

extern void perf_counter_comm(struct task_struct *tsk);

+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
#define MAX_STACK_DEPTH 255

struct perf_callchain_entry {
@@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len,

static inline void perf_counter_comm(struct task_struct *tsk) { }
static inline void perf_counter_init(void) { }
+static inline void perf_counter_task_migration(struct task_struct *task,
+ int cpu) { }
#endif

#endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff8b463..c7a20fa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2919,11 +2919,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
if (counter->hw_event.config != event_config)
return 0;

- if (counter->hw_event.exclude_user && user_mode(regs))
- return 0;
+ if (regs) {
+ if (counter->hw_event.exclude_user && user_mode(regs))
+ return 0;

- if (counter->hw_event.exclude_kernel && !user_mode(regs))
- return 0;
+ if (counter->hw_event.exclude_kernel && !user_mode(regs))
+ return 0;
+ }

return 1;
}
@@ -2932,7 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
int nmi, struct pt_regs *regs, u64 addr)
{
int neg = atomic64_add_negative(nr, &counter->hw.count);
- if (counter->hw.irq_period && !neg)
+ if (counter->hw.irq_period && !neg && regs)
perf_swcounter_overflow(counter, nmi, regs, addr);
}

@@ -3148,55 +3150,24 @@ static const struct pmu perf_ops_task_clock = {
/*
* Software counter: cpu migrations
*/
-
-static inline u64 get_cpu_migrations(struct perf_counter *counter)
-{
- struct task_struct *curr = counter->ctx->task;
-
- if (curr)
- return curr->se.nr_migrations;
- return cpu_nr_migrations(smp_processor_id());
-}
-
-static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
-{
- u64 prev, now;
- s64 delta;
-
- prev = atomic64_read(&counter->hw.prev_count);
- now = get_cpu_migrations(counter);
-
- atomic64_set(&counter->hw.prev_count, now);
-
- delta = now - prev;
-
- atomic64_add(delta, &counter->count);
-}
-
-static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+void perf_counter_task_migration(struct task_struct *task, int cpu)
{
- cpu_migrations_perf_counter_update(counter);
-}
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx;

-static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
-{
- if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
- atomic64_set(&counter->hw.prev_count,
- get_cpu_migrations(counter));
- return 0;
-}
+ perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);

-static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-{
- cpu_migrations_perf_counter_update(counter);
+ ctx = perf_pin_task_context(task);
+ if (ctx) {
+ perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ perf_unpin_context(ctx);
+ }
}

-static const struct pmu perf_ops_cpu_migrations = {
- .enable = cpu_migrations_perf_counter_enable,
- .disable = cpu_migrations_perf_counter_disable,
- .read = cpu_migrations_perf_counter_read,
-};
-
#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
@@ -3269,11 +3240,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_PAGE_FAULTS_MIN:
case PERF_COUNT_PAGE_FAULTS_MAJ:
case PERF_COUNT_CONTEXT_SWITCHES:
- pmu = &perf_ops_generic;
- break;
case PERF_COUNT_CPU_MIGRATIONS:
- if (!counter->hw_event.exclude_kernel)
- pmu = &perf_ops_cpu_migrations;
+ pmu = &perf_ops_generic;
break;
}

diff --git a/kernel/sched.c b/kernel/sched.c
index ad079f0..a55c192 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
#endif
+ perf_counter_task_migration(p, new_cpu);
}
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;

2009-06-01 08:20:33

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Remove unused prev_state field

Commit-ID: 4a3b3ef5ba929d2698794984bf9c9b02244c49cd
Gitweb: http://git.kernel.org/tip/4a3b3ef5ba929d2698794984bf9c9b02244c49cd
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:53:16 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 1 Jun 2009 10:04:07 +0200

perf_counter: Remove unused prev_state field

This removes the prev_state field of struct perf_counter since
it is now unused. It was only used by the cpu migration
counter, which doesn't use it any more.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 1 -
kernel/perf_counter.c | 4 ----
2 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e630602..82ab958 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -427,7 +427,6 @@ struct perf_counter {
const struct pmu *pmu;

enum perf_counter_active_state state;
- enum perf_counter_active_state prev_state;
atomic64_t count;

/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c7a20fa..d6d2f9d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -570,7 +570,6 @@ group_sched_in(struct perf_counter *group_counter,
if (ret)
return ret < 0 ? ret : 0;

- group_counter->prev_state = group_counter->state;
if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
return -EAGAIN;

@@ -578,7 +577,6 @@ group_sched_in(struct perf_counter *group_counter,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
- counter->prev_state = counter->state;
if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
partial_group = counter;
goto group_error;
@@ -655,7 +653,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
struct perf_counter_context *ctx)
{
list_add_counter(counter, ctx);
- counter->prev_state = PERF_COUNTER_STATE_OFF;
counter->tstamp_enabled = ctx->time;
counter->tstamp_running = ctx->time;
counter->tstamp_stopped = ctx->time;
@@ -818,7 +815,6 @@ static void __perf_counter_enable(void *info)
ctx->is_active = 1;
update_context_time(ctx);

- counter->prev_state = counter->state;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;

2009-06-01 16:21:36

by Ingo Molnar

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task


Paul,

-tip testing found that these patches are causing a crash:

[ 24.995999] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 24.995999] IP: [<ffffffff810bd645>] perf_swcounter_ctx_event+0x1f4/0x26c
[ 24.995999] PGD 3b555067 PUD 3b5a1067 PMD 0
[ 24.995999] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
[ 24.995999] last sysfs file: /sys/devices/system/cpu/cpu1/online
[ 24.995999] CPU 0
[ 24.995999] Modules linked in:
[ 24.995999] Pid: 3217, comm: S99local Tainted: G W 2.6.30-rc7-tip-01947-gd584b05-dirty #47106 System Product Name
[ 24.995999] RIP: 0010:[<ffffffff810bd645>] [<ffffffff810bd645>] perf_swcounter_ctx_event+0x1f4/0x26c
[ 24.995999] RSP: 0018:ffff88003b595be8 EFLAGS: 00010202
[ 24.995999] RAX: 0100000000000000 RBX: fffffffffffffff0 RCX: ffff88003ccd30e0
[ 24.995999] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000246
[ 24.995999] RBP: ffff88003b595c48 R08: 0000000000000004 R09: 0000000000000000
[ 24.995999] R10: 00000000e73bd78f R11: ffffffff81fe10b0 R12: 0000000000000004
[ 24.995999] R13: 0000000000000001 R14: 0000000000000000 R15: 0100000000000004
[ 24.995999] FS: 00007f1896b066f0(0000) GS:ffff880004a00000(0000) knlGS:0000000000000000
[ 24.995999] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 24.995999] CR2: 0000000000000000 CR3: 000000003d1f6000 CR4: 00000000000006f0
[ 24.995999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 24.995999] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 24.995999] Process S99local (pid: 3217, threadinfo ffff88003b594000, task ffff88003ccd3000)
[ 24.995999] Stack:
[ 24.995999] ffffffff810bd4b2 0000000000000000 0000000104bd6280 0000000000000001
[ 24.995999] ffff880004dae3b8 ffffffff8108010f 00000000e73bd78f ffff88003b5c8000
[ 24.995999] ffff88003b5c8000 00000005d1e0b069 0000000000000001 ffff880004db4280
[ 24.995999] Call Trace:
[ 24.995999] [<ffffffff810bd4b2>] ? perf_swcounter_ctx_event+0x61/0x26c
[ 24.995999] [<ffffffff8108010f>] ? trace_hardirqs_on_caller+0x12b/0x16d
[ 24.995999] [<ffffffff810bd70f>] perf_counter_task_migration+0x52/0xad
[ 24.995999] [<ffffffff810486f3>] set_task_cpu+0x174/0x1cb
[ 24.995999] [<ffffffff8106ce51>] kthread_bind+0x4b/0x9d
[ 24.995999] [<ffffffff8164bd42>] migration_call+0x76/0x50a
[ 24.995999] [<ffffffff810519f4>] ? cpu_hotplug_begin+0x35/0x77
[ 24.995999] [<ffffffff81656757>] notifier_call_chain+0x6d/0xb5
[ 24.995999] [<ffffffff810723e6>] __raw_notifier_call_chain+0x1c/0x32
[ 24.995999] [<ffffffff8164c3b5>] cpu_up+0xde/0x1b9
[ 24.995999] [<ffffffff8161ba84>] store_online+0x5a/0x9a
[ 24.995999] [<ffffffff812cb9e1>] sysdev_store+0x2e/0x44
[ 24.995999] [<ffffffff81159f39>] sysfs_write_file+0xf3/0x13c
[ 24.995999] [<ffffffff810fb410>] ? rw_verify_area+0x97/0xd1
[ 24.995999] [<ffffffff810fbdda>] vfs_write+0xbe/0x130
[ 24.995999] [<ffffffff810fbf42>] sys_write+0x56/0x93
[ 24.995999] [<ffffffff8100b502>] system_call_fastpath+0x16/0x1b
[ 24.995999] Code: 4c 89 f2 48 89 df 48 f7 d8 48 89 83 28 01 00 00 48 89 83 b8 00 00 00 48 8b 4d a8 8b 75 b4 e8 ad f8 ff ff 48 8b 43 10 48 8d 58 f0 <48> 8b 43 10 0f 18 08 48 8d 43 10 48 39 45 c0 0f 85 98 fe ff ff
[ 24.995999] RIP [<ffffffff810bd645>] perf_swcounter_ctx_event+0x1f4/0x26c

config attached.

Ingo



2009-06-01 21:38:29

by Paul Mackerras

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task

Ingo Molnar writes:

> -tip testing found that these patches are causing a crash:

Argh. What was userspace doing at the time of the crash, do you know?
It looks like it might have been on/off-lining a cpu, judging by the
stack trace. Would there have been any counters attached to the task
doing that, or were there any per-cpu counters active?

Paul.

2009-06-01 23:14:42

by Ingo Molnar

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task


* Paul Mackerras <[email protected]> wrote:

> Ingo Molnar writes:
>
> > -tip testing found that these patches are causing a crash:
>
> Argh. What was userspace doing at the time of the crash, do you
> know? It looks like it might have been on/off-lining a cpu,
> judging by the stack trace. Would there have been any counters
> attached to the task doing that, or were there any per-cpu
> counters active?

Yeah, indeed that box has a CPU hotplug testcase - sets cpu1 to
offline then online.

There should be no counters active anywhere during that.

Ingo

2009-06-02 05:37:48

by Paul Mackerras

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task

Ingo Molnar writes:

> Yeah, indeed that box has a CPU hotplug testcase - sets cpu1 to
> offline then online.
>
> There should be no counters active anywhere during that.

OK, I can't reproduce this on powerpc. I guess you have dynamic
per-cpu patches in there, and per-cpu areas are getting reinitialized
when cpus come up. That, combined with the fact that the
migration_notifier in kernel/sched.c puts itself at priority 10, means
that we're getting a call to perf_counter_task_migration() for a
newly-added CPU before perf_cpu_notify() has been called for that CPU,
and so we're trying to use an uninitialized perf_cpu_context and we go
boom.
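For context, callbacks on a CPU notifier chain run in descending priority order, so the ordering relied on here is roughly the following (a sketch: the sched.c side is paraphrased from the description above, the perf side is the change proposed below):

	/* kernel/sched.c (existing; registers the hotplug callback seen in
	 * the backtrace above at priority 10) */
	static struct notifier_block migration_notifier = {
		.notifier_call	= migration_call,
		.priority	= 10,
	};

	/* kernel/perf_counter.c (proposed): a higher priority makes
	 * perf_cpu_notify() run first on CPU_UP, so the new cpu's
	 * perf_cpu_context is initialized before migration_call() can cause
	 * perf_counter_task_migration() to be called on that cpu. */
	static struct notifier_block __cpuinitdata perf_cpu_nb = {
		.notifier_call	= perf_cpu_notify,
		.priority	= 20,
	};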

Could you try the same test with this patch? If this fixes it, then
that's what the problem is. It's up to you whether increasing the
priority on perf_cpu_nb is the right solution or whether we should
solve the problem some other way.

Paul.

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3902,8 +3902,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
return NOTIFY_OK;
}

+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
.notifier_call = perf_cpu_notify,
+ .priority = 20,
};

void __init perf_counter_init(void)

2009-06-02 07:56:57

by Ingo Molnar

Subject: Re: [PATCH] perf_counter: Provide functions for locking and pinning the context for a task


* Paul Mackerras <[email protected]> wrote:

> Ingo Molnar writes:
>
> > Yeah, indeed that box has a CPU hotplug testcase - sets cpu1 to
> > offline then online.
> >
> > There should be no counters active anywhere during that.
>
> OK, I can't reproduce this on powerpc. I guess you have dynamic
> per-cpu patches in there, and per-cpu areas are getting
> reinitialized when cpus come up. That, combined with the fact
> that the migration_notifier in kernel/sched.c puts itself at
> priority 10, means that we're getting a call to
> perf_counter_task_migration() for a newly-added CPU before
> perf_cpu_notify() has been called for that CPU, and so we're
> trying to use an uninitialized perf_cpu_context and we go boom.

Sounds very plausible.

> Could you try the same test with this patch? If this fixes it,
> then that's what the problem is. It's up to you whether
> increasing the priority on perf_cpu_nb is the right solution or
> whether we should solve the problem some other way.
>
> Paul.
>
> diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
> --- a/kernel/perf_counter.c
> +++ b/kernel/perf_counter.c
> @@ -3902,8 +3902,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
> return NOTIFY_OK;
> }
>
> +/*
> + * This has to have a higher priority than migration_notifier in sched.c.
> + */
> static struct notifier_block __cpuinitdata perf_cpu_nb = {
> .notifier_call = perf_cpu_notify,
> + .priority = 20,
> };

Makes sense. Mind doing a full patch with a changelog, and with a
comment that explains what the priority rules are? Perhaps add a
comment to the counterpart in sched.c too.

Ingo

2009-06-02 14:19:37

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Fix cpu migration counter

Commit-ID: 3f731ca60afc29f5bcdb5fd2a04391466313a9ac
Gitweb: http://git.kernel.org/tip/3f731ca60afc29f5bcdb5fd2a04391466313a9ac
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:52:30 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 2 Jun 2009 13:10:54 +0200

perf_counter: Fix cpu migration counter

This fixes the cpu migration software counter to count
correctly even when contexts get swapped from one task to
another. Previously the cpu migration counts reported by perf
stat were bogus, ranging from negative to several thousand for
a single "lat_ctx 2 8 32" run. With this patch the cpu
migration count reported for "lat_ctx 2 8 32" is almost always
between 35 and 44.

This fixes the problem by adding a call into the perf_counter
code from set_task_cpu when tasks are migrated. This enables
us to use the generic swcounter code (with some modifications)
for the cpu migration counter.

This modifies the swcounter code to allow a NULL regs pointer
to be passed in to perf_swcounter_ctx_event() etc. The cpu
migration counter does this because there isn't necessarily a
pt_regs struct for the task available. In this case, the
counter will not have interrupt capability - but the migration
counter didn't have interrupt capability before, so this is no
loss.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 4 ++
kernel/perf_counter.c | 74 ++++++++++++------------------------------
kernel/sched.c | 1 +
3 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e57d8c..deb9acf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,

extern void perf_counter_comm(struct task_struct *tsk);

+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
#define MAX_STACK_DEPTH 255

struct perf_callchain_entry {
@@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len,

static inline void perf_counter_comm(struct task_struct *tsk) { }
static inline void perf_counter_init(void) { }
+static inline void perf_counter_task_migration(struct task_struct *task,
+ int cpu) { }
#endif

#endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d2653f..cd94cf3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
if (counter->hw_event.config != event_config)
return 0;

- if (counter->hw_event.exclude_user && user_mode(regs))
- return 0;
+ if (regs) {
+ if (counter->hw_event.exclude_user && user_mode(regs))
+ return 0;

- if (counter->hw_event.exclude_kernel && !user_mode(regs))
- return 0;
+ if (counter->hw_event.exclude_kernel && !user_mode(regs))
+ return 0;
+ }

return 1;
}
@@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
{
int neg = atomic64_add_negative(nr, &counter->hw.count);

- if (counter->hw.irq_period && !neg)
+ if (counter->hw.irq_period && !neg && regs)
perf_swcounter_overflow(counter, nmi, regs, addr);
}

@@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = {
/*
* Software counter: cpu migrations
*/
-
-static inline u64 get_cpu_migrations(struct perf_counter *counter)
-{
- struct task_struct *curr = counter->ctx->task;
-
- if (curr)
- return curr->se.nr_migrations;
- return cpu_nr_migrations(smp_processor_id());
-}
-
-static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
-{
- u64 prev, now;
- s64 delta;
-
- prev = atomic64_read(&counter->hw.prev_count);
- now = get_cpu_migrations(counter);
-
- atomic64_set(&counter->hw.prev_count, now);
-
- delta = now - prev;
-
- atomic64_add(delta, &counter->count);
-}
-
-static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+void perf_counter_task_migration(struct task_struct *task, int cpu)
{
- cpu_migrations_perf_counter_update(counter);
-}
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx;

-static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
-{
- if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
- atomic64_set(&counter->hw.prev_count,
- get_cpu_migrations(counter));
- return 0;
-}
+ perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);

-static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-{
- cpu_migrations_perf_counter_update(counter);
+ ctx = perf_pin_task_context(task);
+ if (ctx) {
+ perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+ PERF_COUNT_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ perf_unpin_context(ctx);
+ }
}

-static const struct pmu perf_ops_cpu_migrations = {
- .enable = cpu_migrations_perf_counter_enable,
- .disable = cpu_migrations_perf_counter_disable,
- .read = cpu_migrations_perf_counter_read,
-};
-
#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
@@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
case PERF_COUNT_PAGE_FAULTS_MIN:
case PERF_COUNT_PAGE_FAULTS_MAJ:
case PERF_COUNT_CONTEXT_SWITCHES:
- pmu = &perf_ops_generic;
- break;
case PERF_COUNT_CPU_MIGRATIONS:
- if (!counter->hw_event.exclude_kernel)
- pmu = &perf_ops_cpu_migrations;
+ pmu = &perf_ops_generic;
break;
}

diff --git a/kernel/sched.c b/kernel/sched.c
index 3226cc1..8d43347 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
#endif
+ perf_counter_task_migration(p, new_cpu);
}
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;

2009-06-02 14:19:48

by Paul Mackerras

Subject: [tip:perfcounters/core] perf_counter: Remove unused prev_state field

Commit-ID: bf4e0ed3d027ce581be18496036862131b5f32aa
Gitweb: http://git.kernel.org/tip/bf4e0ed3d027ce581be18496036862131b5f32aa
Author: Paul Mackerras <[email protected]>
AuthorDate: Mon, 1 Jun 2009 17:53:16 +1000
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 2 Jun 2009 13:10:55 +0200

perf_counter: Remove unused prev_state field

This removes the prev_state field of struct perf_counter since
it is now unused. It was only used by the cpu migration
counter, which doesn't use it any more.

Signed-off-by: Paul Mackerras <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: John Kacur <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 1 -
kernel/perf_counter.c | 4 ----
2 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index deb9acf..d970fbc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -427,7 +427,6 @@ struct perf_counter {
const struct pmu *pmu;

enum perf_counter_active_state state;
- enum perf_counter_active_state prev_state;
atomic64_t count;

/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cd94cf3..fbed4d2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter,
if (ret)
return ret < 0 ? ret : 0;

- group_counter->prev_state = group_counter->state;
if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
return -EAGAIN;

@@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
- counter->prev_state = counter->state;
if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
partial_group = counter;
goto group_error;
@@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
struct perf_counter_context *ctx)
{
list_add_counter(counter, ctx);
- counter->prev_state = PERF_COUNTER_STATE_OFF;
counter->tstamp_enabled = ctx->time;
counter->tstamp_running = ctx->time;
counter->tstamp_stopped = ctx->time;
@@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info)
ctx->is_active = 1;
update_context_time(ctx);

- counter->prev_state = counter->state;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;