2009-06-19 11:54:48

by Peter Zijlstra

Subject: [tip:perfcounters/core] perf_counter: Simplify and fix task migration counting

Commit-ID: e5289d4a181fb6c0b7a7607649af2ffdc491335c
Gitweb: http://git.kernel.org/tip/e5289d4a181fb6c0b7a7607649af2ffdc491335c
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 19 Jun 2009 13:22:51 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 19 Jun 2009 13:43:12 +0200

perf_counter: Simplify and fix task migration counting

The task migrations counter was causing rare and hard to decipher
memory corruptions under load. After a day of debugging and bisection
we found that the problem was introduced with:

3f731ca: perf_counter: Fix cpu migration counter

Turning them off fixes the crashes. Incidentally, the whole
perf_counter_task_migration() logic can be done more simply as well,
by injecting a proper sw-counter event.

This cleanup also fixed the crashes. The precise failure mode is
not completely clear yet, but we are clearly not unhappy about
having a fix ;-)

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Corey Ashford <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <[email protected]>


---
 include/linux/perf_counter.h |    4 ----
 kernel/perf_counter.c        |   23 +----------------------
 kernel/sched.c               |    3 ++-
 3 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e7e7e02..89698d8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -682,8 +682,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
-extern void perf_counter_task_migration(struct task_struct *task, int cpu);
-
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
@@ -724,8 +722,6 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) { }
 static inline void perf_counter_comm(struct task_struct *tsk) { }
 static inline void perf_counter_fork(struct task_struct *tsk) { }
 static inline void perf_counter_init(void) { }
-static inline void perf_counter_task_migration(struct task_struct *task,
-                                               int cpu) { }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7e9108e..8d4f0dd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -124,7 +124,7 @@ void perf_enable(void)

 static void get_ctx(struct perf_counter_context *ctx)
 {
-        atomic_inc(&ctx->refcount);
+        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
 static void free_ctx(struct rcu_head *head)
@@ -3467,27 +3467,6 @@ static const struct pmu perf_ops_task_clock = {
         .read           = task_clock_perf_counter_read,
 };
 
-/*
- * Software counter: cpu migrations
- */
-void perf_counter_task_migration(struct task_struct *task, int cpu)
-{
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-        struct perf_counter_context *ctx;
-
-        perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
-                                 PERF_COUNT_SW_CPU_MIGRATIONS,
-                                 1, 1, NULL, 0);
-
-        ctx = perf_pin_task_context(task);
-        if (ctx) {
-                perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
-                                         PERF_COUNT_SW_CPU_MIGRATIONS,
-                                         1, 1, NULL, 0);
-                perf_unpin_context(ctx);
-        }
-}
-
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fb88a9..f46540b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1978,7 +1978,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
 #endif
-                perf_counter_task_migration(p, new_cpu);
+                perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+                                     1, 1, NULL, 0);
         }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;


2009-06-19 11:59:48

by Peter Zijlstra

Subject: Re: [tip:perfcounters/core] perf_counter: Simplify and fix task migration counting

On Fri, 2009-06-19 at 11:52 +0000, tip-bot for Peter Zijlstra wrote:
> [... commit message snipped; quoted in full above ...]
>
> This cleanup also fixed the crashes. The precise failure mode is
> not completely clear yet, but we are clearly not unhappy about
> having a fix ;-)


I actually do know what happens:

static struct perf_counter_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
        struct perf_counter_context *ctx;

        rcu_read_lock();
retry:
        ctx = rcu_dereference(task->perf_counter_ctxp);
        if (ctx) {
                spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
                        spin_unlock_irqrestore(&ctx->lock, *flags);
                        goto retry;
                }
        }
        rcu_read_unlock();
        return ctx;
}


static struct perf_counter_context *
perf_pin_task_context(struct task_struct *task)
{
        struct perf_counter_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                get_ctx(ctx);
                spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

This is buggy because perf_lock_task_context() can return a dead context.

The RCU read lock in perf_lock_task_context() only guarantees that the
memory won't get freed; it doesn't guarantee that the object is still
valid (in our case refcount > 0).

Therefore we can return a locked object that can get freed the moment we
release the RCU read lock.

perf_pin_task_context() then increases the refcount and does an unlock
on freed memory.

That increased refcount will cause a double free if the refcount had
already dropped to 0.
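
Spelled out as one possible interleaving (a hand-drawn sketch, not real
code; free_ctx() is the RCU callback that frees the context):

        CPU 0 (perf_pin_task_context)        CPU 1 (last reference holder)
        -----------------------------        -----------------------------
        rcu_read_lock()
        ctx = rcu_dereference(...)
        spin_lock_irqsave(&ctx->lock)
        rcu_read_unlock()
                                             put_ctx(): refcount 1 -> 0
                                             call_rcu(&ctx->rcu_head, free_ctx)
                                             /* grace period ends */
                                             free_ctx(): memory freed
        ++ctx->pin_count                     /* write to freed memory */
        get_ctx(ctx)                         /* refcount 0 -> 1 */
        spin_unlock_irqrestore(&ctx->lock)   /* unlock of freed lock */
        ...
        put_ctx(ctx)                         /* 1 -> 0 again: second free */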


2009-06-19 12:24:45

by Paul Mackerras

Subject: Re: [tip:perfcounters/core] perf_counter: Simplify and fix task migration counting

Peter Zijlstra writes:

> [... perf_pin_task_context() snipped ...]
>
> This is buggy because perf_lock_task_context() can return a dead context.
>
> The RCU read lock in perf_lock_task_context() only guarantees that the
> memory won't get freed; it doesn't guarantee that the object is still
> valid (in our case refcount > 0).
>
> Therefore we can return a locked object that can get freed the moment we
> release the RCU read lock.
>
> perf_pin_task_context() then increases the refcount and does an unlock
> on freed memory.
>
> That increased refcount will cause a double free if the refcount had
> already dropped to 0.

Wow, good catch! Thanks for finding that.

Paul.

2009-06-19 12:26:25

by Peter Zijlstra

Subject: Re: [tip:perfcounters/core] perf_counter: Simplify and fix task migration counting

On Fri, 2009-06-19 at 13:59 +0200, Peter Zijlstra wrote:
> On Fri, 2009-06-19 at 11:52 +0000, tip-bot for Peter Zijlstra wrote:
> > [... commit message snipped ...]
>
> I actually do know what happens:
>
> [... perf_lock_task_context() / perf_pin_task_context() snipped ...]
>
> This is buggy because perf_lock_task_context() can return a dead context.
>
> Therefore we can return a locked object that can get freed the moment we
> release the RCU read lock.
>
> perf_pin_task_context() then increases the refcount and does an unlock
> on freed memory.
>
> That increased refcount will cause a double free if the refcount had
> already dropped to 0.
>

Maybe something like so..

---
 kernel/perf_counter.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7e9108e..923189e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -175,6 +175,11 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
                         spin_unlock_irqrestore(&ctx->lock, *flags);
                         goto retry;
                 }
+
+                if (!atomic_inc_not_zero(&ctx->refcount)) {
+                        spin_unlock_irqrestore(&ctx->lock, *flags);
+                        ctx = NULL;
+                }
         }
         rcu_read_unlock();
         return ctx;
@@ -193,7 +198,6 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta
         ctx = perf_lock_task_context(task, &flags);
         if (ctx) {
                 ++ctx->pin_count;
-                get_ctx(ctx);
                 spin_unlock_irqrestore(&ctx->lock, flags);
         }
         return ctx;
@@ -1459,11 +1463,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
                         put_ctx(parent_ctx);
                         ctx->parent_ctx = NULL;         /* no longer a clone */
                 }
-                /*
-                 * Get an extra reference before dropping the lock so that
-                 * this context won't get freed if the task exits.
-                 */
-                get_ctx(ctx);
                 spin_unlock_irqrestore(&ctx->lock, flags);
         }
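
The essential change: the reference is now taken with atomic_inc_not_zero()
while ctx->lock is held and the RCU read-side section is still live, so a
context whose refcount has already hit zero is refused instead of
resurrected. It's the usual RCU + refcount lookup pattern; roughly (a
minimal sketch using a made-up struct obj, not the perf code):

struct obj {
        atomic_t                refcount;
        struct rcu_head         rcu_head;
};

/*
 * RCU only keeps the memory from being recycled; it says nothing about
 * object liveness.  So the refcount must be validated with
 * atomic_inc_not_zero() *before* leaving the RCU read-side section.
 */
static struct obj *obj_get(struct obj **slot)
{
        struct obj *obj;

        rcu_read_lock();
        obj = rcu_dereference(*slot);
        if (obj && !atomic_inc_not_zero(&obj->refcount))
                obj = NULL;             /* found it, but it's already dying */
        rcu_read_unlock();

        return obj;                     /* caller owns a reference, or NULL */
}

(In the patch the increment additionally happens under ctx->lock, after the
recheck against task->perf_counter_ctxp, so a successful return means the
context is both alive and still current.)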

2009-06-19 12:39:47

by Peter Zijlstra

Subject: Re: [tip:perfcounters/core] perf_counter: Simplify and fix task migration counting

On Fri, 2009-06-19 at 22:24 +1000, Paul Mackerras wrote:
> Peter Zijlstra writes:
>
> > [... analysis snipped ...]
>
> Wow, good catch! Thanks for finding that.

The clue that got me started was this scribble:

[ 123.800172] =============================================================================
[ 123.808725] BUG kmalloc-512: Poison overwritten
[ 123.813422] -----------------------------------------------------------------------------
[ 123.813424]
[ 123.823390] INFO: 0xffff8801bd546480-0xffff8801bd546588. First byte 0x6c instead of 0x6b
[ 123.831777] INFO: Allocated in perf_counter_init_task+0x8d/0x262 age=351 cpu=1 pid=12276
[ 123.840047] INFO: Freed in free_ctx+0x10/0x12 age=256 cpu=1 pid=12669
[ 123.846702] INFO: Slab 0xffffea0009914f60 objects=28 used=14 fp=0xffff8801bd546480 flags=0x2000000000040c3
[ 123.856710] INFO: Object 0xffff8801bd546480 @offset=9344 fp=0xffff8801bd546fe8
[ 123.856712]
[ 123.865782] Bytes b4 0xffff8801bd546470: 31 3b fd ff 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a 1;<FD><FF>....ZZZZZZZZ
[ 123.876163] Object 0xffff8801bd546480: 6c 6c 6b 6b 6b 6b 6b 6b ff ff ff ff 6b 6b 6b 6b llkkkkkk<FF><FF><FF><FF>kkkk
[ 123.886447] Object 0xffff8801bd546490: ff ff ff ff ff ff ff ff 6b 6b 6b 6b 6b 6b 6b 6b <FF><FF><FF><FF><FF><FF><FF><FF>kkkkkkkk
[ 123.896810] Object 0xffff8801bd5464a0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
[ 123.907162] Object 0xffff8801bd5464b0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
[ 123.917421] Object 0xffff8801bd5464c0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
[ 123.927837] Object 0xffff8801bd5464d0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
[ 123.938147] Object 0xffff8801bd5464e0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
[ 123.948398] Object 0xffff8801bd5464f0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk

And from pahole (thanks acme!) we find that the only relevant perf
object in that bucket is:

struct perf_counter_context {
        spinlock_t                    lock;               /*     0    64 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct mutex                  mutex;              /*    64   152 */
        /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
        struct list_head              counter_list;       /*   216    16 */
        struct list_head              event_list;         /*   232    16 */
        int                           nr_counters;        /*   248     4 */
        int                           nr_active;          /*   252     4 */
        /* --- cacheline 4 boundary (256 bytes) --- */
        int                           is_active;          /*   256     4 */
        atomic_t                      refcount;           /*   260     4 */
        struct task_struct *          task;               /*   264     8 */
        u64                           time;               /*   272     8 */
        u64                           timestamp;          /*   280     8 */
        struct perf_counter_context * parent_ctx;         /*   288     8 */
        u64                           parent_gen;         /*   296     8 */
        u64                           generation;         /*   304     8 */
        int                           pin_count;          /*   312     4 */

        /* XXX 4 bytes hole, try to pack */

        /* --- cacheline 5 boundary (320 bytes) --- */
        struct rcu_head               rcu_head;           /*   320    16 */

        /* size: 336, cachelines: 6 */
        /* sum members: 332, holes: 1, sum holes: 4 */
        /* last cacheline: 16 bytes */
};

typedef struct {
        raw_spinlock_t                raw_lock;           /*     0     4 */
        unsigned int                  magic;              /*     4     4 */
        unsigned int                  owner_cpu;          /*     8     4 */

        /* XXX 4 bytes hole, try to pack */

        void *                        owner;              /*    16     8 */
        struct lockdep_map            dep_map;            /*    24    40 */
        /* --- cacheline 1 boundary (64 bytes) --- */

        /* size: 64, cachelines: 1 */
        /* sum members: 60, holes: 1, sum holes: 4 */
} spinlock_t;   /* definitions: 1 */


Which tells us that we had a lock/unlock pair (6b -> 6c) plus an owner and
owner_cpu scribble, exactly as done by:

static inline void debug_spin_unlock(spinlock_t *lock)
{
        SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
        SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked");
        SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
        SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
                                                        lock, "wrong CPU");
        lock->owner = SPINLOCK_OWNER_INIT;
        lock->owner_cpu = -1;
}
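
Lining the object dump up against that layout makes the scribble readable
(a rough annotation; SPINLOCK_OWNER_INIT is ((void *)-1L), hence the 0xff
runs):

0xffff8801bd546480:  6c 6c 6b 6b 6b 6b 6b 6b  ff ff ff ff 6b 6b 6b 6b
                     ^^^^^                    ^^^^^^^^^^^
                     raw_lock touched by      owner_cpu = -1
                     lock/unlock (6b -> 6c)   (magic and the hole keep
                                              their 6b poison)
0xffff8801bd546490:  ff ff ff ff ff ff ff ff  6b 6b 6b 6b 6b 6b 6b 6b
                     ^^^^^^^^^^^^^^^^^^^^^^^
                     owner = SPINLOCK_OWNER_INIT

These are precisely the stores debug_spin_unlock() makes, landing in
already-freed, poisoned memory.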

The SPIN_BUG_ON() failed to trigger in this particular case because
earlier corruption took out lockdep.