2009-06-25 19:44:36

by Peter Zijlstra

Subject: [tip:perfcounters/urgent] perf_counter: Implement more accurate per task statistics

Commit-ID: bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2
Gitweb: http://git.kernel.org/tip/bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 24 Jun 2009 21:11:59 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 25 Jun 2009 21:39:07 +0200

perf_counter: Implement more accurate per task statistics

With the introduction of PERF_EVENT_READ we have the
possibility to provide accurate counter values for
individual tasks in a task hierarchy.

However, due to the lazy context switching used for similar
counter contexts, our current per-task counts are way off.

In order to maintain some of the lazy switch benefits we
don't disable it outright, but simply iterate the active
counters and flip the values between the contexts.

This only reads the counters but does not need to reprogram
the full PMU.

Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <[email protected]>


---
include/linux/perf_counter.h | 4 ++-
kernel/perf_counter.c | 83 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6a384f0..de70a10 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -178,8 +178,9 @@ struct perf_counter_attr {
mmap : 1, /* include mmap data */
comm : 1, /* include comm data */
freq : 1, /* use freq, not period */
+ inherit_stat : 1, /* per task counts */

- __reserved_1 : 53;
+ __reserved_1 : 52;

__u32 wakeup_events; /* wakeup every n events */
__u32 __reserved_2;
@@ -602,6 +603,7 @@ struct perf_counter_context {
int nr_counters;
int nr_active;
int is_active;
+ int nr_stat;
atomic_t refcount;
struct task_struct *task;

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a72c20e..385ca51 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)

list_add_rcu(&counter->event_entry, &ctx->event_list);
ctx->nr_counters++;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat++;
}

/*
@@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
if (list_empty(&counter->list_entry))
return;
ctx->nr_counters--;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat--;

list_del_init(&counter->list_entry);
list_del_rcu(&counter->event_entry);
@@ -1006,6 +1010,76 @@ static int context_equiv(struct perf_counter_context *ctx1,
&& !ctx1->pin_count && !ctx2->pin_count;
}

+static void __perf_counter_read(void *counter);
+
+static void __perf_counter_sync_stat(struct perf_counter *counter,
+ struct perf_counter *next_counter)
+{
+ u64 value;
+
+ if (!counter->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the counter value, we cannot use perf_counter_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the counter must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (counter->state) {
+ case PERF_COUNTER_STATE_ACTIVE:
+ __perf_counter_read(counter);
+ break;
+
+ case PERF_COUNTER_STATE_INACTIVE:
+ update_counter_times(counter);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the counter
+ * values when we flip the contexts.
+ */
+ value = atomic64_read(&next_counter->count);
+ value = atomic64_xchg(&counter->count, value);
+ atomic64_set(&next_counter->count, value);
+
+ /*
+ * XXX also sync time_enabled and time_running ?
+ */
+}
+
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_counter_sync_stat(struct perf_counter_context *ctx,
+ struct perf_counter_context *next_ctx)
+{
+ struct perf_counter *counter, *next_counter;
+
+ if (!ctx->nr_stat)
+ return;
+
+ counter = list_first_entry(&ctx->event_list,
+ struct perf_counter, event_entry);
+
+ next_counter = list_first_entry(&next_ctx->event_list,
+ struct perf_counter, event_entry);
+
+ while (&counter->event_entry != &ctx->event_list &&
+ &next_counter->event_entry != &next_ctx->event_list) {
+
+ __perf_counter_sync_stat(counter, next_counter);
+
+ counter = list_next_entry(counter, event_entry);
+ next_counter = list_next_entry(counter, event_entry);
+ }
+}
+
/*
* Called from scheduler to remove the counters of the current task,
* with interrupts disabled.
@@ -1061,6 +1135,8 @@ void perf_counter_task_sched_out(struct task_struct *task,
ctx->task = next;
next_ctx->task = task;
do_switch = 0;
+
+ perf_counter_sync_stat(ctx, next_ctx);
}
spin_unlock(&next_ctx->lock);
spin_unlock(&ctx->lock);
@@ -1350,7 +1426,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
/*
* Cross CPU call to read the hardware counter
*/
-static void __read(void *info)
+static void __perf_counter_read(void *info)
{
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
@@ -1372,7 +1448,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
*/
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
smp_call_function_single(counter->oncpu,
- __read, counter, 1);
+ __perf_counter_read, counter, 1);
} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
update_counter_times(counter);
}
@@ -4050,7 +4126,8 @@ static void sync_child_counter(struct perf_counter *child_counter,
struct perf_counter *parent_counter = child_counter->parent;
u64 child_val;

- perf_counter_read_event(child_counter, child);
+ if (child_counter->attr.inherit_stat)
+ perf_counter_read_event(child_counter, child);

child_val = atomic64_read(&child_counter->count);


2009-06-26 11:10:27

by Peter Zijlstra

Subject: [RFC][PATCH] perf_counter: Complete counter swap.

On Thu, 2009-06-25 at 19:43 +0000, tip-bot for Peter Zijlstra wrote:

> +static void __perf_counter_sync_stat(struct perf_counter *counter,
> + struct perf_counter *next_counter)
> +{
> + u64 value;
> +
> + if (!counter->attr.inherit_stat)
> + return;
> +
> + /*
> + * Update the counter value, we cannot use perf_counter_read()
> + * because we're in the middle of a context switch and have IRQs
> + * disabled, which upsets smp_call_function_single(), however
> + * we know the counter must be on the current CPU, therefore we
> + * don't need to use it.
> + */
> + switch (counter->state) {
> + case PERF_COUNTER_STATE_ACTIVE:
> + __perf_counter_read(counter);
> + break;
> +
> + case PERF_COUNTER_STATE_INACTIVE:
> + update_counter_times(counter);
> + break;
> +
> + default:
> + break;
> + }
> +
> + /*
> + * In order to keep per-task stats reliable we need to flip the counter
> + * values when we flip the contexts.
> + */
> + value = atomic64_read(&next_counter->count);
> + value = atomic64_xchg(&counter->count, value);
> + atomic64_set(&next_counter->count, value);
> +
> + /*
> + * XXX also sync time_enabled and time_running ?
> + */
> +}

Right, so I convinced myself we indeed want to swap the times as well,
and realized we need to update the userpage after modifying these
counters.

Then again, since inherited counters tend to wander around,
self-monitoring mmap() + inherit is bound to be broken... hmm?

Do we want to fix that or shall we simply say: don't do that then!

Paul?

---
Subject: perf_counter: Complete counter swap.

Complete the counter swap by indeed switching the times too and
updating the userpage after modifying the counter values.

Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/perf_counter.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f2f2326..66ab1e9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1048,9 +1048,14 @@ static void __perf_counter_sync_stat(struct perf_counter *counter,
value = atomic64_xchg(&counter->count, value);
atomic64_set(&next_counter->count, value);

+ swap(counter->total_time_enabled, next_counter->total_time_enabled);
+ swap(counter->total_time_running, next_counter->total_time_running);
+
/*
- * XXX also sync time_enabled and time_running ?
+ * Since we swizzled the values, update the user visible data too.
*/
+ perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(next_counter);
}

#define list_next_entry(pos, member) \
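
The user-visible reason for swapping the times as well: with
PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING in
attr.read_format, a read() returns the count together with both times, and
tools use them to scale the count whenever the counter was time-shared on
the PMU. A sketch of that consumer side (struct layout assumed to follow
the read_format ordering):

#include <stdint.h>
#include <unistd.h>

struct read_times {		/* value, TIME_ENABLED, TIME_RUNNING */
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

static uint64_t read_scaled(int fd)
{
	struct read_times r;

	if (read(fd, &r, sizeof(r)) != sizeof(r))
		return 0;

	/* counter was not always on the pmu: scale the raw count up */
	if (r.time_running && r.time_running < r.time_enabled)
		return (uint64_t)((double)r.value *
				  r.time_enabled / r.time_running);

	return r.value;
}

If the count is swapped between the two contexts but the times are not, a
consumer like this ends up scaling one task's count by the other task's
runtime, which is why the patch swaps total_time_enabled and
total_time_running along with the count.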

2009-06-26 12:44:59

by Paul Mackerras

Subject: Re: [RFC][PATCH] perf_counter: Complete counter swap.

Peter Zijlstra writes:

> Right, so I convinced myself we indeed want to swap the times as well,
> and realized we need to update the userpage after modifying these
> counters.
>
> Then again, since inherited counters tend to wander around,
> self-monitoring mmap() + inherit is bound to be broken... hmm?
>
> Do we want to fix that or shall we simply say: don't do that then!

We only swap the contexts around if all the counters in both contexts
have been inherited, i.e. neither context is a top-level parent
context. Now I had been going to say that a counter that had been
inherited couldn't be used for self-monitoring, and I think that is
actually true, but I guess the problem is that the child could have
inherited the fd and the mmap too, but the mmap will be on the parent
counter, not the inherited child counter. And there's no way for the
child (or anyone) to mmap the inherited counter since there's no fd
for it.

Currently we don't do anything to stop people trying to read the
counters directly when they're not self-monitoring - they just get
bogus data. Maybe we should put the tid of the task the counter's on
into the first page of the mmap so that userspace can at least check
if it's the task being monitored.
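
For reference, the usual self-monitoring read looks something like the
sketch below (x86 only, and assuming the lock/index/offset fields of
struct perf_counter_mmap_page, with an index of 0 taken to mean "not
currently on the PMU" -- check the headers for the exact convention).
Since rdpmc reads a counter on the CPU the *reader* is running on,
anyone other than the monitored task really does get garbage:

#include <linux/perf_counter.h>

#define barrier()	asm volatile("" ::: "memory")

/* read hardware counter idx on the current cpu */
static unsigned long long rdpmc(unsigned int idx)
{
	unsigned int lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
	return (unsigned long long)hi << 32 | lo;
}

static unsigned long long
mmap_read_self(volatile struct perf_counter_mmap_page *pc)
{
	unsigned long long count;
	unsigned int seq, idx;

	do {
		seq = pc->lock;		/* seqcount, bumped by the kernel */
		barrier();

		idx   = pc->index;
		count = pc->offset;
		if (idx)		/* assumed: hw index is idx - 1 */
			count += rdpmc(idx - 1);

		barrier();
	} while (pc->lock != seq);	/* retry if the kernel updated it */

	return count;
}

This is also the data that the perf_counter_update_userpage() calls in the
patch above republish after the counts have been swizzled.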

Unless you can see some way to change the mmap on fork to point to the
child counter rather than the parent... Which would possibly be a bit
nasty anyway since then the child's address space wouldn't be a clone of
the parent's like you would expect after a fork.

Paul.

2009-06-26 15:53:29

by Peter Zijlstra

Subject: [tip:perfcounters/urgent] perf_counter: Complete counter swap

Commit-ID: 19d2e755436054dfc2be640bffc32e427c37ac3d
Gitweb: http://git.kernel.org/tip/19d2e755436054dfc2be640bffc32e427c37ac3d
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 26 Jun 2009 13:10:23 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 26 Jun 2009 17:48:54 +0200

perf_counter: Complete counter swap

Complete the counter swap by indeed switching the times too and
updating the userpage after modifying the counter values.

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Paul Mackerras <[email protected]>
LKML-Reference: <1246014623.31755.195.camel@twins>
Signed-off-by: Ingo Molnar <[email protected]>


---
kernel/perf_counter.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f2f2326..66ab1e9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1048,9 +1048,14 @@ static void __perf_counter_sync_stat(struct perf_counter *counter,
value = atomic64_xchg(&counter->count, value);
atomic64_set(&next_counter->count, value);

+ swap(counter->total_time_enabled, next_counter->total_time_enabled);
+ swap(counter->total_time_running, next_counter->total_time_running);
+
/*
- * XXX also sync time_enabled and time_running ?
+ * Since we swizzled the values, update the user visible data too.
*/
+ perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(next_counter);
}

#define list_next_entry(pos, member) \