From: Kan Liang <[email protected]>
The overhead of perf per-process monitoring increases rapidly with the
number of events and the number of CPUs.
Here is some data from an overhead test on a Skylake server with 64
logical CPUs. The elapsed time of AIM7 is used to measure the overhead.
perf record -e $event_list -p $pid #$pid is the pid of AIM7
Elapsed time A: elapsed time of AIM7
Elapsed time B: elapsed time of AIM7 while perf is running in parallel.
Overhead = (B - A) / A
Events#    Overhead (%)
1           0.45
2           1.20
4           3.85
8          15.82
16         50.24
perf_iterate_ctx() contributes most of the increased overhead, because
it iterates over the whole RCU list ctx->event_list.
ctx->event_list grows quickly as events are added: for per-process
monitoring, adding a single event ends up inserting one event per CPU
into the event_list. Iterating the whole event list becomes very
expensive on systems with a large number of CPUs.
It is not actually necessary to iterate the whole RCU list every time;
often only the events on the current CPU need to be output.
Introduce a per-CPU event list that splits the single big RCU list into
small per-CPU RCU lists, which significantly reduces the overhead.
Events#    Overhead (%)
1           0.15
2           1.06
4           1.85
8           9.99
16         17.26
Signed-off-by: Kan Liang <[email protected]>
---
include/linux/perf_event.h | 3 +-
kernel/events/core.c | 177 ++++++++++++++++++++++++++++++++++-----------
2 files changed, 138 insertions(+), 42 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..1222a33 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -734,7 +734,8 @@ struct perf_event_context {
struct list_head active_ctx_list;
struct list_head pinned_groups;
struct list_head flexible_groups;
- struct list_head event_list;
+ struct list_head *__percpu event_list;
+
int nr_events;
int nr_active;
int is_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e29213..00c12df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1118,6 +1118,9 @@ static void free_ctx(struct rcu_head *head)
ctx = container_of(head, struct perf_event_context, rcu_head);
kfree(ctx->task_ctx_data);
+
+ free_percpu(ctx->event_list);
+
kfree(ctx);
}
@@ -1461,6 +1464,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct list_head *t_list;
lockdep_assert_held(&ctx->lock);
@@ -1483,7 +1487,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_update_cgroup_event(event, ctx, true);
- list_add_rcu(&event->event_entry, &ctx->event_list);
+ /* If event CPU is not set, add the event to the list of CPU 0 */
+ t_list = per_cpu_ptr(ctx->event_list, event->cpu == -1 ? 0 : event->cpu);
+ list_add_rcu(&event->event_entry, t_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -2749,25 +2755,30 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
struct perf_event *event, *next_event;
+ int cpu;
if (!ctx->nr_stat)
return;
update_context_time(ctx);
- event = list_first_entry(&ctx->event_list,
- struct perf_event, event_entry);
+ for_each_possible_cpu(cpu) {
+ struct list_head *list, *next_list;
- next_event = list_first_entry(&next_ctx->event_list,
- struct perf_event, event_entry);
+ list = per_cpu_ptr(ctx->event_list, cpu);
+ next_list = per_cpu_ptr(next_ctx->event_list, cpu);
- while (&event->event_entry != &ctx->event_list &&
- &next_event->event_entry != &next_ctx->event_list) {
+ event = list_first_entry(list, struct perf_event, event_entry);
+ next_event = list_first_entry(next_list, struct perf_event, event_entry);
- __perf_event_sync_stat(event, next_event);
+ while (&event->event_entry != list &&
+ &next_event->event_entry != next_list) {
- event = list_next_entry(event, event_entry);
- next_event = list_next_entry(next_event, event_entry);
+ __perf_event_sync_stat(event, next_event);
+
+ event = list_next_entry(event, event_entry);
+ next_event = list_next_entry(next_event, event_entry);
+ }
}
}
@@ -3241,7 +3252,9 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
+ struct list_head *t_list;
s64 delta;
+ int cpu;
/*
* only need to iterate over all events iff:
@@ -3254,7 +3267,11 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
raw_spin_lock(&ctx->lock);
perf_pmu_disable(ctx->pmu);
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ cpu = smp_processor_id();
+again:
+ t_list = per_cpu_ptr(ctx->event_list, cpu);
+
+ list_for_each_entry_rcu(event, t_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -3298,6 +3315,15 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
+ /*
+ * the event->cpu may be -1.
+ * If so, the event is stored in CPU0's event_list.
+ */
+ if (cpu != 0) {
+ cpu = 0;
+ goto again;
+ }
+
perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -3385,6 +3411,12 @@ static int event_enable_on_exec(struct perf_event *event,
return 1;
}
+#define for_each_ctx_event_list(__ctx, __cpu, __list) \
+ for (__cpu = cpumask_first(cpu_possible_mask), \
+ __list = per_cpu_ptr(__ctx->event_list, __cpu); \
+ __list; \
+ __cpu = cpumask_next(__cpu, cpu_possible_mask), \
+ __list = (__cpu < nr_cpu_ids) ? per_cpu_ptr(__ctx->event_list, __cpu) : NULL)
/*
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
@@ -3394,8 +3426,10 @@ static void perf_event_enable_on_exec(int ctxn)
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
+ struct list_head *list;
unsigned long flags;
int enabled = 0;
+ int cpu;
local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
@@ -3405,8 +3439,11 @@ static void perf_event_enable_on_exec(int ctxn)
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- list_for_each_entry(event, &ctx->event_list, event_entry)
- enabled |= event_enable_on_exec(event, ctx);
+
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(event, list, event_entry)
+ enabled |= event_enable_on_exec(event, ctx);
+ }
/*
* Unclone and reschedule this context if we enabled any event.
@@ -3623,15 +3660,26 @@ static int perf_event_read(struct perf_event *event, bool group)
/*
* Initialize the perf_event context in a task_struct:
*/
-static void __perf_event_init_context(struct perf_event_context *ctx)
+static int __perf_event_init_context(struct perf_event_context *ctx)
{
+ struct list_head *list;
+ int cpu;
+
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
- INIT_LIST_HEAD(&ctx->event_list);
+
+ ctx->event_list = alloc_percpu(struct list_head);
+ if (!ctx->event_list)
+ return -1;
+ for_each_ctx_event_list(ctx, cpu, list)
+ INIT_LIST_HEAD(list);
+
atomic_set(&ctx->refcount, 1);
+
+ return 0;
}
static struct perf_event_context *
@@ -3643,7 +3691,11 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
if (!ctx)
return NULL;
- __perf_event_init_context(ctx);
+ if (__perf_event_init_context(ctx)) {
+ kfree(ctx);
+ return NULL;
+ }
+
if (task) {
ctx->task = task;
get_task_struct(task);
@@ -3978,13 +4030,17 @@ static bool exclusive_event_installable(struct perf_event *event,
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
+ struct list_head *list;
+ int cpu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
return true;
- list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
- if (exclusive_event_match(iter_event, event))
- return false;
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(iter_event, list, event_entry) {
+ if (exclusive_event_match(iter_event, event))
+ return false;
+ }
}
return true;
@@ -6067,16 +6123,29 @@ perf_iterate_ctx(struct perf_event_context *ctx,
void *data, bool all)
{
struct perf_event *event;
+ struct list_head *t_list;
+ int cpu;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (!all) {
+ if (all) {
+ for_each_ctx_event_list(ctx, cpu, t_list) {
+ list_for_each_entry_rcu(event, t_list, event_entry)
+ output(event, data);
+ }
+ } else {
+ cpu = smp_processor_id();
+again:
+ t_list = this_cpu_ptr(ctx->event_list);
+ list_for_each_entry_rcu(event, t_list, event_entry) {
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
continue;
+ output(event, data);
+ }
+ if (cpu != 0) {
+ cpu = 0;
+ goto again;
}
-
- output(event, data);
}
}
@@ -7605,6 +7674,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
+ struct list_head *list;
+ int cpu;
struct perf_raw_record raw = {
.frag = {
@@ -7636,13 +7707,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry_rcu(event, list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
unlock:
rcu_read_unlock();
@@ -8590,6 +8663,7 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
static void free_pmu_context(struct pmu *pmu)
{
struct pmu *i;
+ int cpu;
mutex_lock(&pmus_lock);
/*
@@ -8601,7 +8675,12 @@ static void free_pmu_context(struct pmu *pmu)
goto out;
}
}
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ free_percpu(cpuctx->ctx.event_list);
+ }
free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
@@ -8801,7 +8880,8 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
+ if (__perf_event_init_context(&cpuctx->ctx))
+ goto free_pmu_cpu_context;
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
@@ -8845,6 +8925,9 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
return ret;
+free_pmu_cpu_context:
+ free_pmu_context(pmu);
+
free_dev:
device_del(pmu->dev);
put_device(pmu->dev);
@@ -9969,6 +10052,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
struct perf_event_context *src_ctx;
struct perf_event_context *dst_ctx;
struct perf_event *event, *tmp;
+ struct list_head *list;
+ int cpu;
LIST_HEAD(events);
src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
@@ -9979,12 +10064,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* of swizzling perf_event::ctx.
*/
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
- perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+
+ for_each_ctx_event_list(src_ctx, cpu, list) {
+ list_for_each_entry_safe(event, tmp, list, event_entry) {
+ perf_remove_from_context(event, 0);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
}
/*
@@ -10111,6 +10198,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
+ struct list_head *list;
+ int cpu;
WARN_ON_ONCE(child != current);
@@ -10160,8 +10249,10 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx, child);
+ for_each_ctx_event_list(child_ctx, cpu, list) {
+ list_for_each_entry_safe(child_event, next, list, event_entry)
+ perf_event_exit_event(child_event, child_ctx, child);
+ }
mutex_unlock(&child_ctx->mutex);
@@ -10611,10 +10702,14 @@ static void __perf_event_exit_context(void *__info)
struct perf_event_context *ctx = __info;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
+ struct list_head *list;
+ int cpu;
raw_spin_lock(&ctx->lock);
- list_for_each_entry(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ for_each_ctx_event_list(ctx, cpu, list) {
+ list_for_each_entry(event, list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ }
raw_spin_unlock(&ctx->lock);
}
--
2.5.5
Yes this is a problem, but no this cannot be done. We can't have per-cpu
storage per task. That rapidly explodes.
Mark is looking at replacing this stuff with an rb-tree for big-little,
that would also allow improving this I think.
Hi,
On Thu, Nov 10, 2016 at 09:33:55AM +0100, Peter Zijlstra wrote:
> Yes this is a problem, but no this cannot be done. We can't have per-cpu
> storage per task. That rapidly explodes.
>
> Mark is looking at replacing this stuff with an rb-tree for big-little,
> that would also allow improving this I think.
Unfortunately I've not had the chance to look at that since returning
from Plumbers. Also, I was leaning towards the alternative approach we
discussed, with a perf_event_task_contexts container, as that also
solved some other issues with the way we used perf_event_context::pmu in
big.LITTLE systems.
Looking at the way perf_iterate_ctx is used, it seems that we're just
trying to iterate over the active events for a context (i.e. those
programmed into the HW at this point in time). Though I'm missing some
subtlety, since we check event->state < PERF_EVENT_STATE_INACTIVE.
We have a similar issue with perf_event_task_tick() needing to know the
relevant contexts, and for that we have the active_ctx_list. Can't we do
something similar and add an active_events_list to perf_event_context?
Thanks,
Mark.
On Thu, Nov 10, 2016 at 11:05:17AM +0000, Mark Rutland wrote:
> Hi,
>
> On Thu, Nov 10, 2016 at 09:33:55AM +0100, Peter Zijlstra wrote:
> > Yes this is a problem, but no this cannot be done. We can't have per-cpu
> > storage per task. That rapidly explodes.
> >
> > Mark is looking at replacing this stuff with an rb-tree for big-little,
> > that would also allow improving this I think.
>
> Unfortunately I've not had the chance to look at that since returning
> from Plumbers. Also, I was leaning towards the alternative approach we
> discussed, with a perf_event_task_contexts container, as that also
> solved some other issues with the way we used perf_event_context::pmu in
> big.LITTLE systems.
>
> Looking at the way perf_iterate_ctx is used, it seems that we're just
> trying to iterate over the active events for a context (i.e. those
> programmed into the HW at this point in time). Though I'm missing some
> subtlety, since we check event->state < PERF_EVENT_STATE_INACTIVE.
>
> We have a similar issue with perf_event_task_tick() needing to know the
> relevant contexts, and for that we have the active_ctx_list. Can't we do
> something similar and add an active_events_list to perf_event_context?
So the problem is finding which events are active when.
If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
can, fairly easily, find the relevant subtree and limit the iteration.
Esp. if we use a threaded tree.
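A minimal sketch of what such a {pmu,cpu,runtime} key might look like,
assuming it lives in kernel/events/core.c, a hypothetical
event->tree_node rb_node, and total_time_running standing in for
"runtime" (none of this exists in the patch above):

#include <linux/rbtree.h>
#include <linux/perf_event.h>

/*
 * Compare two events by {pmu, cpu, runtime}. Keeping the least-run
 * events leftmost within a {pmu, cpu} subtree lets rotation pick
 * them up first.
 */
static int perf_event_key_cmp(struct perf_event *a, struct perf_event *b)
{
	if (a->pmu != b->pmu)
		return (unsigned long)a->pmu < (unsigned long)b->pmu ? -1 : 1;
	if (a->cpu != b->cpu)
		return a->cpu < b->cpu ? -1 : 1;
	if (a->total_time_running != b->total_time_running)
		return a->total_time_running < b->total_time_running ? -1 : 1;
	return 0;
}

/* Insert an event into a per-context tree (event->tree_node is made up). */
static void perf_event_tree_insert(struct rb_root *root,
				   struct perf_event *event)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct perf_event *e;

	while (*p) {
		parent = *p;
		e = rb_entry(parent, struct perf_event, tree_node);
		if (perf_event_key_cmp(event, e) < 0)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&event->tree_node, parent, p);
	rb_insert_color(&event->tree_node, root);
}

With pmu and cpu leading the key, all events relevant to one PMU on one
CPU form a contiguous subtree, which is what limits the iteration.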
On Thu, Nov 10, 2016 at 12:37:05PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 10, 2016 at 11:05:17AM +0000, Mark Rutland wrote:
> > On Thu, Nov 10, 2016 at 09:33:55AM +0100, Peter Zijlstra wrote:
> > > Yes this is a problem, but no this cannot be done. We can't have per-cpu
> > > storage per task. That rapidly explodes.
> > >
> > > Mark is looking at replacing this stuff with an rb-tree for big-little,
> > > that would also allow improving this I think.
> >
> > Unfortunately I've not had the chance to look at that since returning
> > from Plumbers. Also, I was leaning towards the alternative approach we
> > discussed, with a perf_event_task_contexts container, as that also
> > solved some other issues with the way we used perf_event_context::pmu in
> > big.LITTLE systems.
> >
> > Looking at the way perf_iterate_ctx is used, it seems that we're just
> > trying to iterate over the active events for a context (i.e. those
> > programmed into the HW at this point in time). Though I'm missing some
> > subtlety, since we check event->state < PERF_EVENT_STATE_INACTIVE.
> >
> > We have a similar issue with perf_event_task_tick() needing to know the
> > relevant contexts, and for that we have the active_ctx_list. Can't we do
> > something similar and add an active_events_list to perf_event_context?
>
> So the problem is finding which events are active when.
Sure.
If we only care about PERF_EVENT_STATE_ACTIVE, then I think we can
fairly easily maintain a perf_event_context::active_event_list at
event_sched_{in,out}() time (or somewhere close to that).
If we need PERF_EVENT_STATE_INACTIVE events, then that doesn't work,
since we can give up early and not schedule some eligible events.
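As a rough sketch, assuming hypothetical ctx->active_event_list and
event->active_list_entry fields maintained next to event_sched_{in,out}()
in kernel/events/core.c, the bookkeeping and the resulting iteration
could look like:

static void event_mark_active(struct perf_event_context *ctx,
			      struct perf_event *event)
{
	/* called once event_sched_in() has put the event on the PMU */
	list_add(&event->active_list_entry, &ctx->active_event_list);
}

static void event_mark_inactive(struct perf_event *event)
{
	/* called from event_sched_out() when the event leaves the PMU */
	list_del_init(&event->active_list_entry);
}

static void perf_iterate_active(struct perf_event_context *ctx,
				perf_iterate_f output, void *data)
{
	struct perf_event *event;

	/* walk only the events currently programmed, not the whole list */
	list_for_each_entry(event, &ctx->active_event_list,
			    active_list_entry)
		output(event, data);
}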
> If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
> can, fairly easily, find the relevant subtree and limit the iteration.
> Esp. if we use a threaded tree.
That would cater for big.LITTLE, certainly, but I'm not sure I follow
how that helps to find active events -- you'll still have to iterate
through the whole PMU subtree to find which are active, no?
Thanks,
Mark.
On Thu, Nov 10, 2016 at 12:04:23PM +0000, Mark Rutland wrote:
> On Thu, Nov 10, 2016 at 12:37:05PM +0100, Peter Zijlstra wrote:
> > So the problem is finding which events are active when.
>
> Sure.
>
> If we only care about PERF_EVENT_STATE_ACTIVE, then I think we can
> fairly easily maintain a perf_event_context::active_event_list at
> event_sched_{in,out}() time (or somewhere close to that).
>
> If we need PERF_EVENT_STATE_INACTIVE events, then that doesn't work,
> since we can give up early and not schedule some eligible events.
>
> > If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
> > can, fairly easily, find the relevant subtree and limit the iteration.
> > Esp. if we use a threaded tree.
>
> That would cater for big.LITTLE, certainly, but I'm not sure I follow
> how that helps to find active events -- you'll still have to iterate
> through the whole PMU subtree to find which are active, no?
Ah, so the tree would in fact only contain 'INACTIVE' events :-)
That is, when no events are on the hardware, all events (if there are
any) are INACTIVE.
Then on sched-in, we find the relevant subtree, and linearly try and
program all events from that subtree onto the PMU. Once adding an event
fails programming, we stop (like we do now).
These programmed events transition from INACTIVE to ACTIVE, and we take
them out of the tree.
Then on sched-out, we remove all events from the hardware, increase the
events' runtime values by however long they were ACTIVE, flip them
to INACTIVE and stuff them back in the tree.
(I can't quite recall if we can easily find ACTIVE events from a PMU,
but if not, we can easily track those on a separate list).
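Very roughly, the sched-in half of that could look like the sketch
below; perf_event_tree_first()/perf_event_tree_next() and the
inactive_tree/active_event_list fields are invented names, and only
group_sched_in() is an existing helper:

static void ctx_sched_in_subtree(struct perf_event_context *ctx,
				 struct perf_cpu_context *cpuctx,
				 struct pmu *pmu, int cpu)
{
	struct perf_event *event, *next;

	/* locate the leftmost INACTIVE event of the {pmu, cpu} subtree */
	event = perf_event_tree_first(&ctx->inactive_tree, pmu, cpu);

	while (event) {
		next = perf_event_tree_next(event, pmu, cpu);

		/* stop on the first failure, as the current code does */
		if (group_sched_in(event, cpuctx, ctx))
			break;

		/* ACTIVE events leave the tree and go on a separate list */
		rb_erase(&event->tree_node, &ctx->inactive_tree);
		list_add(&event->active_list_entry, &ctx->active_event_list);

		event = next;
	}
}

Sched-out would be the mirror image: add the time each event spent
ACTIVE to its runtime key, take it off the active list and re-insert it
into the tree.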
On Thu, Nov 10, 2016 at 01:12:53PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 10, 2016 at 12:04:23PM +0000, Mark Rutland wrote:
> > On Thu, Nov 10, 2016 at 12:37:05PM +0100, Peter Zijlstra wrote:
>
> > > So the problem is finding which events are active when.
> > > If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
> > > can, fairly easily, find the relevant subtree and limit the iteration.
> > > Esp. if we use a threaded tree.
> >
> > That would cater for big.LITTLE, certainly, but I'm not sure I follow
> > how that helps to find active events -- you'll still have to iterate
> > through the whole PMU subtree to find which are active, no?
>
> Ah, so the tree would in fact only contain 'INACTIVE' events :-)
Ah. :)
That explains some of the magic, but...
> That is, when no events are on the hardware, all events (if there are
> any) are INACTIVE.
>
> Then on sched-in, we find the relevant subtree, and linearly try and
> program all events from that subtree onto the PMU. Once adding an event
> fails programming, we stop (like we do now).
>
> These programmed events transition from INACTIVE to ACTIVE, and we take
> them out of the tree.
>
> Then on sched-out, we remove all events from the hardware, increase the
> events' runtime values by however long they were ACTIVE, flip them
> to INACTIVE and stuff them back in the tree.
... per the above, won't the tree also contain 'OFF' events (and
'ERROR', etc)?
... or do we keep them somewhere else (another list or sub-tree)?
If not, we still have to walk all of those in perf_iterate_ctx().
> (I can't quite recall if we can easily find ACTIVE events from a PMU,
> but if not, we can easily track those on a separate list).
I think we just iterate the perf_event_context::event list and look at
the state. Regardless, adding lists is fairly simple.
Thanks,
Mark.
On Thu, Nov 10, 2016 at 12:26:18PM +0000, Mark Rutland wrote:
> On Thu, Nov 10, 2016 at 01:12:53PM +0100, Peter Zijlstra wrote:
> > Ah, so the tree would in fact only contain 'INACTIVE' events :-)
>
> Ah. :)
>
> That explains some of the magic, but...
>
> > That is, when no events are on the hardware, all events (if there are
> > any) are INACTIVE.
> >
> > Then on sched-in, we find the relevant subtree, and linearly try and
> > program all events from that subtree onto the PMU. Once adding an event
> > fails programming, we stop (like we do now).
> >
> > These programmed events transition from INACTIVE to ACTIVE, and we take
> > them out of the tree.
> >
> > Then on sched-out, we remove all events from the hardware, increase the
> > events' runtime values by however long they were ACTIVE, flip them
> > to INACTIVE and stuff them back in the tree.
>
> ... per the above, won't the tree also contain 'OFF' events (and
> 'ERROR', etc)?
>
> ... or do we keep them somewhere else (another list or sub-tree)?
I don't think those need be tracked at all, they're immaterial for
actual scheduling. Once we ioctl() them back to life we can insert them
into the tree.
> If not, we still have to walk all of those in perf_iterate_ctx().
>
> > (I can't quite recall if we can easily find ACTIVE events from a PMU,
> > but if not, we can easily track those on a separate list).
>
> I think we just iterate the perf_event_context::event list and look at
> the state. Regardless, adding lists is fairly simple.
>
> Thanks,
> Mark.
On Thu, Nov 10, 2016 at 01:58:04PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 10, 2016 at 12:26:18PM +0000, Mark Rutland wrote:
> > On Thu, Nov 10, 2016 at 01:12:53PM +0100, Peter Zijlstra wrote:
>
> > > Ah, so the tree would in fact only contain 'INACTIVE' events :-)
> >
> > Ah. :)
> >
> > That explains some of the magic, but...
> >
> > > That is, when no events are on the hardware, all events (if there are
> > > any) are INACTIVE.
> > >
> > > Then on sched-in, we find the relevant subtree, and linearly try and
> > > program all events from that subtree onto the PMU. Once adding an event
> > > fails programming, we stop (like we do now).
> > >
> > > These programmed events transition from INACTIVE to ACTIVE, and we take
> > > them out of the tree.
> > >
> > > Then on sched-out, we remove all events from the hardware, increase the
> > > events' runtime values by however long they were ACTIVE, flip them
> > > to INACTIVE and stuff them back in the tree.
> >
> > ... per the above, won't the tree also contain 'OFF' events (and
> > 'ERROR', etc)?
> >
> > ... or do we keep them somewhere else (another list or sub-tree)?
>
> I don't think those need be tracked at all, they're immaterial for
> actual scheduling. Once we ioctl() them back to life we can insert them
> into the tree.
Sure, that sounds fine for scheduling (including big.LITTLE).
I might still be misunderstanding something, but I don't think that
helps Kan's case: since INACTIVE events which will fail their filters
(including the CPU check) will still be in the tree, they will still
have to be iterated over.
That is, unless we also sort the tree by event->cpu, or if in those
cases we only care about ACTIVE events and can use an active list.
Thanks,
Mark.
> > I don't think those need be tracked at all, they're immaterial for
> > actual scheduling. Once we ioctl() them back to life we can insert
> > them into the tree.
>
> Sure, that sounds fine for scheduling (including big.LITTLE).
>
> I might still be misunderstanding something, but I don't think that helps
> Kan's case: since INACTIVE events which will fail their filters (including the
> CPU check) will still be in the tree, they will still have to be iterated over.
>
> That is, unless we also sort the tree by event->cpu, or if in those cases we
> only care about ACTIVE events and can use an active list.
Yes, if we can sort the tree by event->cpu, we can minimize the iteration in
perf_iterate_ctx. That should help reduce the overhead.
Thanks,
Kan
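Assuming a per-context tree sorted primarily by event->cpu, with
hypothetical perf_event_tree_first_cpu()/perf_event_tree_next_cpu()
helpers and an event_tree field, the fast path of perf_iterate_ctx()
could then be limited to two small subtrees:

static void perf_iterate_ctx_cpu(struct perf_event_context *ctx,
				 perf_iterate_f output, void *data, int cpu)
{
	struct perf_event *event;

	/* walk only the subtree whose events have event->cpu == cpu */
	for (event = perf_event_tree_first_cpu(&ctx->event_tree, cpu);
	     event;
	     event = perf_event_tree_next_cpu(event, cpu)) {
		if (event->state < PERF_EVENT_STATE_INACTIVE)
			continue;
		if (!event_filter_match(event))
			continue;
		output(event, data);
	}
}

static void perf_iterate_ctx_local(struct perf_event_context *ctx,
				   perf_iterate_f output, void *data)
{
	/* events bound to this CPU plus the cpu == -1 (any-CPU) events */
	perf_iterate_ctx_cpu(ctx, output, data, smp_processor_id());
	perf_iterate_ctx_cpu(ctx, output, data, -1);
}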
On Thu, Nov 10, 2016 at 02:10:37PM +0000, Mark Rutland wrote:
> Sure, that sounds fine for scheduling (including big.LITTLE).
>
> I might still be misunderstanding something, but I don't think that
> helps Kan's case: since INACTIVE events which will fail their filters
> (including the CPU check) will still be in the tree, they will still
> have to be iterated over.
>
> That is, unless we also sort the tree by event->cpu, or if in those
> cases we only care about ACTIVE events and can use an active list.
A few emails back up I wrote:
>> If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
Looking at the code there's also cgroup muck, not entirely sure where in
the sort order that should go if at all.
But having pmu and cpu in there would cure the big-little and
per-task-per-cpu event issues.
On Thu, Nov 10, 2016 at 05:26:32PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 10, 2016 at 02:10:37PM +0000, Mark Rutland wrote:
>
> > Sure, that sounds fine for scheduling (including big.LITTLE).
> >
> > I might still be misunderstanding something, but I don't think that
> > helps Kan's case: since INACTIVE events which will fail their filters
> > (including the CPU check) will still be in the tree, they will still
> > have to be iterated over.
> >
> > That is, unless we also sort the tree by event->cpu, or if in those
> > cases we only care about ACTIVE events and can use an active list.
>
> A few emails back up I wrote:
>
> >> If we stick all events in an RB-tree sorted on: {pmu,cpu,runtime} we
Ah, sorry. Clearly I wouldn't pass a reading comprehension test today.
> Looking at the code there's also cgroup muck, not entirely sure where in
> the sort order that should go if at all.
>
> But having pmu and cpu in there would cure the big-little and
> per-task-per-cpu event issues.
Yup, that all makes sense to me now (modulo the cgroup stuff I also
haven't considered yet).
Thanks,
Mark.