Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756335AbdIHIrO (ORCPT ); Fri, 8 Sep 2017 04:47:14 -0400 Received: from mga03.intel.com ([134.134.136.65]:59857 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756316AbdIHIrJ (ORCPT ); Fri, 8 Sep 2017 04:47:09 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.42,360,1500966000"; d="scan'208";a="1170295355" Subject: Re: [RFC][PATCH] perf: Rewrite enabled/running timekeeping To: Peter Zijlstra Cc: Ingo Molnar , Arnaldo Carvalho de Melo , Alexander Shishkin , Andi Kleen , Kan Liang , Dmitri Prokhorov , Valery Cherepennikov , Mark Rutland , Stephane Eranian , David Carrillo-Cisneros , linux-kernel , Vince Weaver , Thomas Gleixner References: <20170822204743.GR32112@worktop.programming.kicks-ass.net> <2a426aa2-42c8-e839-1cec-aa3971651f3e@linux.intel.com> <20170831171837.njnc6r6elsvkl7lt@hirez.programming.kicks-ass.net> <46f72a3f-f18b-0227-3d78-fb23c8a6e18e@linux.intel.com> <20170904120843.oazlv73phoxoinlj@hirez.programming.kicks-ass.net> <385005b6-51ea-383e-df81-43365f3f5152@linux.intel.com> <20170904154145.xl4fyg7vhgbnmhwi@hirez.programming.kicks-ass.net> <9c3c1fc7-5a2e-2745-dab1-390dc7d58d50@linux.intel.com> <20170905160341.jpq6il7gux7bmd3e@hirez.programming.kicks-ass.net> From: Alexey Budankov Organization: Intel Corp. Message-ID: <372f9c8b-0cfe-4240-e44d-83d863d40813@linux.intel.com> Date: Fri, 8 Sep 2017 11:47:03 +0300 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Thunderbird/52.3.0 MIME-Version: 1.0 In-Reply-To: <20170905160341.jpq6il7gux7bmd3e@hirez.programming.kicks-ass.net> Content-Type: text/plain; charset=utf-8 Content-Language: en-US Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 15566 Lines: 517 On 05.09.2017 19:03, Peter Zijlstra wrote: > On Tue, Sep 05, 2017 at 03:06:26PM +0300, Alexey Budankov wrote: >> [ 6614.226305] WARNING: CPU: 45 PID: 43385 at kernel/events/core.c:239 event_function+0xb3/0xe0 > > I think I avoided that problem by not radically rewriting > perf_event_read() but fixing it instead: > > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=perf/core&id=8ad650955ede95e4a6fd6afbda2a0b37d4af9c29 > > Full tree at: > > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/core > > > Very minimally tested so far, I'll continue tomorrow. > The patch set v9 on top of peterz/queue perf/core repository above: --- include/linux/perf_event.h | 16 ++- kernel/events/core.c | 307 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 267 insertions(+), 56 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2a6ae48..92cda40 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -557,7 +557,11 @@ struct perf_event { */ struct list_head group_entry; struct list_head sibling_list; - + /* + * Node on the pinned or flexible tree located at the event context; + */ + struct rb_node group_node; + u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the @@ -689,6 +693,12 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; + +struct perf_event_groups { + struct rb_root tree; + u64 index; +}; + /** * struct perf_event_context - event context structure * @@ -709,8 +719,8 @@ struct perf_event_context { struct mutex mutex; struct list_head active_ctx_list; - struct list_head pinned_groups; - struct list_head flexible_groups; + struct perf_event_groups pinned_groups; + struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_active; diff --git a/kernel/events/core.c b/kernel/events/core.c index 56e9214..8158f1d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1454,8 +1454,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize group leader event; + */ +void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits; + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1464,6 +1477,169 @@ static enum event_type_t get_event_type(struct perf_event *event) } /* + * Helper function to initializes perf event groups object; + */ +void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +/* + * Compare function for event groups; + * Implements complex key that first sorts by CPU and then by + * virtual index which provides ordering when rotating + * groups for the same CPU; + */ +int perf_event_groups_less(struct perf_event *left, struct perf_event *right) +{ + if (left->cpu < right->cpu) { + return 1; + } else if (left->cpu > right->cpu) { + return 0; + } else { + if (left->group_index < right->group_index) { + return 1; + } else if(left->group_index > right->group_index) { + return 0; + } else { + return 0; + } + } +} + +/* + * Insert a group into a tree using event->cpu as a key. If event->cpu node + * is already attached to the tree then the event is added to the attached + * group's group_list list. + */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + struct perf_event *node_event; + struct rb_node *parent; + struct rb_node **node; + + event->group_index = ++groups->index; + + node = &groups->tree.rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, + struct perf_event, group_node); + + if (perf_event_groups_less(event, node_event)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, &groups->tree); +} + +/* + * Helper function to insert event into the pinned or + * flexible groups; + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. If the group is directly attached to the tree + * it also detaches all groups on the group's group_list list. + */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + if (!RB_EMPTY_NODE(&event->group_node) && + !RB_EMPTY_ROOT(&groups->tree)) + rb_erase(&event->group_node, &groups->tree); + + init_event_group(event); +} + +/* + * Helper function to delete event from its groups; + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get a group by a cpu key from groups tree with the least group_index; + */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *node_event = NULL, *match = NULL; + struct rb_node *node = groups->tree.rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + match = node_event; + node = node->rb_left; + } + } + + return match; +} + +/* + * Find group list by a cpu key and rotate it. + */ +static void +perf_event_groups_rotate(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *event = + perf_event_groups_first(groups, cpu); + + if (event) { + perf_event_groups_delete(groups, event); + perf_event_groups_insert(groups, event); + } +} + +/* + * Iterate event groups thru the whole tree. + */ +#define perf_event_groups_for_each(event, groups, node) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), node); event; \ + event = rb_entry_safe(rb_next(&event->node), \ + typeof(*event), node)) +/* + * Iterate event groups with cpu == key. + */ +#define perf_event_groups_for_each_cpu(event, key, groups, node) \ + for (event = perf_event_groups_first(groups, key); \ + event && event->cpu == key; \ + event = rb_entry_safe(rb_next(&event->node), \ + typeof(*event), node)) + +/* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1483,12 +1659,8 @@ static enum event_type_t get_event_type(struct perf_event *event) * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1682,7 +1854,7 @@ static void perf_group_attach(struct perf_event *event) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); /* * If event was in error state, then keep it @@ -1700,7 +1872,6 @@ static void perf_group_attach(struct perf_event *event) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; lockdep_assert_held(&event->ctx->lock); @@ -1721,22 +1892,23 @@ static void perf_group_detach(struct perf_event *event) goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!RB_EMPTY_NODE(&event->group_node)) { + list_del_init(&sibling->group_entry); + add_event_to_groups(sibling, event->ctx); + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -2180,6 +2352,22 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +static int +flexible_group_sched_in(struct perf_event *event, + struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int *can_add_hw) +{ + if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event)) + return 0; + + if (group_can_go_on(event, cpuctx, *can_add_hw)) + if (group_sched_in(event, cpuctx, ctx)) + *can_add_hw = 0; + + return 1; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2646,6 +2834,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { + int sw = -1, cpu = smp_processor_id(); int is_active = ctx->is_active; struct perf_event *event; @@ -2694,12 +2883,20 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) + perf_event_groups_for_each_cpu(event, cpu, + &ctx->pinned_groups, group_node) + group_sched_out(event, cpuctx, ctx); + perf_event_groups_for_each_cpu(event, sw, + &ctx->pinned_groups, group_node) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) + perf_event_groups_for_each_cpu(event, cpu, + &ctx->flexible_groups, group_node) + group_sched_out(event, cpuctx, ctx); + perf_event_groups_for_each_cpu(event, sw, + &ctx->flexible_groups, group_node) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -2990,23 +3187,28 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_pinned_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { + int sw = -1, cpu = smp_processor_id(); struct perf_event *event; + int can_add_hw; + + perf_event_groups_for_each_cpu(event, sw, + &ctx->pinned_groups, group_node) { + can_add_hw = 1; + if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, + PERF_EVENT_STATE_ERROR); + } + } - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; - - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + perf_event_groups_for_each_cpu(event, cpu, + &ctx->pinned_groups, group_node) { + can_add_hw = 1; + if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) { + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, + PERF_EVENT_STATE_ERROR); + } } } @@ -3014,25 +3216,19 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { + int sw = -1, cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + perf_event_groups_for_each_cpu(event, sw, + &ctx->flexible_groups, group_node) + flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); + + can_add_hw = 1; + perf_event_groups_for_each_cpu(event, cpu, + &ctx->flexible_groups, group_node) + flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw); - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } } static void @@ -3113,7 +3309,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); @@ -3350,8 +3546,12 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (!ctx->rotate_disable) { + int sw = -1, cpu = smp_processor_id(); + + perf_event_groups_rotate(&ctx->flexible_groups, sw); + perf_event_groups_rotate(&ctx->flexible_groups, cpu); + } } static int perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3698,8 +3898,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); } @@ -9370,6 +9570,7 @@ static void account_event(struct perf_event *event) INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -10880,7 +11081,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -10896,7 +11097,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret)