Subject: Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current CPU's events list on mux interrupt
To: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo
Cc: Andi Kleen, Kan Liang, Dmitri Prokhorov, Valery Cherepennikov, Mark Rutland, Stephane Eranian, David Carrillo-Cisneros, linux-kernel
From: Alexey Budankov
Organization: Intel Corp.
Date: Wed, 23 Aug 2017 21:12:34 +0300

On 23.08.2017 14:54, Alexander Shishkin wrote:
> Alexey Budankov writes:
>
>> This patch implements mux switch that triggers skipping to the
>> current CPU's events list at mulitplexing hrtimer interrupt
>> handler as well as adoption of the switch in the existing
>> implementation.
>>
>> perf_event_groups_iterate_cpu() API is introduced to implement
>> iteration thru the certain CPU groups list skipping groups
>
> "through"
>
>> allocated for the other CPUs.
>>
>> Signed-off-by: Alexey Budankov
>> ---
>>  kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
>>  1 file changed, 137 insertions(+), 56 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 08ccfb2..aeb0f81 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
>>  static atomic64_t perf_event_id;
>>
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -                              enum event_type_t event_type);
>> +                              enum event_type_t event_type, int mux);
>>
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>                               enum event_type_t event_type,
>> -                             struct task_struct *task);
>> +                             struct task_struct *task, int mux);
>>
>>  static void update_context_time(struct perf_event_context *ctx);
>>  static u64 perf_event_time(struct perf_event *event);
>> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>          struct perf_cpu_context *cpuctx;
>>          struct list_head *list;
>>          unsigned long flags;
>> +        int mux = 0;
>>
>>          /*
>>           * Disable interrupts and preemption to avoid this CPU's
>> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>                  perf_pmu_disable(cpuctx->ctx.pmu);
>>
>>                  if (mode & PERF_CGROUP_SWOUT) {
>> -                        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> +                        cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
>>                          /*
>>                           * must not be done before ctxswout due
>>                           * to event_filter_match() in event_sched_out()
>> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>                           */
>>                          cpuctx->cgrp = perf_cgroup_from_task(task,
>>                                                               &cpuctx->ctx);
>> -                        cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> +                        cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
>
> 'mux' is always zero in this function, isn't it?
>
>>                  }
>>                  perf_pmu_enable(cpuctx->ctx.pmu);
>>                  perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
>>   */
>>  #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
>>          for (iter = rb_first(tree); iter; iter = rb_next(iter))         \
>> -                list_for_each_entry(event, &(rb_entry(iter,             \
>> -                                typeof(*event), node)->list), link)
>> +                list_for_each_entry(event, &(rb_entry(iter,             \
>> +                                typeof(*event), node)->list), link)
>
> Is this an indentation change? What is it doing here?
>
>> +
>> +/*
>> + * Iterate event groups related to specific cpu.
>> + */
>> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link) \
>> +        list = perf_event_groups_get_list(tree, cpu);                 \
>> +        if (list)                                                     \
>> +                list_for_each_entry(event, list, link)
>
> ..or not, if there's no list.
>
>>
>>  /*
>>   * Add a event from the lists for its context.
>> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>>
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>                            struct perf_cpu_context *cpuctx,
>> -                          enum event_type_t event_type);
>> +                          enum event_type_t event_type, int mux);
>>  static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>               struct perf_cpu_context *cpuctx,
>>               enum event_type_t event_type,
>> -             struct task_struct *task);
>> +             struct task_struct *task, int mux);
>>
>>  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
>>                                 struct perf_event_context *ctx,
>>                                 enum event_type_t event_type)
>>  {
>> +        int mux = 0;
>> +
>>          if (!cpuctx->task_ctx)
>>                  return;
>>
>>          if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>>                  return;
>>
>> -        ctx_sched_out(ctx, cpuctx, event_type);
>> +        ctx_sched_out(ctx, cpuctx, event_type, mux);
>
> Just use 0.

Well, I intentionally gave this ugly switch a name via a local variable to add
clarity to the code, so I simply wonder: why do you prefer the unnamed variant?

>
>>  }
>>
>>  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>>                                  struct perf_event_context *ctx,
>> -                                struct task_struct *task)
>> +                                struct task_struct *task, int mux)
>>  {
>> -        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
>> +        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
>>          if (ctx)
>> -                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
>> -        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
>> +                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
>> +        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
>>          if (ctx)
>> -                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
>> +                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
>>  }
>>
>>  /*
>> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>  {
>>          enum event_type_t ctx_event_type = event_type & EVENT_ALL;
>>          bool cpu_event = !!(event_type & EVENT_CPU);
>> +        int mux = 0;
>>
>>          /*
>>           * If pinned groups are involved, flexible groups also need to be
>> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>           * - otherwise, do nothing more.
>>           */
>>          if (cpu_event)
>> -                cpu_ctx_sched_out(cpuctx, ctx_event_type);
>> +                cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
>>          else if (ctx_event_type & EVENT_PINNED)
>> -                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>>
>> -        perf_event_sched_in(cpuctx, task_ctx, current);
>> +        perf_event_sched_in(cpuctx, task_ctx, current, mux);
>
> Also mux==0 in all cases in this function.
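For comparison, the two spellings in question at such a call site, both taken
from this thread (a side-by-side illustration only, not an additional hunk of
the patch):

        int mux = 0;
        ...
        ctx_sched_out(ctx, cpuctx, event_type, mux); /* as posted: the flag is named via a local */

        ctx_sched_out(ctx, cpuctx, event_type, 0);   /* as suggested in review: a bare literal, since the value is constant on this path */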
>
>>          perf_pmu_enable(cpuctx->ctx.pmu);
>>  }
>>
>> @@ -2491,7 +2503,7 @@ static int __perf_install_in_context(void *info)
>>          struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>>          struct perf_event_context *task_ctx = cpuctx->task_ctx;
>>          bool reprogram = true;
>> -        int ret = 0;
>> +        int ret = 0, mux =0;
>>
>>          raw_spin_lock(&cpuctx->ctx.lock);
>>          if (ctx->task) {
>> @@ -2518,7 +2530,7 @@ static int __perf_install_in_context(void *info)
>>          }
>>
>>          if (reprogram) {
>> -                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +                ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>                  add_event_to_ctx(event, ctx);
>>                  ctx_resched(cpuctx, task_ctx, get_event_type(event));
>>          } else {
>> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
>>  {
>>          struct perf_event *leader = event->group_leader;
>>          struct perf_event_context *task_ctx;
>> +        int mux = 0;
>>
>>          if (event->state >= PERF_EVENT_STATE_INACTIVE ||
>>              event->state <= PERF_EVENT_STATE_ERROR)
>>                  return;
>>
>>          if (ctx->is_active)
>> -                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +                ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>
>>          __perf_event_mark_enabled(event);
>>
>> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
>>          if (!event_filter_match(event)) {
>>                  if (is_cgroup_event(event))
>>                          perf_cgroup_defer_enabled(event);
>> -                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>>                  return;
>>          }
>>
>> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
>>           * then don't put it on unless the group is on.
>>           */
>>          if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
>> -                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>
> And here.
>
>>                  return;
>>          }
>>
>> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>>
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>                            struct perf_cpu_context *cpuctx,
>> -                          enum event_type_t event_type)
>> +                          enum event_type_t event_type, int mux)
>>  {
>>          int is_active = ctx->is_active;
>> +        struct list_head *group_list;
>>          struct perf_event *event;
>>          struct rb_node *node;
>> +        int sw = -1, cpu = smp_processor_id();
>
> Same thing seems to be happening with 'sw'.
>
>>          lockdep_assert_held(&ctx->lock);
>>
>>          if (likely(!ctx->nr_events)) {
>> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>
>>          perf_pmu_disable(ctx->pmu);
>>
>> -        if (is_active & EVENT_PINNED)
>> -                perf_event_groups_for_each(event, node,
>> -                                &ctx->pinned_groups, group_node,
>> -                                group_list, group_entry)
>> -                        group_sched_out(event, cpuctx, ctx);
>> +        if (is_active & EVENT_PINNED) {
>> +                if (mux) {
>
> So it's 'rotate', really.

Which 'rotate' do you mean? If you mean the local variable in
perf_rotate_context(), then yes - it may be passed into this call from there
as the mux value, but logically they are still different.
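To make the distinction concrete, here is a minimal standalone sketch of the
convention this series ends up with: the scheduling paths pass mux == 0 and
walk every group list, while only the multiplexing hrtimer path,
perf_rotate_context(), passes mux == 1 so that just the current CPU's list and
the software (cpu == -1) list are visited. This is a userspace mock with
made-up names (mock_ctx_sched_out(), group_lists[]), not kernel code:

#include <stdio.h>

#define NR_CPUS 4

/* one mock "group list" per CPU; slot 0 stands for the software (cpu == -1) list */
static const char *group_lists[NR_CPUS + 1] = {
        "software groups (cpu == -1)",
        "cpu 0 groups", "cpu 1 groups", "cpu 2 groups", "cpu 3 groups",
};

static void sched_out_list(const char *list)
{
        printf("  sched out: %s\n", list);
}

/* mock of the dispatch on the mux flag inside ctx_sched_out()/ctx_sched_in() */
static void mock_ctx_sched_out(int this_cpu, int mux)
{
        int i;

        if (mux) {
                /* mux hrtimer path: only this CPU's groups plus the sw groups */
                sched_out_list(group_lists[this_cpu + 1]);
                sched_out_list(group_lists[0]);
        } else {
                /* context-switch / resched paths: walk every list */
                for (i = 0; i <= NR_CPUS; i++)
                        sched_out_list(group_lists[i]);
        }
}

int main(void)
{
        printf("scheduling path (mux == 0) on cpu 1:\n");
        mock_ctx_sched_out(1, 0);

        printf("rotation path (mux == 1) on cpu 1:\n");
        mock_ctx_sched_out(1, 1);

        return 0;
}

Built with any C compiler, the mux == 0 call prints the full walk and the
mux == 1 call prints only the two lists that the rotation path touches.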
>
>> +                        perf_event_groups_for_each_cpu(event, cpu,
>> +                                        &ctx->pinned_groups,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                        perf_event_groups_for_each_cpu(event, sw,
>> +                                        &ctx->pinned_groups,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                } else {
>> +                        perf_event_groups_for_each(event, node,
>> +                                        &ctx->pinned_groups, group_node,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                }
>> +        }
>>
>> -        if (is_active & EVENT_FLEXIBLE)
>> -                perf_event_groups_for_each(event, node,
>> -                                &ctx->flexible_groups, group_node,
>> -                                group_list, group_entry)
>> -                        group_sched_out(event, cpuctx, ctx);
>> +        if (is_active & EVENT_FLEXIBLE) {
>> +                if (mux) {
>> +                        perf_event_groups_for_each_cpu(event, cpu,
>> +                                        &ctx->flexible_groups,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                        perf_event_groups_for_each_cpu(event, sw,
>> +                                        &ctx->flexible_groups,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                } else {
>> +                        perf_event_groups_for_each(event, node,
>> +                                        &ctx->flexible_groups, group_node,
>> +                                        group_list, group_entry) {
>> +                                group_sched_out(event, cpuctx, ctx);
>> +                        }
>> +                }
>> +        }
>>
>>          perf_pmu_enable(ctx->pmu);
>>  }
>> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>>   * Called with IRQs disabled
>>   */
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -                              enum event_type_t event_type)
>> +                              enum event_type_t event_type, int mux)
>>  {
>> -        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
>> +        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
>>  }
>>
>>  static void
>> @@ -3287,11 +3332,13 @@ static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>               struct perf_cpu_context *cpuctx,
>>               enum event_type_t event_type,
>> -             struct task_struct *task)
>> +             struct task_struct *task, int mux)
>>  {
>>          int is_active = ctx->is_active;
>> +        struct list_head *group_list;
>>          struct perf_event *event;
>>          struct rb_node *node;
>> +        int sw = -1, cpu = smp_processor_id();
>>
>>          lockdep_assert_held(&ctx->lock);
>>
>> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
>>           * First go through the list and put on any pinned groups
>>           * in order to give them the best chance of going on.
>>           */
>> -        if (is_active & EVENT_PINNED)
>> -                perf_event_groups_for_each(event, node,
>> -                                &ctx->pinned_groups, group_node,
>> -                                group_list, group_entry)
>> -                        ctx_pinned_sched_in(event, cpuctx, ctx);
>> +        if (is_active & EVENT_PINNED) {
>> +                if (mux) {
>> +                        perf_event_groups_for_each_cpu(event, sw,
>> +                                        &ctx->pinned_groups,
>> +                                        group_list, group_entry) {
>> +                                ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                        }
>> +                        perf_event_groups_for_each_cpu(event, cpu,
>> +                                        &ctx->pinned_groups,
>> +                                        group_list, group_entry) {
>> +                                ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                        }
>> +                } else {
>> +                        perf_event_groups_for_each(event, node,
>> +                                        &ctx->pinned_groups, group_node,
>> +                                        group_list, group_entry) {
>> +                                ctx_pinned_sched_in(event, cpuctx, ctx);
>> +                        }
>> +                }
>> +        }
>>
>>          /* Then walk through the lower prio flexible groups */
>>          if (is_active & EVENT_FLEXIBLE) {
>>                  int can_add_hw = 1;
>> -                perf_event_groups_for_each(event, node,
>> -                                &ctx->flexible_groups, group_node,
>> -                                group_list, group_entry)
>> -                        ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
>> +                if (mux) {
>> +                        perf_event_groups_for_each_cpu(event, sw,
>> +                                        &ctx->flexible_groups,
>> +                                        group_list, group_entry) {
>> +                                ctx_flexible_sched_in(event, cpuctx,
>> +                                                ctx, &can_add_hw);
>> +                        }
>> +                        can_add_hw = 1;
>> +                        perf_event_groups_for_each_cpu(event, cpu,
>> +                                        &ctx->flexible_groups,
>> +                                        group_list, group_entry) {
>> +                                ctx_flexible_sched_in(event, cpuctx,
>> +                                                ctx, &can_add_hw);
>> +                        }
>> +                } else {
>> +                        perf_event_groups_for_each(event, node,
>> +                                        &ctx->flexible_groups, group_node,
>> +                                        group_list, group_entry) {
>> +                                ctx_flexible_sched_in(event, cpuctx,
>> +                                                ctx, &can_add_hw);
>> +                        }
>> +                }
>>          }
>>  }
>>
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>                               enum event_type_t event_type,
>> -                             struct task_struct *task)
>> +                             struct task_struct *task, int mux)
>>  {
>>          struct perf_event_context *ctx = &cpuctx->ctx;
>>
>> -        ctx_sched_in(ctx, cpuctx, event_type, task);
>> +        ctx_sched_in(ctx, cpuctx, event_type, task, mux);
>>  }
>>
>>  static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>                                          struct task_struct *task)
>>  {
>>          struct perf_cpu_context *cpuctx;
>> +        int mux = 0;
>>
>>          cpuctx = __get_cpu_context(ctx);
>>          if (cpuctx->task_ctx == ctx)
>> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>           * events, no need to flip the cpuctx's events around.
>>           */
>>          if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>> -                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> -        perf_event_sched_in(cpuctx, ctx, task);
>> +                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>> +        perf_event_sched_in(cpuctx, ctx, task, mux);
>>          perf_pmu_enable(ctx->pmu);
>>
>>  unlock:
>> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
>>  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>  {
>>          struct perf_event_context *ctx = NULL;
>> -        int rotate = 0;
>> +        int rotate = 0, mux = 1;
>>
>>          if (cpuctx->ctx.nr_events) {
>>                  if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
>> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>          perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>          perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> -        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>
> It's '1'.
>
>
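Returning to the "..or not, if there's no list" remark above: the shape of
perf_event_groups_for_each_cpu() can be illustrated with a small standalone
mock (made-up names, not the kernel macro or list_for_each_entry() itself).
The helper expands to an assignment plus an if, so the loop body is silently
skipped when no list exists for the requested cpu, and it cannot be used as a
single statement under an unbraced if/else:

#include <stdio.h>
#include <stddef.h>

struct item { int val; struct item *next; };

static struct item *get_list(int cpu);

/* same two-statement shape as the helper under discussion */
#define for_each_cpu_item(pos, cpu, list)                       \
        list = get_list(cpu);                                   \
        if (list)                                               \
                for (pos = list; pos; pos = pos->next)

static struct item one = { 1, NULL };

static struct item *get_list(int cpu)
{
        return cpu == 0 ? &one : NULL;  /* only cpu 0 has a list in this mock */
}

int main(void)
{
        struct item *pos, *list;

        for_each_cpu_item(pos, 0, list)         /* list found: body runs */
                printf("cpu 0: %d\n", pos->val);

        for_each_cpu_item(pos, 1, list)         /* no list: body is skipped */
                printf("cpu 1: %d\n", pos->val);

        return 0;
}

In the patch the call sites always brace the loop body and sit inside braced
branches, so this is only a usage constraint to keep in mind for the macro as
defined.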