From: Alexander Shishkin
To: Alexey Budankov, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo
Cc: Andi Kleen, Kan Liang, Dmitri Prokhorov, Valery Cherepennikov,
	Mark Rutland, Stephane Eranian, David Carrillo-Cisneros, linux-kernel
Subject: Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current CPU's events list on mux interrupt
Date: Wed, 23 Aug 2017 14:54:27 +0300

Alexey Budankov writes:

> This patch implements mux switch that triggers skipping to the
> current CPU's events list at mulitplexing hrtimer interrupt
> handler as well as adoption of the switch in the existing
> implementation.
>
> perf_event_groups_iterate_cpu() API is introduced to implement
> iteration thru the certain CPU groups list skipping groups

"through"

> allocated for the other CPUs.
>
> Signed-off-by: Alexey Budankov
> ---
>  kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 137 insertions(+), 56 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 08ccfb2..aeb0f81 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
>  static atomic64_t perf_event_id;
>
>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> -			      enum event_type_t event_type);
> +			      enum event_type_t event_type, int mux);
>
>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>  			     enum event_type_t event_type,
> -			     struct task_struct *task);
> +			     struct task_struct *task, int mux);
>
>  static void update_context_time(struct perf_event_context *ctx);
>  static u64 perf_event_time(struct perf_event *event);
> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>  	struct perf_cpu_context *cpuctx;
>  	struct list_head *list;
>  	unsigned long flags;
> +	int mux = 0;
>
>  	/*
>  	 * Disable interrupts and preemption to avoid this CPU's
> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>  		perf_pmu_disable(cpuctx->ctx.pmu);
>
>  		if (mode & PERF_CGROUP_SWOUT) {
> -			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +			cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
>  			/*
>  			 * must not be done before ctxswout due
>  			 * to event_filter_match() in event_sched_out()
> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>  			 */
>  			cpuctx->cgrp = perf_cgroup_from_task(task,
>  							     &cpuctx->ctx);
> -			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> +			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);

'mux' is always zero in this function, isn't it?
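
If so, the new argument buys nothing on this path. Purely as a sketch
(untested), the two calls above could pass the literal and the local
'mux' could be dropped:

		cpu_ctx_sched_out(cpuctx, EVENT_ALL, 0);
		/* ... */
		cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, 0);

The same goes for the other callers below that only ever pass mux == 0.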
>  		}
>  		perf_pmu_enable(cpuctx->ctx.pmu);
>  		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
>   */
>  #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
>  	for (iter = rb_first(tree); iter; iter = rb_next(iter))	\
> -		list_for_each_entry(event, &(rb_entry(iter,		\
> -				typeof(*event), node)->list), link)
> +		list_for_each_entry(event, &(rb_entry(iter,		\
> +				typeof(*event), node)->list), link)

Is this an indentation change? What is it doing here?

> +
> +/*
> + * Iterate event groups related to specific cpu.
> + */
> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link)	\
> +	list = perf_event_groups_get_list(tree, cpu);			\
> +	if (list)							\
> +		list_for_each_entry(event, list, link)

..or not, if there's no list. (One possible rework is sketched a bit
further down.)

>
>  /*
>   * Add a event from the lists for its context.
> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>
>  static void ctx_sched_out(struct perf_event_context *ctx,
>  			  struct perf_cpu_context *cpuctx,
> -			  enum event_type_t event_type);
> +			  enum event_type_t event_type, int mux);
>  static void
>  ctx_sched_in(struct perf_event_context *ctx,
>  	     struct perf_cpu_context *cpuctx,
>  	     enum event_type_t event_type,
> -	     struct task_struct *task);
> +	     struct task_struct *task, int mux);
>
>  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
>  			       struct perf_event_context *ctx,
>  			       enum event_type_t event_type)
>  {
> +	int mux = 0;
> +
>  	if (!cpuctx->task_ctx)
>  		return;
>
>  	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>  		return;
>
> -	ctx_sched_out(ctx, cpuctx, event_type);
> +	ctx_sched_out(ctx, cpuctx, event_type, mux);

Just use 0.

>  }
>
>  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>  				struct perf_event_context *ctx,
> -				struct task_struct *task)
> +				struct task_struct *task, int mux)
>  {
> -	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
> +	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
>  	if (ctx)
> -		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
> -	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
> +		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
> +	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
>  	if (ctx)
> -		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
> +		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
>  }
>
>  /*
> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>  {
>  	enum event_type_t ctx_event_type = event_type & EVENT_ALL;
>  	bool cpu_event = !!(event_type & EVENT_CPU);
> +	int mux = 0;
>
>  	/*
>  	 * If pinned groups are involved, flexible groups also need to be
> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>  	 * - otherwise, do nothing more.
>  	 */
>  	if (cpu_event)
> -		cpu_ctx_sched_out(cpuctx, ctx_event_type);
> +		cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
>  	else if (ctx_event_type & EVENT_PINNED)
> -		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> +		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>
> -	perf_event_sched_in(cpuctx, task_ctx, current);
> +	perf_event_sched_in(cpuctx, task_ctx, current, mux);

Also mux==0 in all cases in this function.
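
On the perf_event_groups_for_each_cpu() rework mentioned above: as
posted the macro expands to two statements, an assignment followed by a
bare if, so it can't be used everywhere a single statement is expected,
and a caller's else would bind to the macro's if. One possible rework,
only a sketch and untested, keeps the "silently do nothing when there is
no list" behaviour but stays a single for statement:

/*
 * Iterate event groups related to specific cpu,
 * or do nothing if that cpu has no groups list.
 */
#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link)	\
	for (list = perf_event_groups_get_list(tree, cpu);		\
	     list; list = NULL)						\
		list_for_each_entry(event, list, link)

The outer for runs its body at most once (list is reset to NULL after
the first pass), so the whole thing expands to one statement and still
skips cleanly when perf_event_groups_get_list() returns NULL.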
>  	perf_pmu_enable(cpuctx->ctx.pmu);
>  }
>
> @@ -2491,7 +2503,7 @@ static int __perf_install_in_context(void *info)
>  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>  	struct perf_event_context *task_ctx = cpuctx->task_ctx;
>  	bool reprogram = true;
> -	int ret = 0;
> +	int ret = 0, mux =0;
>
>  	raw_spin_lock(&cpuctx->ctx.lock);
>  	if (ctx->task) {
> @@ -2518,7 +2530,7 @@ static int __perf_install_in_context(void *info)
>  	}
>
>  	if (reprogram) {
> -		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>  		add_event_to_ctx(event, ctx);
>  		ctx_resched(cpuctx, task_ctx, get_event_type(event));
>  	} else {
> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
>  {
>  	struct perf_event *leader = event->group_leader;
>  	struct perf_event_context *task_ctx;
> +	int mux = 0;
>
>  	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
>  	    event->state <= PERF_EVENT_STATE_ERROR)
>  		return;
>
>  	if (ctx->is_active)
> -		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
> +		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>
>  	__perf_event_mark_enabled(event);
>
> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
>  	if (!event_filter_match(event)) {
>  		if (is_cgroup_event(event))
>  			perf_cgroup_defer_enabled(event);
> -		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> +		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>  		return;
>  	}
>
> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
>  	 * then don't put it on unless the group is on.
>  	 */
>  	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
> -		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
> +		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);

And here.

>  		return;
>  	}
>
> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>
>  static void ctx_sched_out(struct perf_event_context *ctx,
>  			  struct perf_cpu_context *cpuctx,
> -			  enum event_type_t event_type)
> +			  enum event_type_t event_type, int mux)
>  {
>  	int is_active = ctx->is_active;
> +	struct list_head *group_list;
>  	struct perf_event *event;
>  	struct rb_node *node;
> +	int sw = -1, cpu = smp_processor_id();

Same thing seems to be happening with 'sw'.

>  	lockdep_assert_held(&ctx->lock);
>
>  	if (likely(!ctx->nr_events)) {
> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
>  	perf_pmu_disable(ctx->pmu);
>
> -	if (is_active & EVENT_PINNED)
> -		perf_event_groups_for_each(event, node,
> -				&ctx->pinned_groups, group_node,
> -				group_list, group_entry)
> -			group_sched_out(event, cpuctx, ctx);
> +	if (is_active & EVENT_PINNED) {
> +		if (mux) {

So it's 'rotate', really.
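
If it is only ever true on the rotation path, naming it that (and making
it a bool) would keep the call sites self-documenting. A sketch of the
prototypes, untested and assuming nothing else changes:

static void ctx_sched_out(struct perf_event_context *ctx,
			  struct perf_cpu_context *cpuctx,
			  enum event_type_t event_type, bool rotate);

static void
ctx_sched_in(struct perf_event_context *ctx,
	     struct perf_cpu_context *cpuctx,
	     enum event_type_t event_type,
	     struct task_struct *task, bool rotate);

Then everything outside perf_rotate_context() passes false and all the
'int mux = 0' temporaries go away.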
> +			perf_event_groups_for_each_cpu(event, cpu,
> +					&ctx->pinned_groups,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +			perf_event_groups_for_each_cpu(event, sw,
> +					&ctx->pinned_groups,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +		} else {
> +			perf_event_groups_for_each(event, node,
> +					&ctx->pinned_groups, group_node,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +		}
> +	}
>
> -	if (is_active & EVENT_FLEXIBLE)
> -		perf_event_groups_for_each(event, node,
> -				&ctx->flexible_groups, group_node,
> -				group_list, group_entry)
> -			group_sched_out(event, cpuctx, ctx);
> +	if (is_active & EVENT_FLEXIBLE) {
> +		if (mux) {
> +			perf_event_groups_for_each_cpu(event, cpu,
> +					&ctx->flexible_groups,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +			perf_event_groups_for_each_cpu(event, sw,
> +					&ctx->flexible_groups,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +		} else {
> +			perf_event_groups_for_each(event, node,
> +					&ctx->flexible_groups, group_node,
> +					group_list, group_entry) {
> +				group_sched_out(event, cpuctx, ctx);
> +			}
> +		}
> +	}
>
>  	perf_pmu_enable(ctx->pmu);
>  }
> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>   * Called with IRQs disabled
>   */
>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> -			      enum event_type_t event_type)
> +			      enum event_type_t event_type, int mux)
>  {
> -	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
> +	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
>  }
>
>  static void
> @@ -3287,11 +3332,13 @@ static void
>  ctx_sched_in(struct perf_event_context *ctx,
>  	     struct perf_cpu_context *cpuctx,
>  	     enum event_type_t event_type,
> -	     struct task_struct *task)
> +	     struct task_struct *task, int mux)
>  {
>  	int is_active = ctx->is_active;
> +	struct list_head *group_list;
>  	struct perf_event *event;
>  	struct rb_node *node;
> +	int sw = -1, cpu = smp_processor_id();
>
>  	lockdep_assert_held(&ctx->lock);
>
> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
>  	 * First go through the list and put on any pinned groups
>  	 * in order to give them the best chance of going on.
>  	 */
> -	if (is_active & EVENT_PINNED)
> -		perf_event_groups_for_each(event, node,
> -				&ctx->pinned_groups, group_node,
> -				group_list, group_entry)
> -			ctx_pinned_sched_in(event, cpuctx, ctx);
> +	if (is_active & EVENT_PINNED) {
> +		if (mux) {
> +			perf_event_groups_for_each_cpu(event, sw,
> +					&ctx->pinned_groups,
> +					group_list, group_entry) {
> +				ctx_pinned_sched_in(event, cpuctx, ctx);
> +			}
> +			perf_event_groups_for_each_cpu(event, cpu,
> +					&ctx->pinned_groups,
> +					group_list, group_entry) {
> +				ctx_pinned_sched_in(event, cpuctx, ctx);
> +			}
> +		} else {
> +			perf_event_groups_for_each(event, node,
> +					&ctx->pinned_groups, group_node,
> +					group_list, group_entry) {
> +				ctx_pinned_sched_in(event, cpuctx, ctx);
> +			}
> +		}
> +	}
>
>  	/* Then walk through the lower prio flexible groups */
>  	if (is_active & EVENT_FLEXIBLE) {
>  		int can_add_hw = 1;
> -		perf_event_groups_for_each(event, node,
> -				&ctx->flexible_groups, group_node,
> -				group_list, group_entry)
> -			ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
> +		if (mux) {
> +			perf_event_groups_for_each_cpu(event, sw,
> +					&ctx->flexible_groups,
> +					group_list, group_entry) {
> +				ctx_flexible_sched_in(event, cpuctx,
> +						ctx, &can_add_hw);
> +			}
> +			can_add_hw = 1;
> +			perf_event_groups_for_each_cpu(event, cpu,
> +					&ctx->flexible_groups,
> +					group_list, group_entry) {
> +				ctx_flexible_sched_in(event, cpuctx,
> +						ctx, &can_add_hw);
> +			}
> +		} else {
> +			perf_event_groups_for_each(event, node,
> +					&ctx->flexible_groups, group_node,
> +					group_list, group_entry) {
> +				ctx_flexible_sched_in(event, cpuctx,
> +						ctx, &can_add_hw);
> +			}
> +		}
>  	}
>  }
>
>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>  			     enum event_type_t event_type,
> -			     struct task_struct *task)
> +			     struct task_struct *task, int mux)
>  {
>  	struct perf_event_context *ctx = &cpuctx->ctx;
>
> -	ctx_sched_in(ctx, cpuctx, event_type, task);
> +	ctx_sched_in(ctx, cpuctx, event_type, task, mux);
>  }
>
>  static void perf_event_context_sched_in(struct perf_event_context *ctx,
>  					struct task_struct *task)
>  {
>  	struct perf_cpu_context *cpuctx;
> +	int mux = 0;
>
>  	cpuctx = __get_cpu_context(ctx);
>  	if (cpuctx->task_ctx == ctx)
> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>  	 * events, no need to flip the cpuctx's events around.
>  	 */
>  	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
> -		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> -	perf_event_sched_in(cpuctx, ctx, task);
> +		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
> +	perf_event_sched_in(cpuctx, ctx, task, mux);
>  	perf_pmu_enable(ctx->pmu);
>
>  unlock:
> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
>  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>  {
>  	struct perf_event_context *ctx = NULL;
> -	int rotate = 0;
> +	int rotate = 0, mux = 1;
>
>  	if (cpuctx->ctx.nr_events) {
>  		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>  	perf_pmu_disable(cpuctx->ctx.pmu);
>
> -	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
> +	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);

It's '1'.
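
More generally, the mux branches in ctx_sched_out()/ctx_sched_in()
repeat the same walk twice, once for the current CPU's list and once for
the software (-1) list. A hypothetical helper (name and shape made up
here, untested) could fold that pattern for the sched-out side:

static void ctx_cpu_groups_sched_out(struct perf_event_context *ctx,
				     struct perf_cpu_context *cpuctx,
				     struct rb_root *groups, int cpu)
{
	/* Only walk the groups list of @cpu; no list, nothing to do. */
	struct list_head *list = perf_event_groups_get_list(groups, cpu);
	struct perf_event *event;

	if (!list)
		return;

	list_for_each_entry(event, list, group_entry)
		group_sched_out(event, cpuctx, ctx);
}

With that, the rotation path becomes two calls per tree (cpu and -1) and
the else branch keeps the full-tree iteration; the sched-in side would
need a similar helper, modulo the can_add_hw bookkeeping for flexible
groups.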