Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
    id S1751715AbdGRKpW (ORCPT ); Tue, 18 Jul 2017 06:45:22 -0400
Received: from terminus.zytor.com ([65.50.211.136]:58319 "EHLO terminus.zytor.com"
    rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751404AbdGRKpS (ORCPT );
    Tue, 18 Jul 2017 06:45:18 -0400
Date: Tue, 18 Jul 2017 03:39:49 -0700
From: tip-bot for Ingo Molnar
Message-ID:
Cc: acme@redhat.com, alexander.shishkin@linux.intel.com, peterz@infradead.org,
    eranian@gmail.com, mingo@kernel.org, linux-kernel@vger.kernel.org,
    jolsa@redhat.com, torvalds@linux-foundation.org, acme@infradead.org,
    hpa@zytor.com, tglx@linutronix.de, vincent.weaver@maine.edu
Reply-To: acme@infradead.org, tglx@linutronix.de, vincent.weaver@maine.edu,
    hpa@zytor.com, peterz@infradead.org, eranian@gmail.com, acme@redhat.com,
    alexander.shishkin@linux.intel.com, jolsa@redhat.com, mingo@kernel.org,
    linux-kernel@vger.kernel.org, torvalds@linux-foundation.org
In-Reply-To: <20170715110049.36jvxnidy2flh6ll@gmail.com>
References: <20170715110049.36jvxnidy2flh6ll@gmail.com>
To: linux-tip-commits@vger.kernel.org
Subject: [tip:perf/urgent] Revert "perf/core: Optimize event rescheduling on active contexts"
Git-Commit-ID: 770f8eb8a990a8904bfd8a6849be147b40b6e1aa
X-Mailer: tip-git-log-daemon
Robot-ID:
Robot-Unsubscribe: Contact to get blacklisted from these emails
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset=UTF-8
Content-Disposition: inline
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 8074
Lines: 234

Commit-ID:  770f8eb8a990a8904bfd8a6849be147b40b6e1aa
Gitweb:     http://git.kernel.org/tip/770f8eb8a990a8904bfd8a6849be147b40b6e1aa
Author:     Ingo Molnar
AuthorDate: Sat, 15 Jul 2017 13:00:49 +0200
Committer:  Ingo Molnar
CommitDate: Tue, 18 Jul 2017 10:44:47 +0200

Revert "perf/core: Optimize event rescheduling on active contexts"

This reverts commit 487f05e18aa4efacee6357480f293a5afe6593b5.

Vince Weaver reported that it breaks a testcase for pinned events:

| I've bisected one of them, this report is about:
|
|     tests/overflow/simul_oneshot_group_overflow
|
| This test creates an event group containing two sampling events, set
| to overflow to a signal handler (which disables and then refreshes the
| event).
|
| On a good kernel you get the following:
|     Event perf::instructions with period 1000000
|     Event perf::instructions with period 2000000
|     fd 3 overflows: 946 (perf::instructions/1000000)
|     fd 4 overflows: 473 (perf::instructions/2000000)
|     Ending counts:
|         Count 0: 946379875
|         Count 1: 946365218
|
| With the broken kernels you get:
|     Event perf::instructions with period 1000000
|     Event perf::instructions with period 2000000
|     fd 3 overflows: 938 (perf::instructions/1000000)
|     fd 4 overflows: 318 (perf::instructions/2000000)
|     Ending counts:
|         Count 0: 946373080
|         Count 1: 653373058
...
| additional relevant detail:
| in the failing case, the group leader of the event set has
| .pinned=1
| If I change that to .pinned=0 then the test passes.

As it's an optimization we can revert it for now until the root cause is
found.
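For illustration, the scenario described above boils down to roughly the
following userspace sketch: a pinned, sampling group leader plus one sampling
sibling, both counting perf::instructions, with overflows routed to a SIGIO
handler that disables and then re-arms the overflowed event. This is only a
rough sketch of the kind of setup the testcase uses, not the PAPI test
itself; the periods, the fcntl/ioctl sequence and the omitted error handling
are assumptions made for the example.

/*
 * Rough sketch of the reported scenario (not the actual PAPI test):
 * a pinned group leader and one sibling, both sampling instructions,
 * overflowing to a SIGIO handler that disables and then re-arms the
 * overflowed event via PERF_EVENT_IOC_REFRESH.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                           int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static void overflow_handler(int sig, siginfo_t *info, void *uc)
{
        (void)sig; (void)uc;
        /* "disables and then refreshes the event" */
        ioctl(info->si_fd, PERF_EVENT_IOC_DISABLE, 0);
        ioctl(info->si_fd, PERF_EVENT_IOC_REFRESH, 1);
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = overflow_handler,
                                .sa_flags = SA_SIGINFO };
        struct perf_event_attr attr;
        long long count;
        int fds[2];

        sigaction(SIGIO, &sa, NULL);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 1000000;
        attr.pinned = 1;                /* the detail that makes the test fail */
        attr.disabled = 1;
        fds[0] = perf_event_open(&attr, 0, -1, -1, 0);          /* group leader */

        attr.sample_period = 2000000;
        attr.pinned = 0;
        attr.disabled = 0;
        fds[1] = perf_event_open(&attr, 0, -1, fds[0], 0);      /* sibling */

        for (int i = 0; i < 2; i++) {
                /* Route overflow notifications to SIGIO and arm one overflow. */
                fcntl(fds[i], F_SETFL, fcntl(fds[i], F_GETFL) | O_ASYNC);
                fcntl(fds[i], F_SETSIG, SIGIO);
                fcntl(fds[i], F_SETOWN, getpid());
                ioctl(fds[i], PERF_EVENT_IOC_REFRESH, 1);
        }

        /* ... run the workload being measured here ... */

        for (int i = 0; i < 2; i++) {
                read(fds[i], &count, sizeof(count));
                printf("Count %d: %lld\n", i, count);
        }
        return 0;
}

On a good kernel the two ending counts come out nearly identical (as in the
"good" output quoted above); with the broken optimization the sibling of the
pinned leader is scheduled out part of the time and undercounts.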
Adrian Hunter
Reported-by: Vince Weaver
Cc: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20170715110049.36jvxnidy2flh6ll@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 80 ++++++++--------------------------------------------
 1 file changed, 11 insertions(+), 69 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9747e42..778aa25 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -359,8 +359,6 @@ enum event_type_t {
         EVENT_FLEXIBLE = 0x1,
         EVENT_PINNED = 0x2,
         EVENT_TIME = 0x4,
-        /* see ctx_resched() for details */
-        EVENT_CPU = 0x8,
         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -1445,20 +1443,6 @@ static void update_group_times(struct perf_event *leader)
                 update_event_times(event);
 }
 
-static enum event_type_t get_event_type(struct perf_event *event)
-{
-        struct perf_event_context *ctx = event->ctx;
-        enum event_type_t event_type;
-
-        lockdep_assert_held(&ctx->lock);
-
-        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
-        if (!ctx->task)
-                event_type |= EVENT_CPU;
-
-        return event_type;
-}
-
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -2232,8 +2216,7 @@ ctx_sched_in(struct perf_event_context *ctx,
              struct task_struct *task);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                               struct perf_event_context *ctx,
-                               enum event_type_t event_type)
+                               struct perf_event_context *ctx)
 {
         if (!cpuctx->task_ctx)
                 return;
@@ -2241,7 +2224,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                 return;
 
-        ctx_sched_out(ctx, cpuctx, event_type);
+        ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2256,51 +2239,13 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
-/*
- * We want to maintain the following priority of scheduling:
- *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
- *  - task pinned (EVENT_PINNED)
- *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
- *  - task flexible (EVENT_FLEXIBLE).
- *
- * In order to avoid unscheduling and scheduling back in everything every
- * time an event is added, only do it for the groups of equal priority and
- * below.
- *
- * This can be called after a batch operation on task events, in which case
- * event_type is a bit mask of the types of events involved. For CPU events,
- * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
- */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
-                        struct perf_event_context *task_ctx,
-                        enum event_type_t event_type)
+                        struct perf_event_context *task_ctx)
 {
-        enum event_type_t ctx_event_type = event_type & EVENT_ALL;
-        bool cpu_event = !!(event_type & EVENT_CPU);
-
-        /*
-         * If pinned groups are involved, flexible groups also need to be
-         * scheduled out.
-         */
-        if (event_type & EVENT_PINNED)
-                event_type |= EVENT_FLEXIBLE;
-
         perf_pmu_disable(cpuctx->ctx.pmu);
         if (task_ctx)
-                task_ctx_sched_out(cpuctx, task_ctx, event_type);
-
-        /*
-         * Decide which cpu ctx groups to schedule out based on the types
-         * of events that caused rescheduling:
-         *  - EVENT_CPU: schedule out corresponding groups;
-         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
-         *  - otherwise, do nothing more.
-         */
-        if (cpu_event)
-                cpu_ctx_sched_out(cpuctx, ctx_event_type);
-        else if (ctx_event_type & EVENT_PINNED)
-                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-
+                task_ctx_sched_out(cpuctx, task_ctx);
+        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
         perf_event_sched_in(cpuctx, task_ctx, current);
         perf_pmu_enable(cpuctx->ctx.pmu);
 }
@@ -2347,7 +2292,7 @@ static int __perf_install_in_context(void *info)
         if (reprogram) {
                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                 add_event_to_ctx(event, ctx);
-                ctx_resched(cpuctx, task_ctx, get_event_type(event));
+                ctx_resched(cpuctx, task_ctx);
         } else {
                 add_event_to_ctx(event, ctx);
         }
@@ -2514,7 +2459,7 @@ static void __perf_event_enable(struct perf_event *event,
         if (ctx->task)
                 WARN_ON_ONCE(task_ctx != ctx);
 
-        ctx_resched(cpuctx, task_ctx, get_event_type(event));
+        ctx_resched(cpuctx, task_ctx);
 }
 
 /*
@@ -2941,7 +2886,7 @@ unlock:
 
         if (do_switch) {
                 raw_spin_lock(&ctx->lock);
-                task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+                task_ctx_sched_out(cpuctx, ctx);
                 raw_spin_unlock(&ctx->lock);
         }
 }
@@ -3498,7 +3443,6 @@ static int event_enable_on_exec(struct perf_event *event,
 static void perf_event_enable_on_exec(int ctxn)
 {
         struct perf_event_context *ctx, *clone_ctx = NULL;
-        enum event_type_t event_type = 0;
         struct perf_cpu_context *cpuctx;
         struct perf_event *event;
         unsigned long flags;
@@ -3512,17 +3456,15 @@ static void perf_event_enable_on_exec(int ctxn)
         cpuctx = __get_cpu_context(ctx);
         perf_ctx_lock(cpuctx, ctx);
         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-        list_for_each_entry(event, &ctx->event_list, event_entry) {
+        list_for_each_entry(event, &ctx->event_list, event_entry)
                 enabled |= event_enable_on_exec(event, ctx);
-                event_type |= get_event_type(event);
-        }
 
         /*
          * Unclone and reschedule this context if we enabled any event.
          */
         if (enabled) {
                 clone_ctx = unclone_ctx(ctx);
-                ctx_resched(cpuctx, ctx, event_type);
+                ctx_resched(cpuctx, ctx);
         } else {
                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
         }
@@ -10466,7 +10408,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
          * in.
          */
         raw_spin_lock_irq(&child_ctx->lock);
-        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
         /*
          * Now that the context is inactive, destroy the task <-> ctx relation
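The revert itself simply makes ctx_resched() schedule out and back in the
entire task and CPU contexts (EVENT_ALL) on every resched, rather than only
the groups at or below the priority of the triggering event, as the removed
comment block above described.

As a footnote to the counts quoted in the changelog (Count 1 ending well below
Count 0 on the broken kernels): userspace can spot this kind of undercounting
by opening the group leader with read_format = PERF_FORMAT_GROUP |
PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING and comparing
time_running against time_enabled after the run. The helper below is a
hypothetical illustration, not part of this patch, and assumes the leader was
opened with exactly those read_format bits and a single sibling:

/*
 * Hypothetical helper, not part of this patch: read a two-event group whose
 * leader was opened with read_format = PERF_FORMAT_GROUP |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING, print the
 * per-event counts and report whether the group was descheduled for part of
 * the run (time_running < time_enabled).
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct group_read {
        uint64_t nr;              /* number of events in the group (2 here) */
        uint64_t time_enabled;
        uint64_t time_running;
        struct { uint64_t value; } ctr[2];
};

static int check_group_counts(int leader_fd)
{
        struct group_read gr;
        ssize_t n = read(leader_fd, &gr, sizeof(gr));

        if (n != (ssize_t)sizeof(gr) || gr.nr != 2)
                return -1;

        for (unsigned int i = 0; i < 2; i++)
                printf("Count %u: %llu\n", i,
                       (unsigned long long)gr.ctr[i].value);

        if (gr.time_running < gr.time_enabled)
                printf("group ran for only %llu of %llu ns enabled\n",
                       (unsigned long long)gr.time_running,
                       (unsigned long long)gr.time_enabled);
        return 0;
}

Called on the leader fd once the measured workload has finished, this prints
the counts in the same form as the test output above and flags any interval
during which the whole group was not scheduled.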