Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1757444AbcCaQWb (ORCPT );
	Thu, 31 Mar 2016 12:22:31 -0400
Received: from casper.infradead.org ([85.118.1.10]:52415 "EHLO casper.infradead.org"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751609AbcCaQW3 (ORCPT );
	Thu, 31 Mar 2016 12:22:29 -0400
Date: Thu, 31 Mar 2016 18:22:26 +0200
From: Peter Zijlstra
To: Arnaldo Carvalho de Melo
Cc: kan.liang@intel.com, ak@linux.intel.com, eranian@google.com,
	vincent.weaver@maine.edu, tglx@linutronix.de, mingo@kernel.org,
	acme@redhat.com, jolsa@redhat.com, alexander.shishkin@linux.intel.com,
	ying.huang@linux.intel.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH V2 1/1] perf/core: don't find side-band event from all pmus
Message-ID: <20160331162226.GG11035@twins.programming.kicks-ass.net>
References: <1458757477-3781-1-git-send-email-kan.liang@intel.com>
 <20160329120609.GG3408@twins.programming.kicks-ass.net>
 <20160331144439.GB27708@kernel.org>
 <20160331145621.GM3430@twins.programming.kicks-ass.net>
 <20160331162141.GF11035@twins.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20160331162141.GF11035@twins.programming.kicks-ass.net>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 6949
Lines: 240

On Thu, Mar 31, 2016 at 06:21:41PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 31, 2016 at 04:56:21PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 31, 2016 at 11:44:39AM -0300, Arnaldo Carvalho de Melo wrote:
> > > It probably will cope, but can't we just emit one single record?
> >
> > I'll try and figure something out...
>
> less clever but probably good enough..

I'm an idiot; quilt refresh is needed..
---
Subject: perf/core: don't find side-band event from all pmus
From: Kan Liang
Date: Wed, 23 Mar 2016 11:24:37 -0700

The perf_event_aux() function goes through all pmus and all events in
whatever contexts to find the side-band events to output, which is
unnecessary and expensive.

For example, the brk test case in lkp triggers many mmap operations,
while perf with cycles:pp is also running on the system. As a result,
perf_event_aux() is invoked many times, and each invocation searches
all pmus and all events. If uncore support is enabled (even when
uncore events are not actually used), dozens of uncore pmus are added
to the pmus list, which can significantly decrease brk_test's
ops_per_sec. In our tests, ops_per_sec without the uncore patch is
2647573, while with the uncore patch it is only 1768444, a 33.2%
reduction.

To get at the per-cpu side-band events, this patch puts the side-band
events into four categories, tracked by four per-cpu lists, and only
searches the events in the category of interest.

To get at the per-task side-band events, each task context of the
current task is searched, because we don't want to update more global
state on context switch.
Cc: vincent.weaver@maine.edu
Cc: mingo@kernel.org
Cc: acme@redhat.com
Cc: ak@linux.intel.com
Cc: jolsa@redhat.com
Cc: tglx@linutronix.de
Cc: eranian@google.com
Cc: alexander.shishkin@linux.intel.com
Reported-by: Huang, Ying
Suggested-by: Peter Zijlstra (Intel)
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com
---
 include/linux/perf_event.h |    6 +++
 kernel/events/core.c       |   87 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 80 insertions(+), 13 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -437,6 +437,11 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -589,6 +594,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -333,6 +333,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3598,6 +3599,26 @@ static void free_event_rcu(struct rcu_he
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3661,6 +3682,8 @@ static void unaccount_event(struct perf_
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5785,13 +5808,25 @@ perf_event_aux_task_ctx(perf_event_aux_o
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
 	       struct perf_event_context *task_ctx)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5806,20 +5841,15 @@ perf_event_aux(perf_event_aux_output_cb
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_iterate(output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -7986,6 +8016,32 @@ static struct pmu *perf_init_event(struc
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list, &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8066,6 +8122,8 @@ static void account_event(struct perf_ev
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -8109,7 +8167,7 @@ perf_event_alloc(struct perf_event_attr
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_HLIST_NODE(&event->hlist_entry);
-
+	INIT_LIST_HEAD(&event->sb_list);
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
 
@@ -9516,6 +9574,9 @@ static void __init perf_event_init_all_c
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 	}
 }