From: kan.liang@intel.com
To: peterz@infradead.org, mingo@redhat.com, acme@kernel.org, linux-kernel@vger.kernel.org
Cc: alexander.shishkin@linux.intel.com, tglx@linutronix.de, namhyung@kernel.org, jolsa@kernel.org, adrian.hunter@intel.com, wangnan0@huawei.com, mark.rutland@arm.com, andi@firstfloor.org, Kan Liang
Subject: [PATCH V3 1/6] perf/core: Introduce PERF_RECORD_OVERHEAD
Date: Thu, 8 Dec 2016 16:27:09 -0500
Message-Id: <1481232434-3574-2-git-send-email-kan.liang@intel.com>
In-Reply-To: <1481232434-3574-1-git-send-email-kan.liang@intel.com>
References: <1481232434-3574-1-git-send-email-kan.liang@intel.com>

From: Kan Liang

A new perf record type, PERF_RECORD_OVERHEAD, is introduced to export perf
overhead information to user space, so that the user can measure the overhead
of sampling directly. If the user does not want this feature, it can be
switched off by configuring the user space tool.

The total perf overhead is the sum of the overhead of all active PMUs. Because
a PMU's events can run on different CPUs, calculating it requires collecting
per-PMU, per-CPU overhead information. Each PMU already has its own per-CPU
cpuctx, which is a natural place to store that information.

The overhead information is emitted through the existing event log mechanism.
Note that it describes per-PMU overhead, not per-event overhead.
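To make the intended flow concrete, here is a minimal sketch of how a PMU
driver could feed this interface. It is illustrative only and not part of this
patch: it relies on the cpuctx->overhead[] storage, the attr.overhead bit and
the perf_log_overhead() helper added below, and leaves the overhead type as a
plain parameter because enum perf_record_overhead_type so far only defines
PERF_OVERHEAD_MAX; the example_* function names are hypothetical.

/*
 * Illustrative sketch only -- not part of this patch.
 */
#include <linux/perf_event.h>
#include <linux/smp.h>

/* Account one observation of overhead (e.g. one NMI handler invocation). */
static void example_account_overhead(struct perf_cpu_context *cpuctx,
				     int type, u64 cost_ns)
{
	struct perf_overhead_entry *entry = &cpuctx->overhead[type];

	entry->cpu = smp_processor_id();
	entry->nr++;
	entry->time += cost_ns;
}

/* Flush the accumulated overhead through the existing event log mechanism. */
static void example_flush_overhead(struct perf_event *event,
				   struct perf_cpu_context *cpuctx, int type)
{
	struct perf_overhead_entry *entry = &cpuctx->overhead[type];

	if (!event->attr.overhead || !entry->nr)
		return;

	perf_log_overhead(event, type, entry->cpu, entry->nr, entry->time);
	entry->nr = 0;
	entry->time = 0;
}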
Signed-off-by: Kan Liang
---
 include/linux/perf_event.h      |  6 ++++++
 include/uapi/linux/perf_event.h | 38 +++++++++++++++++++++++++++++++++-
 kernel/events/core.c            | 46 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecd..946e8d8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -792,6 +792,8 @@ struct perf_cpu_context {
 
 	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
+
+	struct perf_overhead_entry	overhead[PERF_OVERHEAD_MAX];
 };
 
 struct perf_output_handle {
@@ -998,6 +1000,10 @@ perf_event__output_id_sample(struct perf_event *event,
 extern void
 perf_log_lost_samples(struct perf_event *event, u64 lost);
 
+extern void
+perf_log_overhead(struct perf_event *event, u64 type,
+		  u32 cpu, u32 nr, u64 time);
+
 static inline bool is_sampling_event(struct perf_event *event)
 {
 	return event->attr.sample_period != 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485..101f8b3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
-				__reserved_1   : 36;
+				overhead       :  1, /* Log overhead information */
+				__reserved_1   : 35;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -862,6 +863,17 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_SWITCH_CPU_WIDE		= 15,
 
+	/*
+	 * Records perf overhead
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				type;
+	 *	struct perf_overhead_entry	entry;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_OVERHEAD			= 16,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -980,4 +992,28 @@ struct perf_branch_entry {
 		reserved:44;
 };
 
+/*
+ * The overhead could be common overhead (in core code) or
+ * PMU specific overhead (in PMU specific code).
+ */
+enum perf_record_overhead_type {
+	/* common overhead */
+	/* PMU specific */
+	PERF_OVERHEAD_MAX,
+};
+
+/*
+ * single overhead record layout:
+ *
+ * cpu:  CPU id
+ * nr:   number of times the overhead happened
+ *       (e.g. for NMI, nr == number of times the NMI handler was called)
+ * time: total overhead cost in ns
+ */
+struct perf_overhead_entry {
+	__u32	cpu;
+	__u32	nr;
+	__u64	time;
+};
+
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 02c8421..1420139 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7034,6 +7034,52 @@ static void perf_log_itrace_start(struct perf_event *event)
 	perf_output_end(&handle);
 }
 
+
+/*
+ * Record overhead information
+ *
+ * The overhead logged here is the per-PMU overhead, not per-event overhead.
+ * This function only takes advantage of the existing event log mechanism
+ * to log the overhead information.
+ *
+ */
+void perf_log_overhead(struct perf_event *event, u64 type,
+		       u32 cpu, u32 nr, u64 time)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				type;
+		struct perf_overhead_entry	overhead;
+	} overhead_event = {
+		.header = {
+			.type = PERF_RECORD_OVERHEAD,
+			.misc = 0,
+			.size = sizeof(overhead_event),
+		},
+		.type = type,
+		.overhead = {
+			.cpu = cpu,
+			.nr = nr,
+			.time = time,
+		},
+	};
+
+	perf_event_header__init_id(&overhead_event.header, &sample, event);
+	ret = perf_output_begin(&handle, event, overhead_event.header.size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, overhead_event);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
 /*
  * Generic event overflow handling, sampling.
  */
-- 
2.4.3
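For completeness, a minimal user space sketch of how a tool might decode a
PERF_RECORD_OVERHEAD record from the ring buffer, assuming a kernel and uapi
header with this patch applied (which provide PERF_RECORD_OVERHEAD and
struct perf_overhead_entry); struct overhead_event and print_overhead are
illustrative names only, not part of any existing tool:

/*
 * Illustrative decoder sketch only -- not part of this patch.
 * Assumes the record layout documented in the uapi comment above:
 * header, u64 type, struct perf_overhead_entry, then sample_id.
 */
#include <stdio.h>
#include <stdint.h>
#include <linux/perf_event.h>

struct overhead_event {
	struct perf_event_header	header;
	uint64_t			type;	/* enum perf_record_overhead_type */
	struct perf_overhead_entry	entry;
	/* followed by struct sample_id when sample_id_all is set */
};

static void print_overhead(const struct perf_event_header *hdr)
{
	const struct overhead_event *ov = (const void *)hdr;

	if (hdr->type != PERF_RECORD_OVERHEAD)
		return;

	printf("overhead type %llu: cpu %u, %u hits, %llu ns total\n",
	       (unsigned long long)ov->type, ov->entry.cpu,
	       ov->entry.nr, (unsigned long long)ov->entry.time);
}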