2014-07-15 08:59:14

by Yan, Zheng

Subject: [PATCH v2 0/7] perf, x86: large PEBS interrupt threshold

This patch series implements a large PEBS interrupt threshold. For some
limited cases it can significantly reduce the sampling overhead. Please
read patch 6's commit message for more information.

changes since v1:
- drop patch 'perf, core: Add all PMUs to pmu_idr'
- add comments for case that multiple counters overflow simultaneously


2014-07-15 08:59:36

by Yan, Zheng

Subject: [PATCH v2 3/7] perf, x86: use the PEBS auto reload mechanism when possible

When a fixed period is specified, this patch makes perf use the PEBS
auto reload mechanism. This makes normal profiling faster, because
it avoids one costly MSR write in the PMI handler.
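
As a minimal editorial sketch of the idea (not part of the patch; the
structure and field names follow the diff below, the helper name is
made up): the reload value is written once into the DS area at enable
time, so the hardware re-arms the counter after every PEBS record and
the PMI handler can skip reprogramming it when the period is unchanged.

static void pebs_program_autoreload(struct debug_store *ds,
                                    struct hw_perf_event *hwc)
{
        /*
         * The counter counts up from -period and overflows at zero;
         * the hardware then reloads it from pebs_event_reset[]
         * without any MSR write from the PMI handler.
         */
        ds->pebs_event_reset[hwc->idx] =
                (u64)-hwc->sample_period & x86_pmu.cntval_mask;
}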

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------
arch/x86/kernel/cpu/perf_event_intel_ds.c | 7 +++++++
include/linux/perf_event.h | 1 +
3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8868e9b..ae723c8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -979,13 +979,16 @@ int x86_perf_event_set_period(struct perf_event *event)

per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

- /*
- * The hw event starts counting from this event offset,
- * mark it to be able to extra future deltas:
- */
- local64_set(&hwc->prev_count, (u64)-left);
+ if (!hwc->autoreload ||
+ local64_read(&hwc->prev_count) != (u64)-left) {
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extra future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);

- wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ }

/*
* Due to erratum on certan cpu we need
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 980970c..1db4ce5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -714,6 +714,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;

hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+ hwc->autoreload = !event->attr.freq;

cpuc->pebs_enabled |= 1ULL << hwc->idx;

@@ -721,6 +722,11 @@ void intel_pmu_pebs_enable(struct perf_event *event)
cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled |= 1ULL << 63;
+
+ /* Use auto-reload if possible to save a MSR write in the PMI */
+ if (hwc->autoreload)
+ ds->pebs_event_reset[hwc->idx] =
+ (u64)-hwc->sample_period & x86_pmu.cntval_mask;
}

void intel_pmu_pebs_disable(struct perf_event *event)
@@ -739,6 +745,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);

hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+ hwc->autoreload = false;
}

void intel_pmu_pebs_enable_all(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5d665e8..37a2b70 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -149,6 +149,7 @@ struct hw_perf_event {

u64 freq_time_stamp;
u64 freq_count_stamp;
+ bool autoreload;
#endif
};

--
1.9.3

2014-07-15 08:59:43

by Yan, Zheng

Subject: [PATCH v2 6/7] perf, x86: enable large PEBS interrupt threshold for SNB/IVB/HSW

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cb5a838..dba03b3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2456,6 +2456,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);

+ x86_pmu.multi_pebs = true;
pr_cont("SandyBridge events, ");
break;
case 58: /* IvyBridge */
@@ -2484,6 +2485,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);

+ x86_pmu.multi_pebs = true;
pr_cont("IvyBridge events, ");
break;

@@ -2511,6 +2513,8 @@ __init int intel_pmu_init(void)
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
x86_pmu.lbr_double_abort = true;
+
+ x86_pmu.multi_pebs = true;
pr_cont("Haswell events, ");
break;

--
1.9.3

2014-07-15 08:59:40

by Yan, Zheng

Subject: [PATCH v2 5/7] perf, x86: drain PEBS buffer during context switch

Flush the PEBS buffer during a context switch if the PEBS interrupt
threshold is larger than one record. This allows perf to supply the
TID for events.
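
The reason the flush happens on schedule-out, as a small editorial
sketch (not part of the patch; intel_pmu_drain_pebs_buffer() is added
by the diff below, the wrapper name is made up): records left in the
DS buffer would otherwise only be processed at the next PMI, by which
time another task may be running, so the samples would get the wrong
TID.

static void pebs_flush_on_ctxsw(struct perf_event_context *ctx,
                                bool sched_in)
{
        /* flush while the task that generated the records is current */
        if (!sched_in)
                intel_pmu_drain_pebs_buffer();
}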

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 3 +++
arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++-
arch/x86/kernel/cpu/perf_event_intel_ds.c | 42 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 --
4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index cb7cda8..eafea09 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -147,6 +147,7 @@ struct cpu_hw_events {
*/
struct debug_store *ds;
u64 pebs_enabled;
+ bool pebs_sched_cb_enabled;

/*
* Intel LBR bits
@@ -683,6 +684,8 @@ void intel_pmu_pebs_enable_all(void);

void intel_pmu_pebs_disable_all(void);

+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_ds_init(void);

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ef926ee..cb5a838 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2035,6 +2035,15 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}

+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (x86_pmu.pebs_active)
+ intel_pmu_pebs_sched_task(ctx, sched_in);
+ if (x86_pmu.lbr_nr)
+ intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");

PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2086,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .sched_task = intel_pmu_lbr_sched_task,
+ .sched_task = intel_pmu_sched_task,
};

static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e17eb5b..dec8b8a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -705,6 +705,26 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}

+void intel_pmu_drain_pebs_buffer(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct pt_regs regs;
+
+ if (!x86_pmu.pebs_active)
+ return;
+ if (ds->pebs_index <= ds->pebs_buffer_base)
+ return;
+
+ x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (!sched_in)
+ intel_pmu_drain_pebs_buffer();
+}
+
/*
* Flags PEBS can handle without an PMI.
*
@@ -743,8 +763,16 @@ void intel_pmu_pebs_enable(struct perf_event *event)
!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) {
threshold = ds->pebs_absolute_maximum -
x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ if (first_pebs) {
+ perf_sched_cb_enable(event->ctx->pmu);
+ cpuc->pebs_sched_cb_enabled = true;
+ }
} else {
threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ if (cpuc->pebs_sched_cb_enabled) {
+ perf_sched_cb_disable(event->ctx->pmu);
+ cpuc->pebs_sched_cb_enabled = false;
+ }
}
if (first_pebs || ds->pebs_interrupt_threshold > threshold)
ds->pebs_interrupt_threshold = threshold;
@@ -759,8 +787,19 @@ void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ bool multi_pebs = false;
+
+ if (ds->pebs_interrupt_threshold >
+ ds->pebs_buffer_base + x86_pmu.pebs_record_size)
+ multi_pebs = true;

cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+ if (cpuc->pebs_sched_cb_enabled &&
+ !(cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1))) {
+ perf_sched_cb_disable(event->ctx->pmu);
+ cpuc->pebs_sched_cb_enabled = false;
+ }

if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
@@ -772,6 +811,9 @@ void intel_pmu_pebs_disable(struct perf_event *event)

hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
hwc->autoreload = false;
+
+ if (multi_pebs)
+ intel_pmu_drain_pebs_buffer();
}

void intel_pmu_pebs_enable_all(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 430f1ad..a3df61d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -199,8 +199,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

- if (!x86_pmu.lbr_nr)
- return;
/*
* Reset the LBR stack if we changed task context to
* avoid data leaks.
--
1.9.3

2014-07-15 09:00:18

by Yan, Zheng

Subject: [PATCH v2 7/7] tools, perf: Allow the user to disable time stamps

From: Andi Kleen <[email protected]>

Time stamps are currently always implicitly enabled for perf record.
The old --time/-T option is a nop.

Allow the user to disable time stamps by using --no-time.

This can cause some minor misaccounting (by missing mmaps), but it
significantly lowers the size of perf.data.

The defaults are unchanged.

Signed-off-by: Andi Kleen <[email protected]>
---
tools/perf/builtin-record.c | 1 +
tools/perf/util/evsel.c | 9 ++++++---
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 378b85b..8728c7c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -776,6 +776,7 @@ static const char * const record_usage[] = {
*/
static struct record record = {
.opts = {
+ .sample_time = true,
.mmap_pages = UINT_MAX,
.user_freq = UINT_MAX,
.user_interval = ULLONG_MAX,
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8606175..1bc4093 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -632,9 +632,12 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
if (opts->period)
perf_evsel__set_sample_bit(evsel, PERIOD);

- if (!perf_missing_features.sample_id_all &&
- (opts->sample_time || !opts->no_inherit ||
- target__has_cpu(&opts->target) || per_cpu))
+ /*
+ * When the user explicitely disabled time don't force it here.
+ */
+ if (opts->sample_time &&
+ (!perf_missing_features.sample_id_all &&
+ (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu)))
perf_evsel__set_sample_bit(evsel, TIME);

if (opts->raw_samples) {
--
1.9.3

2014-07-15 09:00:49

by Yan, Zheng

Subject: [PATCH v2 2/7] perf, x86: use context switch callback to flush LBR stack

The previous commit introduced a context switch callback whose function
overlaps with the flush branch stack callback, so we can use the
context switch callback to flush the LBR stack.

This patch adds code that uses the context switch callback to flush
the LBR stack when a task is scheduled in. The callback is enabled
only when there are events that use the LBR hardware. This patch also
removes all of the old flush branch stack code.
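
The shape of the change, as a small editorial sketch (not part of the
patch; intel_pmu_lbr_reset() is existing code, the function name here
is made up): on schedule-in the LBR stack is simply reset, because LBR
entries carry no PID and branches of the previous task must not leak
into the new one. The callback itself is only registered, via
perf_sched_cb_enable()/perf_sched_cb_disable(), while cpuc->lbr_users
is non-zero.

static void lbr_flush_on_ctxsw(struct perf_event_context *ctx,
                               bool sched_in)
{
        /* LBR entries are not tagged with a PID; reset on task switch */
        if (sched_in)
                intel_pmu_lbr_reset();
}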

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 7 ---
arch/x86/kernel/cpu/perf_event.h | 3 +-
arch/x86/kernel/cpu/perf_event_intel.c | 14 +-----
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 32 +++++++++++--
include/linux/perf_event.h | 6 ---
kernel/events/core.c | 77 ------------------------------
6 files changed, 30 insertions(+), 109 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7d22972..8868e9b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1880,12 +1880,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
x86_pmu.sched_task(ctx, sched_in);
}

-static void x86_pmu_flush_branch_stack(void)
-{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
-}
-
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
@@ -1912,7 +1906,6 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,

.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
.sched_task = x86_pmu_sched_task,
};

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index e70b352..d8165f3 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -428,7 +428,6 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);

void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
void (*sched_task)(struct perf_event_context *ctx,
bool sched_in);

@@ -685,6 +684,8 @@ void intel_pmu_pebs_disable_all(void);

void intel_ds_init(void);

+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);

void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa..ef926ee 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2035,18 +2035,6 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}

-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
-}
-
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");

PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2098,7 +2086,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .sched_task = intel_pmu_lbr_sched_task,
};

static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 9dd2459..d6d5fcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -181,7 +181,7 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_64();
}

-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

@@ -189,6 +189,23 @@ void intel_pmu_lbr_enable(struct perf_event *event)
return;

/*
+ * It is necessary to flush the stack on context switch. This happens
+ * when the branch stack does not tag its entries with the pid of the
+ * current task.
+ */
+ if (sched_in) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = ctx;
+ }
+}
+
+void intel_pmu_lbr_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+ /*
* Reset the LBR stack if we changed task context to
* avoid data leaks.
*/
@@ -199,6 +216,8 @@ void intel_pmu_lbr_enable(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;

cpuc->lbr_users++;
+ if (cpuc->lbr_users == 1)
+ perf_sched_cb_enable(event->ctx->pmu);
}

void intel_pmu_lbr_disable(struct perf_event *event)
@@ -211,10 +230,13 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);

- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
+ if (!cpuc->lbr_users) {
+ perf_sched_cb_disable(event->ctx->pmu);
+ if (cpuc->enabled) {
+ __intel_pmu_lbr_disable();
+ /* avoid stale pointer */
+ cpuc->lbr_context = NULL;
+ }
}
}

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 73f3afa..5d665e8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -259,11 +259,6 @@ struct pmu {
int (*event_idx) (struct perf_event *event); /*optional */

/*
- * flush branch stack on context-switches (needed in cpu-wide mode)
- */
- void (*flush_branch_stack) (void);
-
- /*
* context-switches callback for CPU PMU. Other PMUs shouldn't set
* this callback
*/
@@ -512,7 +507,6 @@ struct perf_event_context {
u64 generation;
int pin_count;
int nr_cgroups; /* cgroup evts */
- int nr_branch_stack; /* branch_stack evt */
struct rcu_head rcu_head;
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e721fa..31f9209 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -143,7 +143,6 @@ enum event_type_t {
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
@@ -1137,9 +1136,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;

- if (has_branch_stack(event))
- ctx->nr_branch_stack++;
-
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
@@ -1302,9 +1298,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}

- if (has_branch_stack(event))
- ctx->nr_branch_stack--;
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -2602,64 +2595,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
}

/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- /* no need to flush branch stack if not changing task */
- if (prev == task)
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- /*
- * check if the context has at least one
- * event using PERF_SAMPLE_BRANCH_STACK
- */
- if (cpuctx->ctx.nr_branch_stack > 0
- && pmu->flush_branch_stack) {
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
- perf_pmu_disable(pmu);
-
- pmu->flush_branch_stack();
-
- perf_pmu_enable(pmu);
-
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
- }
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
-}
-
-/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
@@ -2691,10 +2626,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);

- /* check for system-wide branch_stack events */
- if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
- perf_branch_stack_sched_in(prev, task);
-
if (__get_cpu_var(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
@@ -3280,10 +3211,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;

- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
@@ -6769,10 +6696,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;

- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
--
1.9.3

2014-07-15 09:00:47

by Yan, Zheng

Subject: [PATCH v2 4/7] perf, x86: large PEBS interrupt threshold

PEBS always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this but always set the
PEBS threshold to one.

For frequently occurring events (like cycles or branches or load/stores)
this in turn requires using a relatively high sampling period to avoid
overloading the system with PMI processing. This in turn increases
sampling error.

For the common cases we still need to use the PMI because the PEBS
hardware has various limitations. The biggest one is that it cannot
supply a callgraph. It also requires setting a fixed period, as the
hardware does not support an adaptive period. Another issue is that it
cannot supply a time stamp and some other options. To supply a TID it
requires flushing on context switch. It can, however, supply the IP,
the load/store address, TSX information, registers, and some other
things.

So we can make PEBS work for some specific cases: basically, as long
as you can do without a callgraph and can set a fixed period, you can
use this new PEBS mode.

The main benefit is the ability to support a much lower sampling period
(down to -c 1000) without excessive overhead.

One use case is, for example, to increase the resolution of the c2c
tool. Another is double checking when you suspect that standard
sampling has too much sampling error.

Some numbers on the overhead, using cycle soak, comparing
"perf record --no-time -e cycles:p -c" to "perf record -e cycles:p -c":

  period   plain   multi   delta
   10003   15      5       10
   20003   15.7    4       11.7
   40003    8.7    2.5      6.2
   80003    4.1    1.4      2.7
  100003    3.6    1.2      2.4
  800003    4.4    1.4      3
 1000003    0.6    0.4      0.2
 2000003    0.4    0.3      0.1
 4000003    0.3    0.2      0.1
10000003    0.3    0.2      0.1

The interesting part is the delta between multi-pebs and normal pebs. Above
-c 1000003 it does not really matter because the basic overhead is so low.
With periods below 80003 it becomes interesting.

Note that in some other workloads (e.g. kernbench) the smaller sampling
periods cause much more overhead without multi-pebs; up to 80% overhead
(and throttling) has been observed with -c 10003. Multi-pebs generally
does not throttle.
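
To make the threshold arithmetic concrete, here is a small editorial
sketch (not part of the patch; the expressions mirror the diff below,
the helper name is made up). The large threshold leaves
max_pebs_events records of headroom below the absolute end of the
buffer, so a PMI raised exactly at the threshold still has room for
one more record per active counter; the small threshold reproduces the
traditional interrupt-per-record behaviour.

static u64 pebs_interrupt_threshold(struct debug_store *ds, bool large)
{
        if (large)
                return ds->pebs_absolute_maximum -
                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;

        /* traditional behaviour: interrupt after a single record */
        return ds->pebs_buffer_base + x86_pmu.pebs_record_size;
}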

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 1 +
arch/x86/kernel/cpu/perf_event_intel_ds.c | 98 +++++++++++++++++++++---------
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 --
3 files changed, 71 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index d8165f3..cb7cda8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -450,6 +450,7 @@ struct x86_pmu {
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
int max_pebs_events;
+ bool multi_pebs;

/*
* Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1db4ce5..e17eb5b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -11,7 +11,7 @@
#define BTS_RECORD_SIZE 24

#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE PAGE_SIZE
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE

/*
@@ -251,7 +251,7 @@ static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
- int max, thresh = 1; /* always use a single PEBS record */
+ int max;
void *buffer, *ibuffer;

if (!x86_pmu.pebs)
@@ -281,9 +281,6 @@ static int alloc_pebs_buffer(int cpu)
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
max * x86_pmu.pebs_record_size;

- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- thresh * x86_pmu.pebs_record_size;
-
return 0;
}

@@ -708,14 +705,29 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}

+/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ */
+#define PEBS_FREERUNNING_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+ PERF_SAMPLE_TRANSACTION)
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+ bool first_pebs;

hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
hwc->autoreload = !event->attr.freq;

+ first_pebs = !(cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
cpuc->pebs_enabled |= 1ULL << hwc->idx;

if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -723,6 +735,20 @@ void intel_pmu_pebs_enable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled |= 1ULL << 63;

+ /*
+ * When the event is constrained enough we can use a larger
+ * threshold and run the event with less frequent PMI.
+ */
+ if (x86_pmu.multi_pebs && hwc->autoreload &&
+ !(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+ if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+ ds->pebs_interrupt_threshold = threshold;
+
/* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->autoreload)
ds->pebs_event_reset[hwc->idx] =
@@ -867,7 +893,8 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
}

static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs, void *__pebs)
+ struct pt_regs *iregs, void *__pebs,
+ bool first_record)
{
/*
* We cast to the biggest pebs_record but are careful not to
@@ -880,7 +907,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
u64 sample_type;
int fll, fst;

- if (!intel_pmu_save_and_restart(event))
+ if (first_record && !intel_pmu_save_and_restart(event))
return;

fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
@@ -956,8 +983,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;

- if (perf_event_overflow(event, &data, &regs))
- x86_pmu_stop(event, 0);
+ if (first_record) {
+ if (perf_event_overflow(event, &data, &regs))
+ x86_pmu_stop(event, 0);
+ } else {
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size))
+ return;
+
+ perf_output_sample(&handle, &header, &data, event);
+
+ perf_output_end(&handle);
+ }
}

static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -998,17 +1039,18 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
at += n - 1;

- __intel_pmu_pebs_event(event, iregs, at);
+ __intel_pmu_pebs_event(event, iregs, at, true);
}

static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
- struct perf_event *event = NULL;
+ struct perf_event *event;
void *at, *top;
u64 status = 0;
int bit;
+ bool multi_pebs, first_record;

if (!x86_pmu.pebs_active)
return;
@@ -1021,17 +1063,19 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
if (unlikely(at > top))
return;

- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
- "Unexpected number of pebs records %ld\n",
- (long)(top - at) / x86_pmu.pebs_record_size);
+ if (ds->pebs_interrupt_threshold >
+ ds->pebs_buffer_base + x86_pmu.pebs_record_size)
+ multi_pebs = true;
+ else
+ multi_pebs = false;

for (; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;

+ /*
+ * PEBS creates only one entry if multiple counters
+ * overflow simultaneously.
+ */
for_each_set_bit(bit, (unsigned long *)&p->status,
x86_pmu.max_pebs_events) {
event = cpuc->events[bit];
@@ -1042,17 +1086,15 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)

if (!event->attr.precise_ip)
continue;
-
- if (__test_and_set_bit(bit, (unsigned long *)&status))
- continue;
-
- break;
+ if (!__test_and_set_bit(bit, (unsigned long *)&status)) {
+ first_record = true;
+ } else {
+ if (!multi_pebs)
+ continue;
+ first_record = false;
+ }
+ __intel_pmu_pebs_event(event, iregs, at, first_record);
}
-
- if (!event || bit >= x86_pmu.max_pebs_events)
- continue;
-
- __intel_pmu_pebs_event(event, iregs, at);
}
}

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d6d5fcf..430f1ad 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -184,10 +184,6 @@ void intel_pmu_lbr_reset(void)
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu.lbr_nr)
- return;
-
/*
* It is necessary to flush the stack on context switch. This happens
* when the branch stack does not tag its entries with the pid of the
@@ -408,7 +404,6 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)

if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
-
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
--
1.9.3

2014-07-15 08:59:27

by Yan, Zheng

Subject: [PATCH v2 1/7] perf, core: introduce pmu context switch callback

The callback is invoked when a process is scheduled in or out. It
provides a mechanism for later patches to save/restore the LBR stack.
For the schedule-in case, the callback is invoked at the same place
where the flush branch stack callback is invoked, so it can also
replace the flush branch stack callback. To avoid unnecessary
overhead, the callback is enabled only when there are events that use
the LBR stack.
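
As a rough editorial sketch of how a PMU driver is expected to use the
new hooks (only perf_sched_cb_enable()/perf_sched_cb_disable() and the
->sched_task() signature come from this patch; every my_pmu_* name is
made up): the driver counts users per CPU, and the core invokes the
callback on every task switch while that count is non-zero.

static DEFINE_PER_CPU(int, my_pmu_cb_users);    /* hypothetical */

static void my_pmu_sched_task(struct perf_event_context *ctx,
                              bool sched_in)
{
        if (sched_in)
                my_pmu_reset_task_state();      /* hypothetical helper */
}

static void my_pmu_event_start(struct perf_event *event)
{
        /* first user on this CPU: start receiving ctxsw callbacks */
        if (this_cpu_inc_return(my_pmu_cb_users) == 1)
                perf_sched_cb_enable(event->ctx->pmu);
}

static void my_pmu_event_stop(struct perf_event *event)
{
        /* last user gone: stop the callbacks again */
        if (this_cpu_dec_return(my_pmu_cb_users) == 0)
                perf_sched_cb_disable(event->ctx->pmu);
}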

Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 7 +++++
arch/x86/kernel/cpu/perf_event.h | 2 ++
include/linux/perf_event.h | 9 ++++++
kernel/events/core.c | 59 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 77 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bdfbff..7d22972 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1874,6 +1874,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};

+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
+}
+
static void x86_pmu_flush_branch_stack(void)
{
if (x86_pmu.flush_branch_stack)
@@ -1907,6 +1913,7 @@ static struct pmu pmu = {

.event_idx = x86_pmu_event_idx,
.flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
};

void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bd..e70b352 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -429,6 +429,8 @@ struct x86_pmu {

void (*check_microcode)(void);
void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);

/*
* Intel Arch Perfmon v2+
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a..73f3afa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -262,6 +262,13 @@ struct pmu {
* flush branch stack on context-switches (needed in cpu-wide mode)
*/
void (*flush_branch_stack) (void);
+
+ /*
+ * context-switches callback for CPU PMU. Other PMUs shouldn't set
+ * this callback
+ */
+ void (*sched_task) (struct perf_event_context *ctx,
+ bool sched_in);
};

/**
@@ -557,6 +564,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_disable(struct pmu *pmu);
+extern void perf_sched_cb_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
extern int perf_event_refresh(struct perf_event *event, int refresh);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67e3b9c..4e721fa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -144,6 +144,7 @@ enum event_type_t {
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -2362,6 +2363,58 @@ unlock:
}
}

+void perf_sched_cb_disable(struct pmu *pmu)
+{
+ this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_enable(struct pmu *pmu)
+{
+ this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (prev == next)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->sched_task) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ /* only CPU PMU has context switch callback */
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

@@ -2381,6 +2434,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;

+ if (__get_cpu_var(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);

@@ -2638,6 +2694,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
/* check for system-wide branch_stack events */
if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
perf_branch_stack_sched_in(prev, task);
+
+ if (__get_cpu_var(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}

static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
--
1.9.3

2014-07-15 10:02:31

by Peter Zijlstra

Subject: Re: [PATCH v2 0/7] perf, x86: large PEBS interrupt threshold



The first two patches are the same patches as from the LBR callstack
series, right?



2014-07-15 10:14:34

by Peter Zijlstra

Subject: Re: [PATCH v2 3/7] perf, x86: use the PEBS auto reload mechanism when possible

On Tue, Jul 15, 2014 at 04:58:55PM +0800, Yan, Zheng wrote:
> When a fixed period is specified, this patch make perf use the PEBS
> auto reload mechanism. This makes normal profiling faster, because
> it avoids one costly MSR write in the PMI handler.
>
> Signef-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------
> arch/x86/kernel/cpu/perf_event_intel_ds.c | 7 +++++++
> include/linux/perf_event.h | 1 +
> 3 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 8868e9b..ae723c8 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -979,13 +979,16 @@ int x86_perf_event_set_period(struct perf_event *event)
>
> per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
>
> - /*
> - * The hw event starts counting from this event offset,
> - * mark it to be able to extra future deltas:
> - */
> - local64_set(&hwc->prev_count, (u64)-left);
> + if (!hwc->autoreload ||
> + local64_read(&hwc->prev_count) != (u64)-left) {

Do you really need that line break? I suspect it comes in at or below 80
if you concat.

> + /*
> + * The hw event starts counting from this event offset,
> + * mark it to be able to extra future deltas:
> + */
> + local64_set(&hwc->prev_count, (u64)-left);
>
> - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
> + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
> + }
>
> /*
> * Due to erratum on certan cpu we need
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 980970c..1db4ce5 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -714,6 +714,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
> struct hw_perf_event *hwc = &event->hw;
>
> hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
> + hwc->autoreload = !event->attr.freq;
>
> cpuc->pebs_enabled |= 1ULL << hwc->idx;
>
> @@ -721,6 +722,11 @@ void intel_pmu_pebs_enable(struct perf_event *event)
> cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
> else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
> cpuc->pebs_enabled |= 1ULL << 63;
> +
> + /* Use auto-reload if possible to save a MSR write in the PMI */
> + if (hwc->autoreload)
> + ds->pebs_event_reset[hwc->idx] =
> + (u64)-hwc->sample_period & x86_pmu.cntval_mask;

The rule is to add { } for any multi-line block, not the strict single
stmt as per the C language.

> }
>
> void intel_pmu_pebs_disable(struct perf_event *event)
> @@ -739,6 +745,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
> wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
>
> hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
> + hwc->autoreload = false;
> }
>
> void intel_pmu_pebs_enable_all(void)
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 5d665e8..37a2b70 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -149,6 +149,7 @@ struct hw_perf_event {
>
> u64 freq_time_stamp;
> u64 freq_count_stamp;
> + bool autoreload;

No, that's not the right place. If you'd looked at the structure you'd
have seen that this is the place for generic members, ones that apply to
all events. This is PEBS only, PEBS is limited to hardware events, and
there's a section for that.

When you've found that, you'll note we have a flags field in there,
there's spare bits in there, use one.



2014-07-15 10:41:53

by Peter Zijlstra

Subject: Re: [PATCH v2 4/7] perf, x86: large PEBS interrupt threshold

On Tue, Jul 15, 2014 at 04:58:56PM +0800, Yan, Zheng wrote:
> PEBS always had the capability to log samples to its buffers without
> an interrupt. Traditionally perf has not used this but always set the
> PEBS threshold to one.
>
> For frequently occuring events (like cycles or branches or load/stores)
> this in term requires using a relatively high sampling period to avoid
> overloading the system, by only processing PMIs. This in term increases
> sampling error.
>
> For the common cases we still need to use the PMI because the PEBS
> hardware has various limitations. The biggest one is that it can not
> supply a callgraph. It also requires setting a fixed period, as the
> hardware does not support adaptive period. Another issue is that it
> cannot supply a time stamp and some other options. To supply a TID it
> requires flushing on context switch. It can however supply the IP, the
> load/store address, TSX information, registers, and some other things.
>
> So we can make PEBS work for some specific cases, basically as long as
> you can do without a callgraph and can set the period you can use this
> new PEBS mode.
>
> The main benefit is the ability to support much lower sampling period
> (down to -c 1000) without extensive overhead.
>
> One use cases is for example to increase the resolution of the c2c tool.
> Another is double checking when you suspect the standard sampling has
> too much sampling error.
>
> Some numbers on the overhead, using cycle soak, comparing
> "perf record --no-time -e cycles:p -c" to "perf record -e cycles:p -c"
>
> period plain multi delta
> 10003 15 5 10
> 20003 15.7 4 11.7
> 40003 8.7 2.5 6.2
> 80003 4.1 1.4 2.7
> 100003 3.6 1.2 2.4
> 800003 4.4 1.4 3
> 1000003 0.6 0.4 0.2
> 2000003 0.4 0.3 0.1
> 4000003 0.3 0.2 0.1
> 10000003 0.3 0.2 0.1
>
> The interesting part is the delta between multi-pebs and normal pebs. Above
> -c 1000003 it does not really matter because the basic overhead is so low.
> With periods below 80003 it becomes interesting.
>
> Note in some other workloads (e.g. kernbench) the smaller sampling periods
> cause much more overhead without multi-pebs, upto 80% (and throttling) have
> been observed with -c 10003. multi pebs generally does not throttle.
>

And not a single word on the multiplex horror we talked about. That
should be mentioned, in detail.



2014-07-15 11:12:25

by Peter Zijlstra

Subject: Re: [PATCH v2 6/7] perf, x86: enable large PEBS interrupt threshold for SNB/IVB/HSW

On Tue, Jul 15, 2014 at 04:58:58PM +0800, Yan, Zheng wrote:
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event_intel.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index cb5a838..dba03b3 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2456,6 +2456,7 @@ __init int intel_pmu_init(void)
> intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
> X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
>
> + x86_pmu.multi_pebs = true;
> pr_cont("SandyBridge events, ");
> break;
> case 58: /* IvyBridge */
> @@ -2484,6 +2485,7 @@ __init int intel_pmu_init(void)
> intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
> X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
>
> + x86_pmu.multi_pebs = true;
> pr_cont("IvyBridge events, ");
> break;
>
> @@ -2511,6 +2513,8 @@ __init int intel_pmu_init(void)
> x86_pmu.get_event_constraints = hsw_get_event_constraints;
> x86_pmu.cpu_events = hsw_events_attrs;
> x86_pmu.lbr_double_abort = true;
> +
> + x86_pmu.multi_pebs = true;
> pr_cont("Haswell events, ");
> break;

NAK for the very same reason. All (PEBS capable) hardware supports this.



2014-07-15 11:38:58

by Peter Zijlstra

Subject: Re: [PATCH v2 4/7] perf, x86: large PEBS interrupt threshold

On Tue, Jul 15, 2014 at 04:58:56PM +0800, Yan, Zheng wrote:
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.h | 1 +
> arch/x86/kernel/cpu/perf_event_intel_ds.c | 98 +++++++++++++++++++++---------
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 5 --
> 3 files changed, 71 insertions(+), 33 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index d8165f3..cb7cda8 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -450,6 +450,7 @@ struct x86_pmu {
> struct event_constraint *pebs_constraints;
> void (*pebs_aliases)(struct perf_event *event);
> int max_pebs_events;
> + bool multi_pebs;

This needs to die.

> /*
> * Intel LBR
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 1db4ce5..e17eb5b 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -11,7 +11,7 @@
> #define BTS_RECORD_SIZE 24
>
> #define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
> -#define PEBS_BUFFER_SIZE PAGE_SIZE
> +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)

See: http://lkml.kernel.org/r/[email protected]

Also talk about why 64k, mention NMI duration/processing overhead etc..

> @@ -708,14 +705,29 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
> return &emptyconstraint;
> }
>
> +/*
> + * Flags PEBS can handle without an PMI.
> + *
> + * TID can only be handled by flushing at context switch.
> + */
> +#define PEBS_FREERUNNING_FLAGS \
> + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
> + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
> + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> + PERF_SAMPLE_TRANSACTION)
> +
> void intel_pmu_pebs_enable(struct perf_event *event)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> struct hw_perf_event *hwc = &event->hw;
> + struct debug_store *ds = cpuc->ds;
> + u64 threshold;
> + bool first_pebs;

flip those two lines

>
> hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
> hwc->autoreload = !event->attr.freq;
>
> + first_pebs = !(cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
> cpuc->pebs_enabled |= 1ULL << hwc->idx;
>
> if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
> @@ -723,6 +735,20 @@ void intel_pmu_pebs_enable(struct perf_event *event)
> else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
> cpuc->pebs_enabled |= 1ULL << 63;
>
> + /*
> + * When the event is constrained enough we can use a larger
> + * threshold and run the event with less frequent PMI.
> + */
> + if (x86_pmu.multi_pebs && hwc->autoreload &&
> + !(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) {
> + threshold = ds->pebs_absolute_maximum -
> + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
> + } else {
> + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
> + }

threshold = 1;
if ((hwc->flags & PERF_X86_EVENT_PEBS_RELOAD) &&
!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
threshold = x86_pmu.max_pebs_events;

threshold = ds->pebs_buffer_base + threshold * x86_pmu.pebs_record_size;

> + if (first_pebs || ds->pebs_interrupt_threshold > threshold)
> + ds->pebs_interrupt_threshold = threshold;
> +
> /* Use auto-reload if possible to save a MSR write in the PMI */
> if (hwc->autoreload)
> ds->pebs_event_reset[hwc->idx] =

> @@ -880,7 +907,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
> u64 sample_type;
> int fll, fst;
>
> - if (!intel_pmu_save_and_restart(event))
> + if (first_record && !intel_pmu_save_and_restart(event))
> return;
>
> fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
> @@ -956,8 +983,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
> if (has_branch_stack(event))
> data.br_stack = &cpuc->lbr_stack;
>
> - if (perf_event_overflow(event, &data, &regs))
> - x86_pmu_stop(event, 0);
> + if (first_record) {
> + if (perf_event_overflow(event, &data, &regs))
> + x86_pmu_stop(event, 0);
> + } else {
> + struct perf_output_handle handle;
> + struct perf_event_header header;
> +
> + perf_prepare_sample(&header, &data, event, &regs);
> +
> + if (perf_output_begin(&handle, event, header.size))
> + return;
> +
> + perf_output_sample(&handle, &header, &data, event);
> +
> + perf_output_end(&handle);
> + }

That is disgusting, have a look at drain_bts_buffer() and try again.

> }
>
> static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> @@ -998,17 +1039,18 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
> WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
> at += n - 1;
>
> - __intel_pmu_pebs_event(event, iregs, at);
> + __intel_pmu_pebs_event(event, iregs, at, true);
> }
>
> static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> struct debug_store *ds = cpuc->ds;
> - struct perf_event *event = NULL;
> + struct perf_event *event;
> void *at, *top;
> u64 status = 0;
> int bit;
> + bool multi_pebs, first_record;

These should not be needed, but it's also at the wrong place if it were.

> if (!x86_pmu.pebs_active)
> return;

> @@ -1042,17 +1086,15 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
>
> if (!event->attr.precise_ip)
> continue;
> -
> - if (__test_and_set_bit(bit, (unsigned long *)&status))
> - continue;
> -
> - break;
> + if (!__test_and_set_bit(bit, (unsigned long *)&status)) {
> + first_record = true;
> + } else {
> + if (!multi_pebs)
> + continue;
> + first_record = false;
> + }
> + __intel_pmu_pebs_event(event, iregs, at, first_record);
> }
> -
> - if (!event || bit >= x86_pmu.max_pebs_events)
> - continue;
> -
> - __intel_pmu_pebs_event(event, iregs, at);

Distinct lack of properly handling the multi overflow case.

> }
> }
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index d6d5fcf..430f1ad 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -184,10 +184,6 @@ void intel_pmu_lbr_reset(void)
> void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> -
> - if (!x86_pmu.lbr_nr)
> - return;
> -
> /*
> * It is necessary to flush the stack on context switch. This happens
> * when the branch stack does not tag its entries with the pid of the
> @@ -408,7 +404,6 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
>
> if (br_type & PERF_SAMPLE_BRANCH_COND)
> mask |= X86_BR_JCC;
> -
> /*
> * stash actual user request into reg, it may
> * be used by fixup code for some CPU

WTF?



2014-07-15 11:40:10

by Peter Zijlstra

Subject: Re: [PATCH v2 1/7] perf, core: introduce pmu context switch callback

On Tue, Jul 15, 2014 at 04:58:53PM +0800, Yan, Zheng wrote:
> +void perf_sched_cb_disable(struct pmu *pmu)
> +{
> + this_cpu_dec(perf_sched_cb_usages);
> +}
> +
> +void perf_sched_cb_enable(struct pmu *pmu)
> +{
> + this_cpu_inc(perf_sched_cb_usages);
> +}

Still wrong names, they do not enable/disable. Enable/disable is a
strictly boolean thing, this is counting.



2014-07-15 11:57:31

by Peter Zijlstra

Subject: Re: [PATCH v2 5/7] perf, x86: drain PEBS buffer during context switch

On Tue, Jul 15, 2014 at 04:58:57PM +0800, Yan, Zheng wrote:
> +void intel_pmu_drain_pebs_buffer(void)
> +{
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> + struct debug_store *ds = cpuc->ds;
> + struct pt_regs regs;
> +
> + if (!x86_pmu.pebs_active)
> + return;
> + if (ds->pebs_index <= ds->pebs_buffer_base)
> + return;

Both implementations of drain_pebs() already do that.

> + x86_pmu.drain_pebs(&regs);
> +}

> @@ -759,8 +787,19 @@ void intel_pmu_pebs_disable(struct perf_event *event)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> struct hw_perf_event *hwc = &event->hw;
> + struct debug_store *ds = cpuc->ds;
> + bool multi_pebs = false;
> +
> + if (ds->pebs_interrupt_threshold >
> + ds->pebs_buffer_base + x86_pmu.pebs_record_size)
> + multi_pebs = true;
>
> cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
> + if (cpuc->pebs_sched_cb_enabled &&
> + !(cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1))) {

You seem fond of that expression, maybe make it an inline somewhere to
avoid all this repetition.

> + perf_sched_cb_disable(event->ctx->pmu);
> + cpuc->pebs_sched_cb_enabled = false;
> + }
>
> if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
> cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
> @@ -772,6 +811,9 @@ void intel_pmu_pebs_disable(struct perf_event *event)
>
> hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
> hwc->autoreload = false;
> +
> + if (multi_pebs)
> + intel_pmu_drain_pebs_buffer();
> }

Is that condition worth the effort? Seeing how you already need to load
the DS state to compute multi_pebs in the first place.

> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 430f1ad..a3df61d 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -199,8 +199,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)
> {
> struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>
> - if (!x86_pmu.lbr_nr)
> - return;
> /*
> * Reset the LBR stack if we changed task context to
> * avoid data leaks.

More random hunks?



2014-07-16 01:13:10

by Yan, Zheng

Subject: Re: [PATCH v2 0/7] perf, x86: large PEBS interrupt threshold

On 07/15/2014 06:02 PM, Peter Zijlstra wrote:
>
>
> The first two patches are the same patches as from the LBR callstack
> series, right?
>
Yes

Yan, Zheng