From: "Yan, Zheng" <[email protected]>
Haswell has a new feature that utilizes the existing Last Branch Record
facility to record call chains. When the feature is enabled, function
call will be collected as normal, but as return instructions are executed
the last captured branch record is popped from the on-chip LBR registers.
The LBR call stack facility can help perf to get call chains of program
without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
This feature can be disabled/enabled through an attribute file in the cpu
pmu sysfs directory.
The LBR call stack has the following known limitations:
1. Zero length calls are not filtered out by hardware
2. Exception handling such as setjmp/longjmp will have calls/returns not
match
3. Pushing different return address onto the stack will have calls/returns
not match
These patches are based upon tip/perf/core
Regards
Yan, Zheng
From: "Yan, Zheng" <[email protected]>
"Zero length call" uses the attribute of the call instruction to push
the immediate instruction pointer on to the stack and then pops off
that address into a register. This is accomplished without any matching
return instruction. It confuses the hardware and makes the recorded call
stack incorrect. Try fixing the call stack by discarding zero length
call entries.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3be2d7b..f28fd23 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -94,7 +94,8 @@ enum {
X86_BR_ABORT = 1 << 12,/* transaction abort */
X86_BR_IN_TX = 1 << 13,/* in transaction */
X86_BR_NO_TX = 1 << 14,/* not in transaction */
- X86_BR_CALL_STACK = 1 << 15,/* call stack */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -111,13 +112,15 @@ enum {
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
- X86_BR_IND_CALL)
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
@@ -643,6 +646,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
Try enabling the LBR call stack feature if the event requests callchain
recording. Try utilizing the LBR call stack to get the user callchain
in case there is no frame pointer.
This patch also adds a cpu pmu attribute to enable/disable this
feature.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
arch/x86/kernel/cpu/perf_event.h | 7 ++
arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
include/linux/perf_event.h | 6 ++
kernel/events/core.c | 11 ++-
6 files changed, 126 insertions(+), 49 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 639aa4d..a07eb03 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+ }
+ } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ !has_branch_stack(event) &&
+ x86_pmu.attr_lbr_callstack &&
+ !event->attr.exclude_user &&
+ (event->attach_state & PERF_ATTACH_TASK)) {
/*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
+ * user did not specify branch_sample_type,
+ * try using the LBR call stack facility to
+ * record call chains of user program.
*/
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
- }
+ event->attr.branch_sample_type =
+ PERF_SAMPLE_BRANCH_USER |
+ PERF_SAMPLE_BRANCH_CALL_STACK;
}
/*
@@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
return count;
}
+static ssize_t get_attr_lbr_callstack(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
+}
+
+static ssize_t set_attr_lbr_callstack(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val = simple_strtoul(buf, NULL, 0);
+
+ if (x86_pmu.attr_lbr_callstack != !!val) {
+ if (val && !x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ x86_pmu.attr_lbr_callstack = !!val;
+ }
+ return count;
+}
+
static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
+ get_attr_lbr_callstack, set_attr_lbr_callstack);
+
static struct attribute *x86_pmu_attrs[] = {
&dev_attr_rdpmc.attr,
+ &dev_attr_lbr_callstack.attr,
NULL,
};
@@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc + idx);
}
+static inline void
+perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
+ struct perf_sample_data *data)
+{
+ struct perf_branch_stack *br_stack = data->br_stack;
+
+ if (br_stack && br_stack->user_callstack &&
+ x86_pmu.attr_lbr_callstack) {
+ int i = 0;
+ while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
+ perf_callchain_store(entry, br_stack->entries[i].from);
+ i++;
+ }
+ }
+}
+
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
/* 32-bit process in 64-bit kernel. */
unsigned long ss_base, cs_base;
@@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
perf_callchain_store(entry, cs_base + frame.return_address);
fp = compat_ptr(ss_base + frame.next_frame);
}
+
+ if (fp == compat_ptr(regs->bp))
+ perf_callchain_lbr_callstack(entry, data);
+
return 1;
}
#else
static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
return 0;
}
@@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
if (!current->mm)
return;
- if (perf_callchain_user32(regs, entry))
+ if (perf_callchain_user32(entry, regs, data))
return;
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
- frame.next_frame = NULL;
+ frame.next_frame = NULL;
frame.return_address = 0;
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
@@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
perf_callchain_store(entry, frame.return_address);
fp = frame.next_frame;
}
+
+ /* try LBR callstack if there is no frame pointer */
+ if (fp == (void __user *)regs->bp)
+ perf_callchain_lbr_callstack(entry, data);
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 0116970..536470d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -390,6 +390,7 @@ struct x86_pmu {
* sysfs attrs
*/
int attr_rdpmc;
+ int attr_lbr_callstack;
struct attribute **format_attrs;
struct attribute **event_attrs;
@@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
extern struct x86_pmu x86_pmu __read_mostly;
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f59b46e..baa8384 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
},
};
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
+ if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
return true;
return false;
@@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
* must disable before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
* must enabled before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
@@ -1237,7 +1232,8 @@ again:
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event))
+ if (has_branch_stack(event) ||
+ (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
data.br_stack = &cpuc->lbr_stack;
if (perf_event_overflow(event, &data, regs))
@@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
- if (intel_pmu_needs_lbr_smpl(event)) {
+ if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 43b16b4..3be2d7b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
int i, j, type;
bool compress = false;
+ cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
+
/* if sampling all branches, then nothing to filter */
if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
return;
@@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+ x86_pmu.attr_lbr_callstack = 1;
pr_cont("16-deep LBR, ");
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fa4c1bf..168e66e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -97,6 +97,7 @@ struct perf_branch_entry {
* recent branch.
*/
struct perf_branch_stack {
+ unsigned user_callstack:1;
__u64 nr;
struct perf_branch_entry entries[0];
};
@@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
+static inline bool needs_branch_stack(struct perf_event *event)
+{
+ return event->attr.branch_sample_type != 0;
+}
+
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4aad901..38eaa2b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
- if (has_branch_stack(event))
+ if (needs_branch_stack(event))
ctx->nr_branch_stack++;
list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
- if (has_branch_stack(event)) {
+ if (needs_branch_stack(event)) {
if (ctx->is_active)
__get_cpu_var(perf_branch_stack_events)--;
ctx->nr_branch_stack--;
@@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
static_key_slow_dec_deferred(&perf_sched_events);
}
- if (has_branch_stack(event))
+ if (needs_branch_stack(event))
static_key_slow_dec_deferred(&perf_sched_events);
}
@@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto done;
+ if (!has_branch_stack(event))
+ event->attr.branch_sample_type = 0;
+
pmu = perf_init_event(event);
done:
@@ -6577,7 +6580,7 @@ done:
return ERR_PTR(err);
}
}
- if (has_branch_stack(event))
+ if (needs_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
}
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
New Intel CPUs can record call chains by using the existing last branch
record facility. perf_callchain_user() can make use of the call
chains recorded by hardware in case there is no frame pointer.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/arm/kernel/perf_event.c | 4 ++--
arch/powerpc/perf/callchain.c | 4 ++--
arch/sparc/kernel/perf_event.c | 4 ++--
arch/x86/kernel/cpu/perf_event.c | 4 ++--
include/linux/perf_event.h | 3 ++-
kernel/events/callchain.c | 8 +++++---
kernel/events/core.c | 2 +-
kernel/events/internal.h | 3 ++-
8 files changed, 18 insertions(+), 14 deletions(-)
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 8c3094d..3f84d3c 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -559,8 +559,8 @@ user_backtrace(struct frame_tail __user *tail,
return buftail.fp - 1;
}
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
struct frame_tail __user *tail;
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 74d1e78..b379ebc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -482,8 +482,8 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
}
}
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
if (current_is_64bit())
perf_callchain_user_64(entry, regs);
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index b5c38fa..cba0306 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1785,8 +1785,8 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
} while (entry->nr < PERF_MAX_STACK_DEPTH);
}
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
perf_callchain_store(entry, regs->tpc);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3843f80..639aa4d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1999,8 +1999,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
}
#endif
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs, struct perf_sample_data *data)
{
struct stack_frame frame;
const void __user *fp;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b3e4faf..fa4c1bf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -701,7 +701,8 @@ extern void perf_event_fork(struct task_struct *tsk);
/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
-extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs,
+ struct perf_sample_data *data);
extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c772061..bd7138a 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -30,7 +30,8 @@ __weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
__weak void perf_callchain_user(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ struct perf_sample_data *data)
{
}
@@ -154,7 +155,8 @@ put_callchain_entry(int rctx)
}
struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
+perf_callchain(struct perf_event *event, struct pt_regs *regs,
+ struct perf_sample_data *data)
{
int rctx;
struct perf_callchain_entry *entry;
@@ -195,7 +197,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
goto exit_put;
perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ perf_callchain_user(entry, regs, data);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1101ce8..4aad901 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4545,7 +4545,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- data->callchain = perf_callchain(event, regs);
+ data->callchain = perf_callchain(event, regs, data);
if (data->callchain)
size += data->callchain->nr;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca65997..0e939e6 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -130,7 +130,8 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
extern struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs);
+perf_callchain(struct perf_event *event, struct pt_regs *regs,
+ struct perf_sample_data *data);
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
The new HSW call stack feature provides a facility such that
unfiltered call data will be collected as normal, but as return
instructions are executed the last captured branch record is
popped from the LBR stack. Thus, branch information relative to
leaf functions will not be captured, while preserving the call
stack information of the main line execution path.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 7 ++-
arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 89 ++++++++++++++++++++++--------
3 files changed, 74 insertions(+), 24 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a74d554..e14c963 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -448,7 +448,10 @@ struct x86_pmu {
};
enum {
- PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+ PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+ PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
+
+ PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
};
#define x86_add_quirk(func_) \
@@ -681,6 +684,8 @@ void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
+void intel_pmu_lbr_init_hsw(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
int p4_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a6eccf1..3e92a68 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2276,7 +2276,7 @@ __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_snb();
+ intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index a72e9e9..2136320 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -74,24 +76,25 @@ static enum {
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_CALL_STACK = 1 << 15,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -135,7 +138,10 @@ static void __intel_pmu_lbr_enable(void)
wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ debugctl |= DEBUGCTLMSR_LBR;
+ /* LBR callstack does not work well with FREEZE_LBRS_ON_PMI */
+ if (!cpuc->lbr_sel || !(cpuc->lbr_sel->config & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
@@ -333,7 +339,7 @@ void intel_pmu_lbr_read(void)
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
@@ -367,11 +373,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu.lbr_sel_map)
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
+ return 0;
}
/*
@@ -401,7 +417,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg->idx = EXTRA_REG_LBR;
/* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
+ reg->config = mask ^ x86_pmu.lbr_sel_mask;
return 0;
}
@@ -419,7 +435,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* setup SW LBR filter
*/
- intel_pmu_setup_sw_lbr_filter(event);
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
/*
* setup HW LBR filter, if any
@@ -674,6 +692,19 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
};
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
+};
+
/* core */
void intel_pmu_lbr_init_core(void)
{
@@ -730,6 +761,20 @@ void intel_pmu_lbr_init_snb(void)
pr_cont("16-deep LBR, ");
}
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ pr_cont("16-deep LBR, ");
+}
+
/* atom */
void intel_pmu_lbr_init_atom(void)
{
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
When the LBR call stack is enabled, it is necessary to save/restore
the stack on context switch. The solution is saving/restoring the
stack to/from the task's perf event context. If the task has no perf
event context, just flush the stack on context switch.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 18 +++--
arch/x86/kernel/cpu/perf_event.h | 13 +++-
arch/x86/kernel/cpu/perf_event_intel.c | 13 ++--
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 108 ++++++++++++++++++++++++++---
include/linux/perf_event.h | 6 +-
kernel/events/core.c | 65 +++++++++--------
6 files changed, 168 insertions(+), 55 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b2eada9..3843f80 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1768,6 +1768,13 @@ static int x86_pmu_event_idx(struct perf_event *event)
return idx + 1;
}
+static void x86_pmu_branch_stack_sched(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (x86_pmu.branch_stack_sched)
+ x86_pmu.branch_stack_sched(ctx, sched_in);
+}
+
static void *x86_pmu_event_context_alloc(struct perf_event_context *parent_ctx)
{
struct perf_event_context *ctx;
@@ -1776,6 +1783,9 @@ static void *x86_pmu_event_context_alloc(struct perf_event_context *parent_ctx)
if (!ctx)
return ERR_PTR(-ENOMEM);
+ if (parent_ctx)
+ intel_pmu_lbr_init_context(ctx, parent_ctx);
+
return ctx;
}
@@ -1833,12 +1843,6 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_flush_branch_stack(void)
-{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
-}
-
void perf_check_microcode(void)
{
if (x86_pmu.check_microcode)
@@ -1865,7 +1869,7 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .branch_stack_sched = x86_pmu_branch_stack_sched,
.event_context_alloc = x86_pmu_event_context_alloc,
};
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 08469de..0116970 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -405,7 +405,6 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
/*
* Intel Arch Perfmon v2+
@@ -434,6 +433,8 @@ struct x86_pmu {
int lbr_nr; /* hardware stack size */
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
+ void (*branch_stack_sched)(struct perf_event_context *ctx,
+ bool sched_in);
/*
* Extra registers for events
@@ -456,6 +457,12 @@ enum {
struct x86_perf_event_context {
struct perf_event_context ctx;
+
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_stack_gen;
+ int lbr_callstack_users;
+ bool lbr_stack_saved;
};
#define x86_add_quirk(func_) \
@@ -668,8 +675,12 @@ void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
+void intel_pmu_lbr_init_context(struct perf_event_context *child_ctx,
+ struct perf_event_context *parent_ctx);
void intel_pmu_lbr_reset(void);
+void intel_pmu_lbr_sched(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3e92a68..f59b46e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1851,16 +1851,11 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}
-static void intel_pmu_flush_branch_stack(void)
+static void intel_pmu_branch_stack_sched(struct perf_event_context *ctx,
+ bool sched_in)
{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ intel_pmu_lbr_sched(ctx, sched_in);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -1914,7 +1909,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .branch_stack_sched = intel_pmu_branch_stack_sched,
};
static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 2136320..43b16b4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -181,6 +181,13 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
+
+ wrmsrl(x86_pmu.lbr_tos, 0);
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}
void intel_pmu_lbr_enable(struct perf_event *event)
@@ -190,17 +197,23 @@ void intel_pmu_lbr_enable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
-
cpuc->lbr_users++;
+
+ if (event->ctx->task &&
+ branch_user_callstack(event->hw.branch_reg.reg)) {
+ struct x86_perf_event_context *task_ctx = (void *)event->ctx;
+ /*
+ * Reset the LBR stack if the call stack is not
+ * continuously enabled
+ */
+ if (task_ctx->lbr_callstack_users == 0 &&
+ task_ctx->lbr_stack_gen + 1 < event->ctx->sched_gen)
+ intel_pmu_lbr_reset();
+
+ task_ctx->lbr_callstack_users++;
+ task_ctx->lbr_stack_gen = event->ctx->sched_gen;
+ }
}
void intel_pmu_lbr_disable(struct perf_event *event)
@@ -210,6 +223,13 @@ void intel_pmu_lbr_disable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
+ if (event->ctx->task &&
+ branch_user_callstack(event->hw.branch_reg.reg)) {
+ struct x86_perf_event_context *task_ctx = (void *)event->ctx;
+
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
@@ -334,6 +354,76 @@ void intel_pmu_lbr_read(void)
intel_pmu_lbr_filter(cpuc);
}
+static void __intel_pmu_lbr_restore(struct x86_perf_event_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_saved = false;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_event_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_gen = task_ctx->ctx.sched_gen;
+ task_ctx->lbr_stack_saved = true;
+}
+
+void intel_pmu_lbr_init_context(struct perf_event_context *child_ctx,
+ struct perf_event_context *parent_ctx)
+{
+ struct x86_perf_event_context *task_ctx, *parent_task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ task_ctx = (struct x86_perf_event_context *)child_ctx;
+ parent_task_ctx = (struct x86_perf_event_context *)parent_ctx;
+
+ if (parent_task_ctx->lbr_callstack_users)
+ __intel_pmu_lbr_save(task_ctx);
+ else
+ task_ctx->lbr_stack_saved = false;
+}
+
+void intel_pmu_lbr_sched(struct perf_event_context *ctx, bool sched_in)
+{
+ struct x86_perf_event_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (!ctx) {
+ if (sched_in)
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ task_ctx = (struct x86_perf_event_context *)ctx;
+ if (sched_in) {
+ if (!task_ctx->lbr_stack_saved)
+ intel_pmu_lbr_reset();
+ else
+ __intel_pmu_lbr_restore(task_ctx);
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+}
+
/*
* SW filter is used:
* - in case there is no HW filter
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f6d1d59..b3e4faf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,9 +271,10 @@ struct pmu {
int (*event_idx) (struct perf_event *event); /*optional */
/*
- * flush branch stack on context-switches (needed in cpu-wide mode)
+ * Save/restore LBR stack on context-switches
*/
- void (*flush_branch_stack) (void);
+ void (*branch_stack_sched) (struct perf_event_context *ctx,
+ bool sched_in);
/*
* Allocate PMU special perf event context
@@ -495,6 +496,7 @@ struct perf_event_context {
struct perf_event_context *parent_ctx;
u64 parent_gen;
u64 generation;
+ u64 sched_gen;
int pin_count;
int nr_cgroups; /* cgroup evts */
int nr_branch_stack; /* branch_stack evt */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3aececc..1101ce8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -140,7 +140,7 @@ enum event_type_t {
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_branch_stack_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -278,6 +278,9 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void perf_branch_stack_sched(struct task_struct *task1,
+ struct task_struct *task2,
+ bool sched_in);
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -1271,8 +1274,11 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
- if (has_branch_stack(event))
+ if (has_branch_stack(event)) {
+ if (ctx->is_active)
+ __get_cpu_var(perf_branch_stack_events)--;
ctx->nr_branch_stack--;
+ }
ctx->nr_events--;
if (event->attr.inherit_stat)
@@ -1796,8 +1802,10 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct task_struct *task)
{
cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
- if (ctx)
+ if (ctx) {
+ ctx->sched_gen++;
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ }
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
if (ctx)
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
@@ -2102,6 +2110,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
return;
+ if (!ctx->is_active && is_active)
+ __get_cpu_var(perf_branch_stack_events) -= ctx->nr_branch_stack;
+
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
@@ -2291,6 +2302,10 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ /* check for branch_stack events running on this cpu */
+ if (__get_cpu_var(perf_branch_stack_events))
+ perf_branch_stack_sched(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
@@ -2398,6 +2413,9 @@ ctx_sched_in(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
return;
+ if (ctx->is_active && !is_active)
+ __get_cpu_var(perf_branch_stack_events) += ctx->nr_branch_stack;
+
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
@@ -2471,15 +2489,17 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* layer. It is invoked ONLY when there is at least one system-wide context
* with at least one active event using taken branch sampling.
*/
-static void perf_branch_stack_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static void perf_branch_stack_sched(struct task_struct *task1,
+ struct task_struct *task2,
+ bool sched_in)
{
struct perf_cpu_context *cpuctx;
+ struct perf_event_context *task_ctx;
struct pmu *pmu;
unsigned long flags;
/* no need to flush branch stack if not changing task */
- if (prev == task)
+ if (task1 == task2)
return;
local_irq_save(flags);
@@ -2488,25 +2508,26 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ task_ctx = cpuctx->task_ctx;
/*
* check if the context has at least one
* event using PERF_SAMPLE_BRANCH_STACK
*/
- if (cpuctx->ctx.nr_branch_stack > 0
- && pmu->flush_branch_stack) {
-
+ if (pmu->branch_stack_sched &&
+ (cpuctx->ctx.nr_branch_stack > 0 ||
+ (task_ctx && task_ctx->nr_branch_stack > 0))) {
pmu = cpuctx->ctx.pmu;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_lock(cpuctx, task_ctx);
perf_pmu_disable(pmu);
- pmu->flush_branch_stack();
+ pmu->branch_stack_sched(task_ctx, sched_in);
perf_pmu_enable(pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_unlock(cpuctx, task_ctx);
}
}
@@ -2547,9 +2568,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
- /* check for system-wide branch_stack events */
- if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
- perf_branch_stack_sched_in(prev, task);
+ /* check for branch_stack events running on this cpu */
+ if (__get_cpu_var(perf_branch_stack_events))
+ perf_branch_stack_sched(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3134,14 +3155,8 @@ static void free_event(struct perf_event *event)
static_key_slow_dec_deferred(&perf_sched_events);
}
- if (has_branch_stack(event)) {
+ if (has_branch_stack(event))
static_key_slow_dec_deferred(&perf_sched_events);
- /* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK)) {
- atomic_dec(&per_cpu(perf_branch_stack_events,
- event->cpu));
- }
- }
}
if (event->rb) {
@@ -6562,12 +6577,8 @@ done:
return ERR_PTR(err);
}
}
- if (has_branch_stack(event)) {
+ if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events,
- event->cpu));
- }
}
return event;
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
The x86 special perf event context is named x86_perf_event_context,
We can enlarge it later to store PMU special data.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++
arch/x86/kernel/cpu/perf_event.h | 4 ++++
include/linux/perf_event.h | 5 +++++
kernel/events/core.c | 28 ++++++++++++++++++----------
4 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index afc2413..b2eada9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1768,6 +1768,17 @@ static int x86_pmu_event_idx(struct perf_event *event)
return idx + 1;
}
+static void *x86_pmu_event_context_alloc(struct perf_event_context *parent_ctx)
+{
+ struct perf_event_context *ctx;
+
+ ctx = kzalloc(sizeof(struct x86_perf_event_context), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ return ctx;
+}
+
static ssize_t get_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -1855,6 +1866,7 @@ static struct pmu pmu = {
.event_idx = x86_pmu_event_idx,
.flush_branch_stack = x86_pmu_flush_branch_stack,
+ .event_context_alloc = x86_pmu_event_context_alloc,
};
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index e14c963..08469de 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -454,6 +454,10 @@ enum {
PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
};
+struct x86_perf_event_context {
+ struct perf_event_context ctx;
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 50b3efd..f6d1d59 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -274,6 +274,11 @@ struct pmu {
* flush branch stack on context-switches (needed in cpu-wide mode)
*/
void (*flush_branch_stack) (void);
+
+ /*
+ * Allocate PMU special perf event context
+ */
+ void *(*event_context_alloc) (struct perf_event_context *parent_ctx);
};
/**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1db3af9..3aececc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2961,13 +2961,20 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
}
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct pmu *pmu, struct task_struct *task,
+ struct perf_event_context *parent_ctx)
{
struct perf_event_context *ctx;
- ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
- if (!ctx)
- return NULL;
+ if (pmu->event_context_alloc) {
+ ctx = pmu->event_context_alloc(parent_ctx);
+ if (IS_ERR(ctx))
+ return ctx;
+ } else {
+ ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ }
__perf_event_init_context(ctx);
if (task) {
@@ -3053,10 +3060,11 @@ retry:
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
} else {
- ctx = alloc_perf_context(pmu, task);
- err = -ENOMEM;
- if (!ctx)
+ ctx = alloc_perf_context(pmu, task, NULL);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
goto errout;
+ }
err = 0;
mutex_lock(&task->perf_event_mutex);
@@ -7465,9 +7473,9 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
- if (!child_ctx)
- return -ENOMEM;
+ child_ctx = alloc_perf_context(event->pmu, child, parent_ctx);
+ if (IS_ERR(child_ctx))
+ return PTR_ERR(child_ctx);
child->perf_event_ctxp[ctxn] = child_ctx;
}
--
1.8.1.4
From: "Yan, Zheng" <[email protected]>
The index of lbr_sel_map is bit value of perf branch_sample_type.
We can reduce lbr_sel_map size by using bit shift as index.
Signed-off-by: Yan, Zheng <[email protected]>
---
arch/x86/kernel/cpu/perf_event.h | 4 +++
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 50 ++++++++++++++----------------
include/uapi/linux/perf_event.h | 42 +++++++++++++++++--------
3 files changed, 56 insertions(+), 40 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 108dc75..a74d554 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -447,6 +447,10 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+enum {
+ PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d5be06a..a72e9e9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,10 +69,6 @@ static enum {
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
@@ -387,14 +383,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
+ u64 mask = 0, v;
+ int i;
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
+ for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
+ if (!(br_type & (1U << i)))
continue;
- v = x86_pmu.lbr_sel_map[m];
+ v = x86_pmu.lbr_sel_map[i];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
@@ -649,33 +645,33 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
/*
* Map interface branch filters onto LBR filters
*/
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
};
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
};
/* core */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 0b1df41..2ec219e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -148,20 +148,36 @@ enum perf_event_sample_format {
* The branch types can be combined, however BRANCH_ANY covers all types
* of branches and therefore it supersedes all the other types.
*/
+enum perf_branch_sample_type_shift {
+ PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */
+ PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */
+ PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */
+
+ PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */
+ PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */
+ PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */
+ PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */
+ PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */
+ PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */
+ PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */
+
+ PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */
+};
+
enum perf_branch_sample_type {
- PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */
- PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */
- PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */
-
- PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */
- PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */
- PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */
- PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */
- PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */
- PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */
- PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */
-
- PERF_SAMPLE_BRANCH_MAX = 1U << 10, /* non-ABI */
+ PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
+ PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
+ PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
+
+ PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
+ PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+ PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+ PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+ PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+ PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
+ PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
+
+ PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
};
#define PERF_SAMPLE_BRANCH_PLM_ALL \
--
1.8.1.4
On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
> From: "Yan, Zheng" <[email protected]>
>
> The index of lbr_sel_map is bit value of perf branch_sample_type.
> We can reduce lbr_sel_map size by using bit shift as index.
>
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.h | 4 +++
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 50 ++++++++++++++----------------
> include/uapi/linux/perf_event.h | 42 +++++++++++++++++--------
> 3 files changed, 56 insertions(+), 40 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 108dc75..a74d554 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -447,6 +447,10 @@ struct x86_pmu {
> struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
> };
>
> +enum {
> + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
> +};
> +
> #define x86_add_quirk(func_) \
> do { \
> static struct x86_pmu_quirk __quirk __initdata = { \
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index d5be06a..a72e9e9 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -69,10 +69,6 @@ static enum {
> #define LBR_FROM_FLAG_IN_TX (1ULL << 62)
> #define LBR_FROM_FLAG_ABORT (1ULL << 61)
>
> -#define for_each_branch_sample_type(x) \
> - for ((x) = PERF_SAMPLE_BRANCH_USER; \
> - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
> -
> /*
> * x86control flow change classification
> * x86control flow changes include branches, interrupts, traps, faults
> @@ -387,14 +383,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
> {
> struct hw_perf_event_extra *reg;
> u64 br_type = event->attr.branch_sample_type;
> - u64 mask = 0, m;
> - u64 v;
> + u64 mask = 0, v;
> + int i;
>
> - for_each_branch_sample_type(m) {
> - if (!(br_type & m))
> + for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
> + if (!(br_type & (1U << i)))
Needs to be 1ULL to avoid a bug later on, because br_type is u64 and (1U << i) would be truncated to 32 bits.
> continue;
>
> - v = x86_pmu.lbr_sel_map[m];
> + v = x86_pmu.lbr_sel_map[i];
> if (v == LBR_NOT_SUPP)
> return -EOPNOTSUPP;
>
> @@ -649,33 +645,33 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
> /*
> * Map interface branch filters onto LBR filters
> */
> -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
> - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
> - [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
> - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
> - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
> - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
> - | LBR_IND_JMP | LBR_FAR,
> +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
> + | LBR_IND_JMP | LBR_FAR,
> /*
> * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
> */
> - [PERF_SAMPLE_BRANCH_ANY_CALL] =
> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
> LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
> /*
> * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
> */
> - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
> };
>
I think it would improve readability and ease formatting
if the indexes were constructed from a simple macro:
#define BR_SHIFT(a) \
PERF_SAMPLE_##a##_SHIFT
#define BR_SMPL(a) \
PERF_SAMPLE_##a
> -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
> - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
> - [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
> - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
> - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
> - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
> - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
> - | LBR_FAR,
> - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
> +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
> + | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
> };
>
> /* core */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 0b1df41..2ec219e 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -148,20 +148,36 @@ enum perf_event_sample_format {
> * The branch types can be combined, however BRANCH_ANY covers all types
> * of branches and therefore it supersedes all the other types.
> */
> +enum perf_branch_sample_type_shift {
> + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */
> + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */
> + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */
> +
> + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */
> + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */
> + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */
> + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */
> + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */
> + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */
> + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */
> +
> + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */
> +};
> +
> enum perf_branch_sample_type {
> - PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */
> - PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */
> - PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */
> -
> - PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */
> - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */
> - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */
> - PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */
> - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */
> - PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */
> - PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */
> -
> - PERF_SAMPLE_BRANCH_MAX = 1U << 10, /* non-ABI */
> + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
> + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
> + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
> +
> + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
> + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
> + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
> + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
> + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
> + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
> + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
> +
> + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
> };
>
> #define PERF_SAMPLE_BRANCH_PLM_ALL \
> --
> 1.8.1.4
>
On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
> From: "Yan, Zheng" <[email protected]>
>
> The new HSW call stack feature provides a facility such that
> unfiltered call data will be collected as normal, but as return
> instructions are executed the last captured branch record is
> popped from the LBR stack. Thus, branch information relative to
> leaf functions will not be captured, while preserving the call
> stack information of the main line execution path.
>
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.h | 7 ++-
> arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 89 ++++++++++++++++++++++--------
> 3 files changed, 74 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index a74d554..e14c963 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -448,7 +448,10 @@ struct x86_pmu {
> };
>
> enum {
> - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
> + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
> + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
> +
> + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
> };
>
> #define x86_add_quirk(func_) \
> @@ -681,6 +684,8 @@ void intel_pmu_lbr_init_atom(void);
>
> void intel_pmu_lbr_init_snb(void);
>
> +void intel_pmu_lbr_init_hsw(void);
> +
> int intel_pmu_setup_lbr_filter(struct perf_event *event);
>
> int p4_pmu_init(void);
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index a6eccf1..3e92a68 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2276,7 +2276,7 @@ __init int intel_pmu_init(void)
> memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
> memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
>
> - intel_pmu_lbr_init_snb();
> + intel_pmu_lbr_init_hsw();
>
> x86_pmu.event_constraints = intel_hsw_event_constraints;
> x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index a72e9e9..2136320 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -39,6 +39,7 @@ static enum {
> #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
> #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
> #define LBR_FAR_BIT 8 /* do not capture far branches */
> +#define LBR_CALL_STACK_BIT 9 /* enable call stack */
>
> #define LBR_KERNEL (1 << LBR_KERNEL_BIT)
> #define LBR_USER (1 << LBR_USER_BIT)
> @@ -49,6 +50,7 @@ static enum {
> #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
> #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
> #define LBR_FAR (1 << LBR_FAR_BIT)
> +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
>
> #define LBR_PLM (LBR_KERNEL | LBR_USER)
>
> @@ -74,24 +76,25 @@ static enum {
> * x86control flow changes include branches, interrupts, traps, faults
> */
> enum {
> - X86_BR_NONE = 0, /* unknown */
> -
> - X86_BR_USER = 1 << 0, /* branch target is user */
> - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
> -
> - X86_BR_CALL = 1 << 2, /* call */
> - X86_BR_RET = 1 << 3, /* return */
> - X86_BR_SYSCALL = 1 << 4, /* syscall */
> - X86_BR_SYSRET = 1 << 5, /* syscall return */
> - X86_BR_INT = 1 << 6, /* sw interrupt */
> - X86_BR_IRET = 1 << 7, /* return from interrupt */
> - X86_BR_JCC = 1 << 8, /* conditional */
> - X86_BR_JMP = 1 << 9, /* jump */
> - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
> - X86_BR_IND_CALL = 1 << 11,/* indirect calls */
> - X86_BR_ABORT = 1 << 12,/* transaction abort */
> - X86_BR_IN_TX = 1 << 13,/* in transaction */
> - X86_BR_NO_TX = 1 << 14,/* not in transaction */
> + X86_BR_NONE = 0, /* unknown */
> +
> + X86_BR_USER = 1 << 0, /* branch target is user */
> + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
> +
> + X86_BR_CALL = 1 << 2, /* call */
> + X86_BR_RET = 1 << 3, /* return */
> + X86_BR_SYSCALL = 1 << 4, /* syscall */
> + X86_BR_SYSRET = 1 << 5, /* syscall return */
> + X86_BR_INT = 1 << 6, /* sw interrupt */
> + X86_BR_IRET = 1 << 7, /* return from interrupt */
> + X86_BR_JCC = 1 << 8, /* conditional */
> + X86_BR_JMP = 1 << 9, /* jump */
> + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
> + X86_BR_IND_CALL = 1 << 11,/* indirect calls */
> + X86_BR_ABORT = 1 << 12,/* transaction abort */
> + X86_BR_IN_TX = 1 << 13,/* in transaction */
> + X86_BR_NO_TX = 1 << 14,/* not in transaction */
> + X86_BR_CALL_STACK = 1 << 15,/* call stack */
> };
>
> #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
> @@ -135,7 +138,10 @@ static void __intel_pmu_lbr_enable(void)
> wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
>
> rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> + debugctl |= DEBUGCTLMSR_LBR;
> + /* LBR callstack does not work well with FREEZE_LBRS_ON_PMI */
> + if (!cpuc->lbr_sel || !(cpuc->lbr_sel->config & LBR_CALL_STACK))
> + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
> wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> }
Is that a bug or a known limitation here?
In either case for HSW, this means the call-stack mode is only useful when
measuring user level code and MUST be enforced that way by the kernel.
In other words, callstack can ONLY be associated with events measuring
ONLY at the user level. Otherwise you lose correlation with counter overflow.
Don't see the code to enforce this restriction in this patch. Maybe it
is elsewhere.
>
> @@ -333,7 +339,7 @@ void intel_pmu_lbr_read(void)
> * - in case there is no HW filter
> * - in case the HW filter has errata or limitations
> */
> -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> {
> u64 br_type = event->attr.branch_sample_type;
> int mask = 0;
> @@ -367,11 +373,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
> mask |= X86_BR_NO_TX;
>
> + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
> + if (!x86_pmu.lbr_sel_map)
> + return -EOPNOTSUPP;
> + if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
> + return -EINVAL;
> + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
> + X86_BR_CALL_STACK;
> + }
> +
> /*
> * stash actual user request into reg, it may
> * be used by fixup code for some CPU
> */
> event->hw.branch_reg.reg = mask;
> + return 0;
> }
>
> /*
> @@ -401,7 +417,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
> reg->idx = EXTRA_REG_LBR;
>
> /* LBR_SELECT operates in suppress mode so invert mask */
> - reg->config = ~mask & x86_pmu.lbr_sel_mask;
> + reg->config = mask ^ x86_pmu.lbr_sel_mask;
>
> return 0;
> }
> @@ -419,7 +435,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
> /*
> * setup SW LBR filter
> */
> - intel_pmu_setup_sw_lbr_filter(event);
> + ret = intel_pmu_setup_sw_lbr_filter(event);
> + if (ret)
> + return ret;
>
> /*
> * setup HW LBR filter, if any
> @@ -674,6 +692,19 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
> };
>
> +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
> + | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
> + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
> + | LBR_RETURN | LBR_CALL_STACK,
> +};
> +
> /* core */
> void intel_pmu_lbr_init_core(void)
> {
> @@ -730,6 +761,20 @@ void intel_pmu_lbr_init_snb(void)
> pr_cont("16-deep LBR, ");
> }
>
> +/* haswell */
> +void intel_pmu_lbr_init_hsw(void)
> +{
> + x86_pmu.lbr_nr = 16;
> + x86_pmu.lbr_tos = MSR_LBR_TOS;
> + x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
> + x86_pmu.lbr_to = MSR_LBR_NHM_TO;
> +
> + x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
> + x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
> +
> + pr_cont("16-deep LBR, ");
> +}
> +
> /* atom */
> void intel_pmu_lbr_init_atom(void)
> {
> --
> 1.8.1.4
>
On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
> From: "Yan, Zheng" <[email protected]>
>
> Haswell has a new feature that utilizes the existing Last Branch Record
> facility to record call chains. When the feature is enabled, function
> call will be collected as normal, but as return instructions are executed
> the last captured branch record is popped from the on-chip LBR registers.
> The LBR call stack facility can help perf to get call chains of program
> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
> This feature can be disabled/enabled through an attribute file in the cpu
> pmu sysfs directory.
>
> The LBR call stack has following known limitations
> 1. Zero length calls are not filtered out by hardware
> 2. Exception handling such as setjmp/longjmp will have calls/returns not
> match
> 3. Pushing different return address onto the stack will have calls/returns
> not match
>
I would also add that it does not work with leaf call optimizations:
A calls B, B calls C, C returns to A
and that can be fairly common with small functions.
> > The LBR call stack has following known limitations
> > 1. Zero length calls are not filtered out by hardware
> > 2. Exception handling such as setjmp/longjmp will have calls/returns not
> > match
> > 3. Pushing different return address onto the stack will have calls/returns
> > not match
> >
> I would also add that it does not work with leaf call optimizations:
> A calls B, B calls C, C returns to A
For this case it works as well as frame pointer or dwarf (or any
other backtracer), they simply do not show up in the callgraph
(unless you disable the tail calls).
I wouldn't consider it an additional limitation.
Another case that doesn't work with is TSX transactions.
Still it's extremely useful if you have nothing else ...
-Andi
--
[email protected] -- Speaking for myself only.
On Tue, Jun 25, 2013 at 04:47:12PM +0800, Yan, Zheng wrote:
> From: "Yan, Zheng" <[email protected]>
Can you put this description somewhere in Documentation as a file?
(slightly extended, e.g. with the path name of the file to enable)
I think the users will later need this kind of context to use
the feature successfully.
-Andi
>
> Haswell has a new feature that utilizes the existing Last Branch Record
> facility to record call chains. When the feature is enabled, function
> call will be collected as normal, but as return instructions are executed
> the last captured branch record is popped from the on-chip LBR registers.
> The LBR call stack facility can help perf to get call chains of program
> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
> This feature can be disabled/enabled through an attribute file in the cpu
> pmu sysfs directory.
>
> The LBR call stack has following known limitations
> 1. Zero length calls are not filtered out by hardware
> 2. Exception handling such as setjmp/longjmp will have calls/returns not
> match
> 3. Pushing different return address onto the stack will have calls/returns
> not match
>
> These patches are based upon tip/perf/core
>
> Regards
> Yan, Zheng
>
--
[email protected] -- Speaking for myself only.
On 06/25/2013 08:37 PM, Stephane Eranian wrote:
> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>> From: "Yan, Zheng" <[email protected]>
>>
>> The new HSW call stack feature provides a facility such that
>> unfiltered call data will be collected as normal, but as return
>> instructions are executed the last captured branch record is
>> popped from the LBR stack. Thus, branch information relative to
>> leaf functions will not be captured, while preserving the call
>> stack information of the main line execution path.
>>
>> Signed-off-by: Yan, Zheng <[email protected]>
>> ---
>> arch/x86/kernel/cpu/perf_event.h | 7 ++-
>> arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 89 ++++++++++++++++++++++--------
>> 3 files changed, 74 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>> index a74d554..e14c963 100644
>> --- a/arch/x86/kernel/cpu/perf_event.h
>> +++ b/arch/x86/kernel/cpu/perf_event.h
>> @@ -448,7 +448,10 @@ struct x86_pmu {
>> };
>>
>> enum {
>> - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
>> + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
>> + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
>> +
>> + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
>> };
>>
>> #define x86_add_quirk(func_) \
>> @@ -681,6 +684,8 @@ void intel_pmu_lbr_init_atom(void);
>>
>> void intel_pmu_lbr_init_snb(void);
>>
>> +void intel_pmu_lbr_init_hsw(void);
>> +
>> int intel_pmu_setup_lbr_filter(struct perf_event *event);
>>
>> int p4_pmu_init(void);
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>> index a6eccf1..3e92a68 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>> @@ -2276,7 +2276,7 @@ __init int intel_pmu_init(void)
>> memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
>> memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
>>
>> - intel_pmu_lbr_init_snb();
>> + intel_pmu_lbr_init_hsw();
>>
>> x86_pmu.event_constraints = intel_hsw_event_constraints;
>> x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> index a72e9e9..2136320 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> @@ -39,6 +39,7 @@ static enum {
>> #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
>> #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
>> #define LBR_FAR_BIT 8 /* do not capture far branches */
>> +#define LBR_CALL_STACK_BIT 9 /* enable call stack */
>>
>> #define LBR_KERNEL (1 << LBR_KERNEL_BIT)
>> #define LBR_USER (1 << LBR_USER_BIT)
>> @@ -49,6 +50,7 @@ static enum {
>> #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
>> #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
>> #define LBR_FAR (1 << LBR_FAR_BIT)
>> +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
>>
>> #define LBR_PLM (LBR_KERNEL | LBR_USER)
>>
>> @@ -74,24 +76,25 @@ static enum {
>> * x86control flow changes include branches, interrupts, traps, faults
>> */
>> enum {
>> - X86_BR_NONE = 0, /* unknown */
>> -
>> - X86_BR_USER = 1 << 0, /* branch target is user */
>> - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
>> -
>> - X86_BR_CALL = 1 << 2, /* call */
>> - X86_BR_RET = 1 << 3, /* return */
>> - X86_BR_SYSCALL = 1 << 4, /* syscall */
>> - X86_BR_SYSRET = 1 << 5, /* syscall return */
>> - X86_BR_INT = 1 << 6, /* sw interrupt */
>> - X86_BR_IRET = 1 << 7, /* return from interrupt */
>> - X86_BR_JCC = 1 << 8, /* conditional */
>> - X86_BR_JMP = 1 << 9, /* jump */
>> - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
>> - X86_BR_IND_CALL = 1 << 11,/* indirect calls */
>> - X86_BR_ABORT = 1 << 12,/* transaction abort */
>> - X86_BR_IN_TX = 1 << 13,/* in transaction */
>> - X86_BR_NO_TX = 1 << 14,/* not in transaction */
>> + X86_BR_NONE = 0, /* unknown */
>> +
>> + X86_BR_USER = 1 << 0, /* branch target is user */
>> + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
>> +
>> + X86_BR_CALL = 1 << 2, /* call */
>> + X86_BR_RET = 1 << 3, /* return */
>> + X86_BR_SYSCALL = 1 << 4, /* syscall */
>> + X86_BR_SYSRET = 1 << 5, /* syscall return */
>> + X86_BR_INT = 1 << 6, /* sw interrupt */
>> + X86_BR_IRET = 1 << 7, /* return from interrupt */
>> + X86_BR_JCC = 1 << 8, /* conditional */
>> + X86_BR_JMP = 1 << 9, /* jump */
>> + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
>> + X86_BR_IND_CALL = 1 << 11,/* indirect calls */
>> + X86_BR_ABORT = 1 << 12,/* transaction abort */
>> + X86_BR_IN_TX = 1 << 13,/* in transaction */
>> + X86_BR_NO_TX = 1 << 14,/* not in transaction */
>> + X86_BR_CALL_STACK = 1 << 15,/* call stack */
>> };
>>
>> #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
>> @@ -135,7 +138,10 @@ static void __intel_pmu_lbr_enable(void)
>> wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
>>
>> rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
>> - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
>> + debugctl |= DEBUGCTLMSR_LBR;
>> + /* LBR callstack does not work well with FREEZE_LBRS_ON_PMI */
>> + if (!cpuc->lbr_sel || !(cpuc->lbr_sel->config & LBR_CALL_STACK))
>> + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
>> wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
>> }
> Is that a bug or a known limitation here?
it's a hardware bug.
>
> In either case for HSW, this means the call-stack mode is only useful when
> measuring user level code and MUST be enforced that way by the kernel.
> In other words, callstack can ONLY be associated with events measuring
> ONLY at the user level. Otherwise you lose correlation with counter overflow.
>
> Don't see the code to enforce this restriction in this patch. Maybe it
> is elsewhere.
the code is in x86_pmu_hw_config(), added by patch 6.
Regards
Yan, Zheng
>
>>
>> @@ -333,7 +339,7 @@ void intel_pmu_lbr_read(void)
>> * - in case there is no HW filter
>> * - in case the HW filter has errata or limitations
>> */
>> -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
>> +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
>> {
>> u64 br_type = event->attr.branch_sample_type;
>> int mask = 0;
>> @@ -367,11 +373,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
>> if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
>> mask |= X86_BR_NO_TX;
>>
>> + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
>> + if (!x86_pmu.lbr_sel_map)
>> + return -EOPNOTSUPP;
>> + if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
>> + return -EINVAL;
>> + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
>> + X86_BR_CALL_STACK;
>> + }
>> +
>> /*
>> * stash actual user request into reg, it may
>> * be used by fixup code for some CPU
>> */
>> event->hw.branch_reg.reg = mask;
>> + return 0;
>> }
>>
>> /*
>> @@ -401,7 +417,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
>> reg->idx = EXTRA_REG_LBR;
>>
>> /* LBR_SELECT operates in suppress mode so invert mask */
>> - reg->config = ~mask & x86_pmu.lbr_sel_mask;
>> + reg->config = mask ^ x86_pmu.lbr_sel_mask;
>>
>> return 0;
>> }
>> @@ -419,7 +435,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
>> /*
>> * setup SW LBR filter
>> */
>> - intel_pmu_setup_sw_lbr_filter(event);
>> + ret = intel_pmu_setup_sw_lbr_filter(event);
>> + if (ret)
>> + return ret;
>>
>> /*
>> * setup HW LBR filter, if any
>> @@ -674,6 +692,19 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
>> [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
>> };
>>
>> +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
>> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
>> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
>> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
>> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
>> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
>> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
>> + | LBR_FAR,
>> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
>> + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
>> + | LBR_RETURN | LBR_CALL_STACK,
>> +};
>> +
>> /* core */
>> void intel_pmu_lbr_init_core(void)
>> {
>> @@ -730,6 +761,20 @@ void intel_pmu_lbr_init_snb(void)
>> pr_cont("16-deep LBR, ");
>> }
>>
>> +/* haswell */
>> +void intel_pmu_lbr_init_hsw(void)
>> +{
>> + x86_pmu.lbr_nr = 16;
>> + x86_pmu.lbr_tos = MSR_LBR_TOS;
>> + x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
>> + x86_pmu.lbr_to = MSR_LBR_NHM_TO;
>> +
>> + x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>> + x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>> +
>> + pr_cont("16-deep LBR, ");
>> +}
>> +
>> /* atom */
>> void intel_pmu_lbr_init_atom(void)
>> {
>> --
>> 1.8.1.4
>>
On 06/25/2013 08:33 PM, Stephane Eranian wrote:
> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>> From: "Yan, Zheng" <[email protected]>
>>
>> The index of lbr_sel_map is bit value of perf branch_sample_type.
>> We can reduce lbr_sel_map size by using bit shift as index.
>>
>> Signed-off-by: Yan, Zheng <[email protected]>
>> ---
>> arch/x86/kernel/cpu/perf_event.h | 4 +++
>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 50 ++++++++++++++----------------
>> include/uapi/linux/perf_event.h | 42 +++++++++++++++++--------
>> 3 files changed, 56 insertions(+), 40 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>> index 108dc75..a74d554 100644
>> --- a/arch/x86/kernel/cpu/perf_event.h
>> +++ b/arch/x86/kernel/cpu/perf_event.h
>> @@ -447,6 +447,10 @@ struct x86_pmu {
>> struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
>> };
>>
>> +enum {
>> + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
>> +};
>> +
>> #define x86_add_quirk(func_) \
>> do { \
>> static struct x86_pmu_quirk __quirk __initdata = { \
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> index d5be06a..a72e9e9 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> @@ -69,10 +69,6 @@ static enum {
>> #define LBR_FROM_FLAG_IN_TX (1ULL << 62)
>> #define LBR_FROM_FLAG_ABORT (1ULL << 61)
>>
>> -#define for_each_branch_sample_type(x) \
>> - for ((x) = PERF_SAMPLE_BRANCH_USER; \
>> - (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
>> -
>> /*
>> * x86control flow change classification
>> * x86control flow changes include branches, interrupts, traps, faults
>> @@ -387,14 +383,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
>> {
>> struct hw_perf_event_extra *reg;
>> u64 br_type = event->attr.branch_sample_type;
>> - u64 mask = 0, m;
>> - u64 v;
>> + u64 mask = 0, v;
>> + int i;
>>
>> - for_each_branch_sample_type(m) {
>> - if (!(br_type & m))
>> + for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
>> + if (!(br_type & (1U << i)))
>
> Needs to be 1ULL to avoid bug later on. br_type is u64.
thanks, will fix.
>
>> continue;
>>
>> - v = x86_pmu.lbr_sel_map[m];
>> + v = x86_pmu.lbr_sel_map[i];
>> if (v == LBR_NOT_SUPP)
>> return -EOPNOTSUPP;
>>
>> @@ -649,33 +645,33 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>> /*
>> * Map interface branch filters onto LBR filters
>> */
>> -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
>> - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
>> - [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
>> - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
>> - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
>> - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
>> - | LBR_IND_JMP | LBR_FAR,
>> +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
>> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
>> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
>> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
>> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
>> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
>> + | LBR_IND_JMP | LBR_FAR,
>> /*
>> * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
>> */
>> - [PERF_SAMPLE_BRANCH_ANY_CALL] =
>> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
>> LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
>> /*
>> * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
>> */
>> - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
>> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
>> };
>>
> I think it would ease formatting readability
> if the indexes could be constructed from a simple macro:
> #define BR_SHIFT(a) \
> PERF_SAMPLE_##a##_SHIFT
>
> #define BR_SMPL(a) \
> PERF_SAMPLE_##a
#define BR_SHIFT(a) PERF_SAMPLE_##a##_SHIFT
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
[BR_SHIFT(ANY)] = LBR_ANY,
[BR_SHIFT(USER)] = LBR_USER,
[BR_SHIFT(KERNEL)] = LBR_KERNEL,
[BR_SHIFT(HV)] = LBR_IGN,
[BR_SHIFT(ANY_RETURN)] = LBR_RETURN | LBR_FAR,
[BR_SHIFT(ANY_CALL)] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR,
[BR_SHIFT(CALL_STACK)] = LBR_IND_CALL,
};
The code looks strange; I don't think it has better readability.
Regards
Yan, Zheng
>
>> -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
>> - [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
>> - [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
>> - [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
>> - [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
>> - [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
>> - [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
>> - | LBR_FAR,
>> - [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
>> +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
>> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
>> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
>> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
>> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
>> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
>> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
>> + | LBR_FAR,
>> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
>> };
>>
>> /* core */
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 0b1df41..2ec219e 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -148,20 +148,36 @@ enum perf_event_sample_format {
>> * The branch types can be combined, however BRANCH_ANY covers all types
>> * of branches and therefore it supersedes all the other types.
>> */
>> +enum perf_branch_sample_type_shift {
>> + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */
>> + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */
>> + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */
>> +
>> + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */
>> + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */
>> + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */
>> + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */
>> + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */
>> + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */
>> + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */
>> +
>> + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */
>> +};
>> +
>> enum perf_branch_sample_type {
>> - PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */
>> - PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */
>> - PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */
>> -
>> - PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */
>> - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */
>> - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */
>> - PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */
>> - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */
>> - PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */
>> - PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */
>> -
>> - PERF_SAMPLE_BRANCH_MAX = 1U << 10, /* non-ABI */
>> + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
>> + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
>> + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
>> +
>> + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
>> + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
>> + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
>> + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
>> + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
>> + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
>> + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
>> +
>> + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
>> };
>>
>> #define PERF_SAMPLE_BRANCH_PLM_ALL \
>> --
>> 1.8.1.4
>>
On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
> From: "Yan, Zheng" <[email protected]>
>
> Try enabling the LBR call stack feature if event request recording
> callchain. Try utilizing the LBR call stack to get user callchain
> in case of there is no frame pointer.
>
> This patch also adds a cpu pmu attribute to enable/disable this
> feature.
>
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
> arch/x86/kernel/cpu/perf_event.h | 7 ++
> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
> include/linux/perf_event.h | 6 ++
> kernel/events/core.c | 11 ++-
> 6 files changed, 126 insertions(+), 49 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 639aa4d..a07eb03 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>
> if (event->attr.precise_ip > precise)
> return -EOPNOTSUPP;
> + }
> + /*
> + * check that PEBS LBR correction does not conflict with
> + * whatever the user is asking with attr->branch_sample_type
> + */
> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
> + u64 *br_type = &event->attr.branch_sample_type;
> +
> + if (has_branch_stack(event)) {
> + if (!precise_br_compat(event))
> + return -EOPNOTSUPP;
> +
> + /* branch_sample_type is compatible */
> +
> + } else {
> + /*
> + * user did not specify branch_sample_type
> + *
> + * For PEBS fixups, we capture all
> + * the branches at the priv level of the
> + * event.
> + */
> + *br_type = PERF_SAMPLE_BRANCH_ANY;
> +
> + if (!event->attr.exclude_user)
> + *br_type |= PERF_SAMPLE_BRANCH_USER;
> +
> + if (!event->attr.exclude_kernel)
> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
> + }
> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
> + !has_branch_stack(event) &&
> + x86_pmu.attr_lbr_callstack &&
> + !event->attr.exclude_user &&
> + (event->attach_state & PERF_ATTACH_TASK)) {
> /*
> - * check that PEBS LBR correction does not conflict with
> - * whatever the user is asking with attr->branch_sample_type
> + * user did not specify branch_sample_type,
> + * try using the LBR call stack facility to
> + * record call chains of user program.
> */
> - if (event->attr.precise_ip > 1 &&
> - x86_pmu.intel_cap.pebs_format < 2) {
> - u64 *br_type = &event->attr.branch_sample_type;
> -
> - if (has_branch_stack(event)) {
> - if (!precise_br_compat(event))
> - return -EOPNOTSUPP;
> -
> - /* branch_sample_type is compatible */
> -
> - } else {
> - /*
> - * user did not specify branch_sample_type
> - *
> - * For PEBS fixups, we capture all
> - * the branches at the priv level of the
> - * event.
> - */
> - *br_type = PERF_SAMPLE_BRANCH_ANY;
> -
> - if (!event->attr.exclude_user)
> - *br_type |= PERF_SAMPLE_BRANCH_USER;
> -
> - if (!event->attr.exclude_kernel)
> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
> - }
> - }
> + event->attr.branch_sample_type =
> + PERF_SAMPLE_BRANCH_USER |
> + PERF_SAMPLE_BRANCH_CALL_STACK;
> }
>
> /*
> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
> return count;
> }
>
> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
> + struct device_attribute *attr, char *buf)
> +{
> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
> +}
> +
> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long val = simple_strtoul(buf, NULL, 0);
> +
> + if (x86_pmu.attr_lbr_callstack != !!val) {
> + if (val && !x86_pmu_has_lbr_callstack())
> + return -EOPNOTSUPP;
> + x86_pmu.attr_lbr_callstack = !!val;
> + }
> + return count;
> +}
> +
> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
> + get_attr_lbr_callstack, set_attr_lbr_callstack);
> +
>
Given the known limitations, I think it would be good for users to be allowed
to probe the current setting of lbr_callstack.
$ cat /sys/devices/cpu/lbr_callstack
cat: /sys/devices/cpu/lbr_callstack: Permission denied
> static struct attribute *x86_pmu_attrs[] = {
> &dev_attr_rdpmc.attr,
> + &dev_attr_lbr_callstack.attr,
> NULL,
> };
>
> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
> return get_desc_base(desc + idx);
> }
>
> +static inline void
> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
> + struct perf_sample_data *data)
> +{
> + struct perf_branch_stack *br_stack = data->br_stack;
> +
> + if (br_stack && br_stack->user_callstack &&
> + x86_pmu.attr_lbr_callstack) {
> + int i = 0;
> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
> + perf_callchain_store(entry, br_stack->entries[i].from);
> + i++;
> + }
> + }
> +}
> +
> #ifdef CONFIG_COMPAT
>
> #include <asm/compat.h>
>
> static inline int
> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> +perf_callchain_user32(struct perf_callchain_entry *entry,
> + struct pt_regs *regs, struct perf_sample_data *data)
> {
> /* 32-bit process in 64-bit kernel. */
> unsigned long ss_base, cs_base;
> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> perf_callchain_store(entry, cs_base + frame.return_address);
> fp = compat_ptr(ss_base + frame.next_frame);
> }
> +
> + if (fp == compat_ptr(regs->bp))
> + perf_callchain_lbr_callstack(entry, data);
> +
> return 1;
> }
> #else
> static inline int
> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> +perf_callchain_user32(struct perf_callchain_entry *entry,
> + struct pt_regs *regs, struct perf_sample_data *data)
> {
> return 0;
> }
> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
> if (!current->mm)
> return;
>
> - if (perf_callchain_user32(regs, entry))
> + if (perf_callchain_user32(entry, regs, data))
> return;
>
> while (entry->nr < PERF_MAX_STACK_DEPTH) {
> unsigned long bytes;
> - frame.next_frame = NULL;
> + frame.next_frame = NULL;
> frame.return_address = 0;
>
> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
> perf_callchain_store(entry, frame.return_address);
> fp = frame.next_frame;
> }
> +
> + /* try LBR callstack if there is no frame pointer */
> + if (fp == (void __user *)regs->bp)
> + perf_callchain_lbr_callstack(entry, data);
> }
>
> /*
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 0116970..536470d 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -390,6 +390,7 @@ struct x86_pmu {
> * sysfs attrs
> */
> int attr_rdpmc;
> + int attr_lbr_callstack;
> struct attribute **format_attrs;
> struct attribute **event_attrs;
>
> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>
> extern struct x86_pmu x86_pmu __read_mostly;
>
> +static inline bool x86_pmu_has_lbr_callstack(void)
> +{
> + return x86_pmu.lbr_sel_map &&
> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
> +}
> +
> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>
> int x86_perf_event_set_period(struct perf_event *event);
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index f59b46e..baa8384 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
> },
> };
>
> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
> {
> - /* user explicitly requested branch sampling */
> - if (has_branch_stack(event))
> - return true;
> -
> - /* implicit branch sampling to correct PEBS skid */
> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
> - x86_pmu.intel_cap.pebs_format < 2)
> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
> return true;
>
> return false;
> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
> * must disable before any actual event
> * because any event may be combined with LBR
> */
> - if (intel_pmu_needs_lbr_smpl(event))
> + if (needs_branch_stack(event))
> intel_pmu_lbr_disable(event);
>
> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
> * must enabled before any actual event
> * because any event may be combined with LBR
> */
> - if (intel_pmu_needs_lbr_smpl(event))
> + if (needs_branch_stack(event))
> intel_pmu_lbr_enable(event);
>
> if (event->attr.exclude_host)
> @@ -1237,7 +1232,8 @@ again:
>
> perf_sample_data_init(&data, 0, event->hw.last_period);
>
> - if (has_branch_stack(event))
> + if (has_branch_stack(event) ||
> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
> data.br_stack = &cpuc->lbr_stack;
>
> if (perf_event_overflow(event, &data, regs))
> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
> x86_pmu.pebs_aliases(event);
>
> - if (intel_pmu_needs_lbr_smpl(event)) {
> + if (needs_branch_stack(event)) {
> ret = intel_pmu_setup_lbr_filter(event);
> if (ret)
> return ret;
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 43b16b4..3be2d7b 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
> int i, j, type;
> bool compress = false;
>
> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
> +
> /* if sampling all branches, then nothing to filter */
> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
> return;
> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>
> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
> + x86_pmu.attr_lbr_callstack = 1;
>
> pr_cont("16-deep LBR, ");
> }
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index fa4c1bf..168e66e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -97,6 +97,7 @@ struct perf_branch_entry {
> * recent branch.
> */
> struct perf_branch_stack {
> + unsigned user_callstack:1;
> __u64 nr;
> struct perf_branch_entry entries[0];
> };
> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
> }
>
> +static inline bool needs_branch_stack(struct perf_event *event)
> +{
> + return event->attr.branch_sample_type != 0;
> +}
> +
> extern int perf_output_begin(struct perf_output_handle *handle,
> struct perf_event *event, unsigned int size);
> extern void perf_output_end(struct perf_output_handle *handle);
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4aad901..38eaa2b 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
> if (is_cgroup_event(event))
> ctx->nr_cgroups++;
>
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> ctx->nr_branch_stack++;
>
> list_add_rcu(&event->event_entry, &ctx->event_list);
> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
> cpuctx->cgrp = NULL;
> }
>
> - if (has_branch_stack(event)) {
> + if (needs_branch_stack(event)) {
> if (ctx->is_active)
> __get_cpu_var(perf_branch_stack_events)--;
> ctx->nr_branch_stack--;
> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
> static_key_slow_dec_deferred(&perf_sched_events);
> }
>
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> static_key_slow_dec_deferred(&perf_sched_events);
> }
>
> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
> goto done;
>
> + if (!has_branch_stack(event))
> + event->attr.branch_sample_type = 0;
> +
> pmu = perf_init_event(event);
>
> done:
> @@ -6577,7 +6580,7 @@ done:
> return ERR_PTR(err);
> }
> }
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> static_key_slow_inc(&perf_sched_events.key);
> }
>
> --
> 1.8.1.4
>
On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
> From: "Yan, Zheng" <[email protected]>
>
> Try enabling the LBR call stack feature if the event requests recording
> callchain. Try utilizing the LBR call stack to get the user callchain
> in case there is no frame pointer.
>
> This patch also adds a cpu pmu attribute to enable/disable this
> feature.
>
> Signed-off-by: Yan, Zheng <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
> arch/x86/kernel/cpu/perf_event.h | 7 ++
> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
> include/linux/perf_event.h | 6 ++
> kernel/events/core.c | 11 ++-
> 6 files changed, 126 insertions(+), 49 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 639aa4d..a07eb03 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>
> if (event->attr.precise_ip > precise)
> return -EOPNOTSUPP;
> + }
> + /*
> + * check that PEBS LBR correction does not conflict with
> + * whatever the user is asking with attr->branch_sample_type
> + */
> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
> + u64 *br_type = &event->attr.branch_sample_type;
> +
> + if (has_branch_stack(event)) {
> + if (!precise_br_compat(event))
> + return -EOPNOTSUPP;
> +
> + /* branch_sample_type is compatible */
> +
> + } else {
> + /*
> + * user did not specify branch_sample_type
> + *
> + * For PEBS fixups, we capture all
> + * the branches at the priv level of the
> + * event.
> + */
> + *br_type = PERF_SAMPLE_BRANCH_ANY;
> +
> + if (!event->attr.exclude_user)
> + *br_type |= PERF_SAMPLE_BRANCH_USER;
> +
> + if (!event->attr.exclude_kernel)
> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
> + }
> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
> + !has_branch_stack(event) &&
> + x86_pmu.attr_lbr_callstack &&
> + !event->attr.exclude_user &&
> + (event->attach_state & PERF_ATTACH_TASK)) {
Yes, that's the test but it is wrong. I can pass the test if
I have exclude_user = exclude_kernel = 0.
You want:
!event->attr.exclude_user && event->attr.exclude_kernel &&
I tested that and it works.
> /*
> - * check that PEBS LBR correction does not conflict with
> - * whatever the user is asking with attr->branch_sample_type
> + * user did not specify branch_sample_type,
> + * try using the LBR call stack facility to
> + * record call chains of user program.
> */
> - if (event->attr.precise_ip > 1 &&
> - x86_pmu.intel_cap.pebs_format < 2) {
> - u64 *br_type = &event->attr.branch_sample_type;
> -
> - if (has_branch_stack(event)) {
> - if (!precise_br_compat(event))
> - return -EOPNOTSUPP;
> -
> - /* branch_sample_type is compatible */
> -
> - } else {
> - /*
> - * user did not specify branch_sample_type
> - *
> - * For PEBS fixups, we capture all
> - * the branches at the priv level of the
> - * event.
> - */
> - *br_type = PERF_SAMPLE_BRANCH_ANY;
> -
> - if (!event->attr.exclude_user)
> - *br_type |= PERF_SAMPLE_BRANCH_USER;
> -
> - if (!event->attr.exclude_kernel)
> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
> - }
> - }
> + event->attr.branch_sample_type =
> + PERF_SAMPLE_BRANCH_USER |
> + PERF_SAMPLE_BRANCH_CALL_STACK;
> }
>
> /*
> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
> return count;
> }
>
> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
> + struct device_attribute *attr, char *buf)
> +{
> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
> +}
> +
> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long val = simple_strtoul(buf, NULL, 0);
> +
> + if (x86_pmu.attr_lbr_callstack != !!val) {
> + if (val && !x86_pmu_has_lbr_callstack())
> + return -EOPNOTSUPP;
> + x86_pmu.attr_lbr_callstack = !!val;
> + }
> + return count;
> +}
> +
> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
> + get_attr_lbr_callstack, set_attr_lbr_callstack);
> +
>
> static struct attribute *x86_pmu_attrs[] = {
> &dev_attr_rdpmc.attr,
> + &dev_attr_lbr_callstack.attr,
> NULL,
> };
>
> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
> return get_desc_base(desc + idx);
> }
>
> +static inline void
> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
> + struct perf_sample_data *data)
> +{
> + struct perf_branch_stack *br_stack = data->br_stack;
> +
> + if (br_stack && br_stack->user_callstack &&
> + x86_pmu.attr_lbr_callstack) {
> + int i = 0;
> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
> + perf_callchain_store(entry, br_stack->entries[i].from);
> + i++;
> + }
> + }
> +}
> +
> #ifdef CONFIG_COMPAT
>
> #include <asm/compat.h>
>
> static inline int
> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> +perf_callchain_user32(struct perf_callchain_entry *entry,
> + struct pt_regs *regs, struct perf_sample_data *data)
> {
> /* 32-bit process in 64-bit kernel. */
> unsigned long ss_base, cs_base;
> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> perf_callchain_store(entry, cs_base + frame.return_address);
> fp = compat_ptr(ss_base + frame.next_frame);
> }
> +
> + if (fp == compat_ptr(regs->bp))
> + perf_callchain_lbr_callstack(entry, data);
> +
> return 1;
> }
> #else
> static inline int
> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> +perf_callchain_user32(struct perf_callchain_entry *entry,
> + struct pt_regs *regs, struct perf_sample_data *data)
> {
> return 0;
> }
> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
> if (!current->mm)
> return;
>
> - if (perf_callchain_user32(regs, entry))
> + if (perf_callchain_user32(entry, regs, data))
> return;
>
> while (entry->nr < PERF_MAX_STACK_DEPTH) {
> unsigned long bytes;
> - frame.next_frame = NULL;
> + frame.next_frame = NULL;
> frame.return_address = 0;
>
> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
> perf_callchain_store(entry, frame.return_address);
> fp = frame.next_frame;
> }
> +
> + /* try LBR callstack if there is no frame pointer */
> + if (fp == (void __user *)regs->bp)
> + perf_callchain_lbr_callstack(entry, data);
> }
>
> /*
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 0116970..536470d 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -390,6 +390,7 @@ struct x86_pmu {
> * sysfs attrs
> */
> int attr_rdpmc;
> + int attr_lbr_callstack;
> struct attribute **format_attrs;
> struct attribute **event_attrs;
>
> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>
> extern struct x86_pmu x86_pmu __read_mostly;
>
> +static inline bool x86_pmu_has_lbr_callstack(void)
> +{
> + return x86_pmu.lbr_sel_map &&
> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
> +}
> +
> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>
> int x86_perf_event_set_period(struct perf_event *event);
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index f59b46e..baa8384 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
> },
> };
>
> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
> {
> - /* user explicitly requested branch sampling */
> - if (has_branch_stack(event))
> - return true;
> -
> - /* implicit branch sampling to correct PEBS skid */
> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
> - x86_pmu.intel_cap.pebs_format < 2)
> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
> return true;
>
> return false;
> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
> * must disable before any actual event
> * because any event may be combined with LBR
> */
> - if (intel_pmu_needs_lbr_smpl(event))
> + if (needs_branch_stack(event))
> intel_pmu_lbr_disable(event);
>
> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
> * must enabled before any actual event
> * because any event may be combined with LBR
> */
> - if (intel_pmu_needs_lbr_smpl(event))
> + if (needs_branch_stack(event))
> intel_pmu_lbr_enable(event);
>
> if (event->attr.exclude_host)
> @@ -1237,7 +1232,8 @@ again:
>
> perf_sample_data_init(&data, 0, event->hw.last_period);
>
> - if (has_branch_stack(event))
> + if (has_branch_stack(event) ||
> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
> data.br_stack = &cpuc->lbr_stack;
>
> if (perf_event_overflow(event, &data, regs))
> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
> x86_pmu.pebs_aliases(event);
>
> - if (intel_pmu_needs_lbr_smpl(event)) {
> + if (needs_branch_stack(event)) {
> ret = intel_pmu_setup_lbr_filter(event);
> if (ret)
> return ret;
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 43b16b4..3be2d7b 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
> int i, j, type;
> bool compress = false;
>
> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
> +
> /* if sampling all branches, then nothing to filter */
> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
> return;
> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>
> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
> + x86_pmu.attr_lbr_callstack = 1;
>
> pr_cont("16-deep LBR, ");
> }
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index fa4c1bf..168e66e 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -97,6 +97,7 @@ struct perf_branch_entry {
> * recent branch.
> */
> struct perf_branch_stack {
> + unsigned user_callstack:1;
> __u64 nr;
> struct perf_branch_entry entries[0];
> };
> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
> }
>
> +static inline bool needs_branch_stack(struct perf_event *event)
> +{
> + return event->attr.branch_sample_type != 0;
> +}
> +
> extern int perf_output_begin(struct perf_output_handle *handle,
> struct perf_event *event, unsigned int size);
> extern void perf_output_end(struct perf_output_handle *handle);
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4aad901..38eaa2b 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
> if (is_cgroup_event(event))
> ctx->nr_cgroups++;
>
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> ctx->nr_branch_stack++;
>
> list_add_rcu(&event->event_entry, &ctx->event_list);
> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
> cpuctx->cgrp = NULL;
> }
>
> - if (has_branch_stack(event)) {
> + if (needs_branch_stack(event)) {
> if (ctx->is_active)
> __get_cpu_var(perf_branch_stack_events)--;
> ctx->nr_branch_stack--;
> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
> static_key_slow_dec_deferred(&perf_sched_events);
> }
>
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> static_key_slow_dec_deferred(&perf_sched_events);
> }
>
> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
> goto done;
>
> + if (!has_branch_stack(event))
> + event->attr.branch_sample_type = 0;
> +
> pmu = perf_init_event(event);
>
> done:
> @@ -6577,7 +6580,7 @@ done:
> return ERR_PTR(err);
> }
> }
> - if (has_branch_stack(event))
> + if (needs_branch_stack(event))
> static_key_slow_inc(&perf_sched_events.key);
> }
>
> --
> 1.8.1.4
>
Hi,
And I think the reason why kernel is not possible deserves
a full explanation in the changelog and in the code.
It is still not clear to me whether this is a hw bug or a
limitation.
On Wed, Jun 26, 2013 at 2:42 PM, Stephane Eranian <[email protected]> wrote:
> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>> From: "Yan, Zheng" <[email protected]>
>>
>> Try enabling the LBR call stack feature if the event requests recording
>> callchain. Try utilizing the LBR call stack to get the user callchain
>> in case there is no frame pointer.
>>
>> This patch also adds a cpu pmu attribute to enable/disable this
>> feature.
>>
>> Signed-off-by: Yan, Zheng <[email protected]>
>> ---
>> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
>> arch/x86/kernel/cpu/perf_event.h | 7 ++
>> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
>> include/linux/perf_event.h | 6 ++
>> kernel/events/core.c | 11 ++-
>> 6 files changed, 126 insertions(+), 49 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index 639aa4d..a07eb03 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>>
>> if (event->attr.precise_ip > precise)
>> return -EOPNOTSUPP;
>> + }
>> + /*
>> + * check that PEBS LBR correction does not conflict with
>> + * whatever the user is asking with attr->branch_sample_type
>> + */
>> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
>> + u64 *br_type = &event->attr.branch_sample_type;
>> +
>> + if (has_branch_stack(event)) {
>> + if (!precise_br_compat(event))
>> + return -EOPNOTSUPP;
>> +
>> + /* branch_sample_type is compatible */
>> +
>> + } else {
>> + /*
>> + * user did not specify branch_sample_type
>> + *
>> + * For PEBS fixups, we capture all
>> + * the branches at the priv level of the
>> + * event.
>> + */
>> + *br_type = PERF_SAMPLE_BRANCH_ANY;
>> +
>> + if (!event->attr.exclude_user)
>> + *br_type |= PERF_SAMPLE_BRANCH_USER;
>> +
>> + if (!event->attr.exclude_kernel)
>> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>> + }
>> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>> + !has_branch_stack(event) &&
>> + x86_pmu.attr_lbr_callstack &&
>> + !event->attr.exclude_user &&
>> + (event->attach_state & PERF_ATTACH_TASK)) {
>
> Yes, that's the test but it is wrong. I can pass the test if
> I have exclude_user = exclude_kernel = 0.
>
> You want:
> !event->attr.exclude_user && event->attr.exclude_kernel &&
>
> I tested that and it works.
>
>> /*
>> - * check that PEBS LBR correction does not conflict with
>> - * whatever the user is asking with attr->branch_sample_type
>> + * user did not specify branch_sample_type,
>> + * try using the LBR call stack facility to
>> + * record call chains of user program.
>> */
>> - if (event->attr.precise_ip > 1 &&
>> - x86_pmu.intel_cap.pebs_format < 2) {
>> - u64 *br_type = &event->attr.branch_sample_type;
>> -
>> - if (has_branch_stack(event)) {
>> - if (!precise_br_compat(event))
>> - return -EOPNOTSUPP;
>> -
>> - /* branch_sample_type is compatible */
>> -
>> - } else {
>> - /*
>> - * user did not specify branch_sample_type
>> - *
>> - * For PEBS fixups, we capture all
>> - * the branches at the priv level of the
>> - * event.
>> - */
>> - *br_type = PERF_SAMPLE_BRANCH_ANY;
>> -
>> - if (!event->attr.exclude_user)
>> - *br_type |= PERF_SAMPLE_BRANCH_USER;
>> -
>> - if (!event->attr.exclude_kernel)
>> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>> - }
>> - }
>> + event->attr.branch_sample_type =
>> + PERF_SAMPLE_BRANCH_USER |
>> + PERF_SAMPLE_BRANCH_CALL_STACK;
>> }
>>
>> /*
>> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>> return count;
>> }
>>
>> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
>> +}
>> +
>> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
>> + struct device_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + unsigned long val = simple_strtoul(buf, NULL, 0);
>> +
>> + if (x86_pmu.attr_lbr_callstack != !!val) {
>> + if (val && !x86_pmu_has_lbr_callstack())
>> + return -EOPNOTSUPP;
>> + x86_pmu.attr_lbr_callstack = !!val;
>> + }
>> + return count;
>> +}
>> +
>> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
>> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
>> + get_attr_lbr_callstack, set_attr_lbr_callstack);
>> +
>>
>> static struct attribute *x86_pmu_attrs[] = {
>> &dev_attr_rdpmc.attr,
>> + &dev_attr_lbr_callstack.attr,
>> NULL,
>> };
>>
>> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
>> return get_desc_base(desc + idx);
>> }
>>
>> +static inline void
>> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
>> + struct perf_sample_data *data)
>> +{
>> + struct perf_branch_stack *br_stack = data->br_stack;
>> +
>> + if (br_stack && br_stack->user_callstack &&
>> + x86_pmu.attr_lbr_callstack) {
>> + int i = 0;
>> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
>> + perf_callchain_store(entry, br_stack->entries[i].from);
>> + i++;
>> + }
>> + }
>> +}
>> +
>> #ifdef CONFIG_COMPAT
>>
>> #include <asm/compat.h>
>>
>> static inline int
>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>> + struct pt_regs *regs, struct perf_sample_data *data)
>> {
>> /* 32-bit process in 64-bit kernel. */
>> unsigned long ss_base, cs_base;
>> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> perf_callchain_store(entry, cs_base + frame.return_address);
>> fp = compat_ptr(ss_base + frame.next_frame);
>> }
>> +
>> + if (fp == compat_ptr(regs->bp))
>> + perf_callchain_lbr_callstack(entry, data);
>> +
>> return 1;
>> }
>> #else
>> static inline int
>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>> + struct pt_regs *regs, struct perf_sample_data *data)
>> {
>> return 0;
>> }
>> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>> if (!current->mm)
>> return;
>>
>> - if (perf_callchain_user32(regs, entry))
>> + if (perf_callchain_user32(entry, regs, data))
>> return;
>>
>> while (entry->nr < PERF_MAX_STACK_DEPTH) {
>> unsigned long bytes;
>> - frame.next_frame = NULL;
>> + frame.next_frame = NULL;
>> frame.return_address = 0;
>>
>> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
>> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>> perf_callchain_store(entry, frame.return_address);
>> fp = frame.next_frame;
>> }
>> +
>> + /* try LBR callstack if there is no frame pointer */
>> + if (fp == (void __user *)regs->bp)
>> + perf_callchain_lbr_callstack(entry, data);
>> }
>>
>> /*
>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>> index 0116970..536470d 100644
>> --- a/arch/x86/kernel/cpu/perf_event.h
>> +++ b/arch/x86/kernel/cpu/perf_event.h
>> @@ -390,6 +390,7 @@ struct x86_pmu {
>> * sysfs attrs
>> */
>> int attr_rdpmc;
>> + int attr_lbr_callstack;
>> struct attribute **format_attrs;
>> struct attribute **event_attrs;
>>
>> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>>
>> extern struct x86_pmu x86_pmu __read_mostly;
>>
>> +static inline bool x86_pmu_has_lbr_callstack(void)
>> +{
>> + return x86_pmu.lbr_sel_map &&
>> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
>> +}
>> +
>> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>>
>> int x86_perf_event_set_period(struct perf_event *event);
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>> index f59b46e..baa8384 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
>> },
>> };
>>
>> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
>> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
>> {
>> - /* user explicitly requested branch sampling */
>> - if (has_branch_stack(event))
>> - return true;
>> -
>> - /* implicit branch sampling to correct PEBS skid */
>> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
>> - x86_pmu.intel_cap.pebs_format < 2)
>> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
>> return true;
>>
>> return false;
>> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
>> * must disable before any actual event
>> * because any event may be combined with LBR
>> */
>> - if (intel_pmu_needs_lbr_smpl(event))
>> + if (needs_branch_stack(event))
>> intel_pmu_lbr_disable(event);
>>
>> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
>> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
>> * must enabled before any actual event
>> * because any event may be combined with LBR
>> */
>> - if (intel_pmu_needs_lbr_smpl(event))
>> + if (needs_branch_stack(event))
>> intel_pmu_lbr_enable(event);
>>
>> if (event->attr.exclude_host)
>> @@ -1237,7 +1232,8 @@ again:
>>
>> perf_sample_data_init(&data, 0, event->hw.last_period);
>>
>> - if (has_branch_stack(event))
>> + if (has_branch_stack(event) ||
>> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
>> data.br_stack = &cpuc->lbr_stack;
>>
>> if (perf_event_overflow(event, &data, regs))
>> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
>> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
>> x86_pmu.pebs_aliases(event);
>>
>> - if (intel_pmu_needs_lbr_smpl(event)) {
>> + if (needs_branch_stack(event)) {
>> ret = intel_pmu_setup_lbr_filter(event);
>> if (ret)
>> return ret;
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> index 43b16b4..3be2d7b 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>> int i, j, type;
>> bool compress = false;
>>
>> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
>> +
>> /* if sampling all branches, then nothing to filter */
>> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
>> return;
>> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>>
>> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>> + x86_pmu.attr_lbr_callstack = 1;
>>
>> pr_cont("16-deep LBR, ");
>> }
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index fa4c1bf..168e66e 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -97,6 +97,7 @@ struct perf_branch_entry {
>> * recent branch.
>> */
>> struct perf_branch_stack {
>> + unsigned user_callstack:1;
>> __u64 nr;
>> struct perf_branch_entry entries[0];
>> };
>> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
>> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
>> }
>>
>> +static inline bool needs_branch_stack(struct perf_event *event)
>> +{
>> + return event->attr.branch_sample_type != 0;
>> +}
>> +
>> extern int perf_output_begin(struct perf_output_handle *handle,
>> struct perf_event *event, unsigned int size);
>> extern void perf_output_end(struct perf_output_handle *handle);
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 4aad901..38eaa2b 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>> if (is_cgroup_event(event))
>> ctx->nr_cgroups++;
>>
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> ctx->nr_branch_stack++;
>>
>> list_add_rcu(&event->event_entry, &ctx->event_list);
>> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>> cpuctx->cgrp = NULL;
>> }
>>
>> - if (has_branch_stack(event)) {
>> + if (needs_branch_stack(event)) {
>> if (ctx->is_active)
>> __get_cpu_var(perf_branch_stack_events)--;
>> ctx->nr_branch_stack--;
>> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
>> static_key_slow_dec_deferred(&perf_sched_events);
>> }
>>
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> static_key_slow_dec_deferred(&perf_sched_events);
>> }
>>
>> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
>> goto done;
>>
>> + if (!has_branch_stack(event))
>> + event->attr.branch_sample_type = 0;
>> +
>> pmu = perf_init_event(event);
>>
>> done:
>> @@ -6577,7 +6580,7 @@ done:
>> return ERR_PTR(err);
>> }
>> }
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> static_key_slow_inc(&perf_sched_events.key);
>> }
>>
>> --
>> 1.8.1.4
>>
On Tue, Jun 25, 2013 at 04:47:16PM +0800, Yan, Zheng wrote:
> From: "Yan, Zheng" <[email protected]>
>
> When the LBR call stack is enabled, it is necessary to save/restore
> the stack on context switch. The solution is saving/restoring the
> stack to/from task's perf event context. If task has no perf event
> context, just flush the stack on context switch.
So I have some problems with this; suppose the LBR is shared between a
cpu and a task event, when the task switches back in, it will over-write
the LBR content with old stuff.
In that case the CPU event's LBR stack can be actively wrong, as opposed
to being empty.
On Tue, Jun 25, 2013 at 04:47:12PM +0800, Yan, Zheng wrote:
> From: "Yan, Zheng" <[email protected]>
>
> Haswell has a new feature that utilizes the existing Last Branch Record
> facility to record call chains. When the feature is enabled, function
> call will be collected as normal, but as return instructions are executed
> the last captured branch record is popped from the on-chip LBR registers.
> The LBR call stack facility can help perf to get call chains of program
> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
> This feature can be disabled/enabled through an attribute file in the cpu
> pmu sysfs directory.
>
> The LBR call stack has following known limitations
> 1. Zero length calls are not filtered out by hardware
> 2. Exception handling such as setjmp/longjmp will have calls/returns not
> match
> 3. Pushing different return address onto the stack will have calls/returns
> not match
>
You fail to mention what happens when the callstack is deeper than the
LBR is big -- a rather common issue I'd think.
From what I gather, if you push when full, the TOS rotates and eats the
tail, allowing you to add another entry to the head.
If you pop when empty; nothing happens.
So on pretty much every program you'd be lucky to get the top of the
callstack but can end up with nearly nothing.
Given that, and the other limitations, I don't think it's a fair
replacement for user callchains.
On Tue, Jun 25, 2013 at 04:47:12PM +0800, Yan, Zheng wrote:
> From: "Yan, Zheng" <[email protected]>
>
> Haswell has a new feature that utilizes the existing Last Branch Record
> facility to record call chains. When the feature is enabled, function
> call will be collected as normal, but as return instructions are executed
> the last captured branch record is popped from the on-chip LBR registers.
> The LBR call stack facility can help perf to get call chains of progam
> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
> This feature can be disabled/enabled through an attribute file in the cpu
> pmu sysfs directory.
>
> The LBR call stack has following known limitations
> 1. Zero length calls are not filtered out by hardware
> 2. Exception handing such as setjmp/longjmp will have calls/returns not
> match
> 3. Pushing different return address onto the stack will have calls/returns
> not match
>
> These patches are based upon tip/perf/core
These patches are also done wrong; the first patches should add HSW
support for the existing features. Only then should you do patches for
new fail^wfeatures.
> So on pretty much every program you'd be lucky to get the top of the
> callstack but can end up with nearly nothing.
The last few entries up are usually the interesting ones, and those will
be preserved.
Do you really care about main() for profiling?
When you pop back up there may be a few situations where you
have a very small visible stack, but as soon as the program
starts nesting deeper again that will be fixed.
> Given that, and the other limitations I don't think its a fair
> replacement for user callchains.
It's not intended to be a full replacement, just to cover
cases that cannot give you callgraphs at all today.
-Andi
--
[email protected] -- Speaking for myself only.
On Wed, Jun 26, 2013 at 1:54 PM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Jun 25, 2013 at 04:47:12PM +0800, Yan, Zheng wrote:
>> From: "Yan, Zheng" <[email protected]>
>>
>> Haswell has a new feature that utilizes the existing Last Branch Record
>> facility to record call chains. When the feature is enabled, function
>> call will be collected as normal, but as return instructions are executed
>> the last captured branch record is popped from the on-chip LBR registers.
>> The LBR call stack facility can help perf to get call chains of progam
>> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
>> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
>> This feature can be disabled/enabled through an attribute file in the cpu
>> pmu sysfs directory.
>>
>> The LBR call stack has following known limitations
>> 1. Zero length calls are not filtered out by hardware
>> 2. Exception handing such as setjmp/longjmp will have calls/returns not
>> match
>> 3. Pushing different return address onto the stack will have calls/returns
>> not match
>>
>
> You fail to mention what happens when the callstack is deeper than the
> LBR is big -- a rather common issue I'd think.
>
The LBR is a statistical callstack. By its nature, it cannot capture the entire call chain.
> From what I gather if you push when full, the TOS rotates and eats the
> tail allowing you to add another entry to the head.
>
> If you pop when empty; nothing happens.
>
Not sure they know "empty" from "non empty", they just move the LBR_TOS
by one entry on returns.
> So on pretty much every program you'd be lucky to get the top of the
> callstack but can end up with nearly nothing.
>
You will get the calls closest to the interrupt.
> Given that, and the other limitations I don't think its a fair
> replacement for user callchains.
Well, the one advantage I see is that it works on stripped/optimized
binaries without fp or dwarf info. Compared to dwarf and the stack
snapshot, it does incur less overhead most likely. But yes, it comes
with limitations.
> These patches are also done wrong; the first patches should add HSW
> support for the existing features. Only then should you do patches for
> new fail^wfeatures.
The basic Haswell LBR support is already in tip.
It's pretty much the same as Sandy Bridge, except that the sign
extension needs to extend by two additional bits.
The only other part not yet in are the exposed filters for INTX/ABORT, plus
the workaround for the duplicated ABORT record. These are orthogonal
to callstack LBR.
-Andi
On Wed, Jun 26, 2013 at 06:59:42PM +0200, Andi Kleen wrote:
> > These patches are also done wrong; the first patches should add HSW
> > support for the existing features. Only then should you do patches for
> > new fail^wfeatures.
>
> The basic Haswell LBR support is already in tip.
D'uh indeed. Sorry for the noise.
On 06/26/2013 08:42 PM, Stephane Eranian wrote:
> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>> From: "Yan, Zheng" <[email protected]>
>>
>> Try enabling the LBR call stack feature if event request recording
>> callchain. Try utilizing the LBR call stack to get user callchain
>> in case of there is no frame pointer.
>>
>> This patch also adds a cpu pmu attribute to enable/disable this
>> feature.
>>
>> Signed-off-by: Yan, Zheng <[email protected]>
>> ---
>> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
>> arch/x86/kernel/cpu/perf_event.h | 7 ++
>> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
>> include/linux/perf_event.h | 6 ++
>> kernel/events/core.c | 11 ++-
>> 6 files changed, 126 insertions(+), 49 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index 639aa4d..a07eb03 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>>
>> if (event->attr.precise_ip > precise)
>> return -EOPNOTSUPP;
>> + }
>> + /*
>> + * check that PEBS LBR correction does not conflict with
>> + * whatever the user is asking with attr->branch_sample_type
>> + */
>> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
>> + u64 *br_type = &event->attr.branch_sample_type;
>> +
>> + if (has_branch_stack(event)) {
>> + if (!precise_br_compat(event))
>> + return -EOPNOTSUPP;
>> +
>> + /* branch_sample_type is compatible */
>> +
>> + } else {
>> + /*
>> + * user did not specify branch_sample_type
>> + *
>> + * For PEBS fixups, we capture all
>> + * the branches at the priv level of the
>> + * event.
>> + */
>> + *br_type = PERF_SAMPLE_BRANCH_ANY;
>> +
>> + if (!event->attr.exclude_user)
>> + *br_type |= PERF_SAMPLE_BRANCH_USER;
>> +
>> + if (!event->attr.exclude_kernel)
>> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>> + }
>> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>> + !has_branch_stack(event) &&
>> + x86_pmu.attr_lbr_callstack &&
>> + !event->attr.exclude_user &&
>> + (event->attach_state & PERF_ATTACH_TASK)) {
>
> Yes, that's the test but it is wrong. I can pass the test if
> I have exclude_user = exclude_kernel = 0.
It's OK to have "exclude_user = exclude_kernel = 0". The LBR callstack feature is only used
for getting the callchain of a user program. If "exclude_kernel = 0", we still use the frame
pointer to get the callchain.
Regards
Yan, Zheng
>
> You want:
> !event->attr.exclude_user && event->attr.exclude_kernel &&
>
> I tested that and it works.
>
>> /*
>> - * check that PEBS LBR correction does not conflict with
>> - * whatever the user is asking with attr->branch_sample_type
>> + * user did not specify branch_sample_type,
>> + * try using the LBR call stack facility to
>> + * record call chains of user program.
>> */
>> - if (event->attr.precise_ip > 1 &&
>> - x86_pmu.intel_cap.pebs_format < 2) {
>> - u64 *br_type = &event->attr.branch_sample_type;
>> -
>> - if (has_branch_stack(event)) {
>> - if (!precise_br_compat(event))
>> - return -EOPNOTSUPP;
>> -
>> - /* branch_sample_type is compatible */
>> -
>> - } else {
>> - /*
>> - * user did not specify branch_sample_type
>> - *
>> - * For PEBS fixups, we capture all
>> - * the branches at the priv level of the
>> - * event.
>> - */
>> - *br_type = PERF_SAMPLE_BRANCH_ANY;
>> -
>> - if (!event->attr.exclude_user)
>> - *br_type |= PERF_SAMPLE_BRANCH_USER;
>> -
>> - if (!event->attr.exclude_kernel)
>> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>> - }
>> - }
>> + event->attr.branch_sample_type =
>> + PERF_SAMPLE_BRANCH_USER |
>> + PERF_SAMPLE_BRANCH_CALL_STACK;
>> }
>>
>> /*
>> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>> return count;
>> }
>>
>> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
>> +}
>> +
>> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
>> + struct device_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + unsigned long val = simple_strtoul(buf, NULL, 0);
>> +
>> + if (x86_pmu.attr_lbr_callstack != !!val) {
>> + if (val && !x86_pmu_has_lbr_callstack())
>> + return -EOPNOTSUPP;
>> + x86_pmu.attr_lbr_callstack = !!val;
>> + }
>> + return count;
>> +}
>> +
>> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
>> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
>> + get_attr_lbr_callstack, set_attr_lbr_callstack);
>> +
>>
>> static struct attribute *x86_pmu_attrs[] = {
>> &dev_attr_rdpmc.attr,
>> + &dev_attr_lbr_callstack.attr,
>> NULL,
>> };
>>
>> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
>> return get_desc_base(desc + idx);
>> }
>>
>> +static inline void
>> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
>> + struct perf_sample_data *data)
>> +{
>> + struct perf_branch_stack *br_stack = data->br_stack;
>> +
>> + if (br_stack && br_stack->user_callstack &&
>> + x86_pmu.attr_lbr_callstack) {
>> + int i = 0;
>> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
>> + perf_callchain_store(entry, br_stack->entries[i].from);
>> + i++;
>> + }
>> + }
>> +}
>> +
>> #ifdef CONFIG_COMPAT
>>
>> #include <asm/compat.h>
>>
>> static inline int
>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>> + struct pt_regs *regs, struct perf_sample_data *data)
>> {
>> /* 32-bit process in 64-bit kernel. */
>> unsigned long ss_base, cs_base;
>> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> perf_callchain_store(entry, cs_base + frame.return_address);
>> fp = compat_ptr(ss_base + frame.next_frame);
>> }
>> +
>> + if (fp == compat_ptr(regs->bp))
>> + perf_callchain_lbr_callstack(entry, data);
>> +
>> return 1;
>> }
>> #else
>> static inline int
>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>> + struct pt_regs *regs, struct perf_sample_data *data)
>> {
>> return 0;
>> }
>> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>> if (!current->mm)
>> return;
>>
>> - if (perf_callchain_user32(regs, entry))
>> + if (perf_callchain_user32(entry, regs, data))
>> return;
>>
>> while (entry->nr < PERF_MAX_STACK_DEPTH) {
>> unsigned long bytes;
>> - frame.next_frame = NULL;
>> + frame.next_frame = NULL;
>> frame.return_address = 0;
>>
>> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
>> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>> perf_callchain_store(entry, frame.return_address);
>> fp = frame.next_frame;
>> }
>> +
>> + /* try LBR callstack if there is no frame pointer */
>> + if (fp == (void __user *)regs->bp)
>> + perf_callchain_lbr_callstack(entry, data);
>> }
>>
>> /*
>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>> index 0116970..536470d 100644
>> --- a/arch/x86/kernel/cpu/perf_event.h
>> +++ b/arch/x86/kernel/cpu/perf_event.h
>> @@ -390,6 +390,7 @@ struct x86_pmu {
>> * sysfs attrs
>> */
>> int attr_rdpmc;
>> + int attr_lbr_callstack;
>> struct attribute **format_attrs;
>> struct attribute **event_attrs;
>>
>> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>>
>> extern struct x86_pmu x86_pmu __read_mostly;
>>
>> +static inline bool x86_pmu_has_lbr_callstack(void)
>> +{
>> + return x86_pmu.lbr_sel_map &&
>> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
>> +}
>> +
>> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>>
>> int x86_perf_event_set_period(struct perf_event *event);
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>> index f59b46e..baa8384 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
>> },
>> };
>>
>> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
>> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
>> {
>> - /* user explicitly requested branch sampling */
>> - if (has_branch_stack(event))
>> - return true;
>> -
>> - /* implicit branch sampling to correct PEBS skid */
>> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
>> - x86_pmu.intel_cap.pebs_format < 2)
>> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
>> return true;
>>
>> return false;
>> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
>> * must disable before any actual event
>> * because any event may be combined with LBR
>> */
>> - if (intel_pmu_needs_lbr_smpl(event))
>> + if (needs_branch_stack(event))
>> intel_pmu_lbr_disable(event);
>>
>> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
>> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
>> * must enabled before any actual event
>> * because any event may be combined with LBR
>> */
>> - if (intel_pmu_needs_lbr_smpl(event))
>> + if (needs_branch_stack(event))
>> intel_pmu_lbr_enable(event);
>>
>> if (event->attr.exclude_host)
>> @@ -1237,7 +1232,8 @@ again:
>>
>> perf_sample_data_init(&data, 0, event->hw.last_period);
>>
>> - if (has_branch_stack(event))
>> + if (has_branch_stack(event) ||
>> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
>> data.br_stack = &cpuc->lbr_stack;
>>
>> if (perf_event_overflow(event, &data, regs))
>> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
>> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
>> x86_pmu.pebs_aliases(event);
>>
>> - if (intel_pmu_needs_lbr_smpl(event)) {
>> + if (needs_branch_stack(event)) {
>> ret = intel_pmu_setup_lbr_filter(event);
>> if (ret)
>> return ret;
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> index 43b16b4..3be2d7b 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>> int i, j, type;
>> bool compress = false;
>>
>> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
>> +
>> /* if sampling all branches, then nothing to filter */
>> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
>> return;
>> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>>
>> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>> + x86_pmu.attr_lbr_callstack = 1;
>>
>> pr_cont("16-deep LBR, ");
>> }
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index fa4c1bf..168e66e 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -97,6 +97,7 @@ struct perf_branch_entry {
>> * recent branch.
>> */
>> struct perf_branch_stack {
>> + unsigned user_callstack:1;
>> __u64 nr;
>> struct perf_branch_entry entries[0];
>> };
>> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
>> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
>> }
>>
>> +static inline bool needs_branch_stack(struct perf_event *event)
>> +{
>> + return event->attr.branch_sample_type != 0;
>> +}
>> +
>> extern int perf_output_begin(struct perf_output_handle *handle,
>> struct perf_event *event, unsigned int size);
>> extern void perf_output_end(struct perf_output_handle *handle);
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 4aad901..38eaa2b 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>> if (is_cgroup_event(event))
>> ctx->nr_cgroups++;
>>
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> ctx->nr_branch_stack++;
>>
>> list_add_rcu(&event->event_entry, &ctx->event_list);
>> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>> cpuctx->cgrp = NULL;
>> }
>>
>> - if (has_branch_stack(event)) {
>> + if (needs_branch_stack(event)) {
>> if (ctx->is_active)
>> __get_cpu_var(perf_branch_stack_events)--;
>> ctx->nr_branch_stack--;
>> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
>> static_key_slow_dec_deferred(&perf_sched_events);
>> }
>>
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> static_key_slow_dec_deferred(&perf_sched_events);
>> }
>>
>> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
>> goto done;
>>
>> + if (!has_branch_stack(event))
>> + event->attr.branch_sample_type = 0;
>> +
>> pmu = perf_init_event(event);
>>
>> done:
>> @@ -6577,7 +6580,7 @@ done:
>> return ERR_PTR(err);
>> }
>> }
>> - if (has_branch_stack(event))
>> + if (needs_branch_stack(event))
>> static_key_slow_inc(&perf_sched_events.key);
>> }
>>
>> --
>> 1.8.1.4
>>
On 06/26/2013 08:45 PM, Stephane Eranian wrote:
> Hi,
>
> And I think the reason why kernel is not possible deserves
> a full explanation in the changelog and in the code.
> It is still not clear to me whether this is a hw bug or a
> limitation.
It's a hardware bug. The problem I encountered is: if the FREEZE_ON_PMI bit is set and a
PMI happens near a call/return instruction, the LBR_TOS register may see a superfluous
increase/decrease (an increase/decrease of two for a call/return instruction).
Regards
Yan, Zheng
>
>
> On Wed, Jun 26, 2013 at 2:42 PM, Stephane Eranian <[email protected]> wrote:
>> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>>> From: "Yan, Zheng" <[email protected]>
>>>
>>> Try enabling the LBR call stack feature if event request recording
>>> callchain. Try utilizing the LBR call stack to get user callchain
>>> in case of there is no frame pointer.
>>>
>>> This patch also adds a cpu pmu attribute to enable/disable this
>>> feature.
>>>
>>> Signed-off-by: Yan, Zheng <[email protected]>
>>> ---
>>> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
>>> arch/x86/kernel/cpu/perf_event.h | 7 ++
>>> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
>>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
>>> include/linux/perf_event.h | 6 ++
>>> kernel/events/core.c | 11 ++-
>>> 6 files changed, 126 insertions(+), 49 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>>> index 639aa4d..a07eb03 100644
>>> --- a/arch/x86/kernel/cpu/perf_event.c
>>> +++ b/arch/x86/kernel/cpu/perf_event.c
>>> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>>>
>>> if (event->attr.precise_ip > precise)
>>> return -EOPNOTSUPP;
>>> + }
>>> + /*
>>> + * check that PEBS LBR correction does not conflict with
>>> + * whatever the user is asking with attr->branch_sample_type
>>> + */
>>> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
>>> + u64 *br_type = &event->attr.branch_sample_type;
>>> +
>>> + if (has_branch_stack(event)) {
>>> + if (!precise_br_compat(event))
>>> + return -EOPNOTSUPP;
>>> +
>>> + /* branch_sample_type is compatible */
>>> +
>>> + } else {
>>> + /*
>>> + * user did not specify branch_sample_type
>>> + *
>>> + * For PEBS fixups, we capture all
>>> + * the branches at the priv level of the
>>> + * event.
>>> + */
>>> + *br_type = PERF_SAMPLE_BRANCH_ANY;
>>> +
>>> + if (!event->attr.exclude_user)
>>> + *br_type |= PERF_SAMPLE_BRANCH_USER;
>>> +
>>> + if (!event->attr.exclude_kernel)
>>> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>> + }
>>> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>> + !has_branch_stack(event) &&
>>> + x86_pmu.attr_lbr_callstack &&
>>> + !event->attr.exclude_user &&
>>> + (event->attach_state & PERF_ATTACH_TASK)) {
>>
>> Yes, that's the test but it is wrong. I can pass the test if
>> I have exclude_user = exclude_kernel = 0.
>>
>> You want:
>> !event->attr.exclude_user && event->attr.exclude_kernel &&
>>
>> I tested that and it works.
>>
>>> /*
>>> - * check that PEBS LBR correction does not conflict with
>>> - * whatever the user is asking with attr->branch_sample_type
>>> + * user did not specify branch_sample_type,
>>> + * try using the LBR call stack facility to
>>> + * record call chains of user program.
>>> */
>>> - if (event->attr.precise_ip > 1 &&
>>> - x86_pmu.intel_cap.pebs_format < 2) {
>>> - u64 *br_type = &event->attr.branch_sample_type;
>>> -
>>> - if (has_branch_stack(event)) {
>>> - if (!precise_br_compat(event))
>>> - return -EOPNOTSUPP;
>>> -
>>> - /* branch_sample_type is compatible */
>>> -
>>> - } else {
>>> - /*
>>> - * user did not specify branch_sample_type
>>> - *
>>> - * For PEBS fixups, we capture all
>>> - * the branches at the priv level of the
>>> - * event.
>>> - */
>>> - *br_type = PERF_SAMPLE_BRANCH_ANY;
>>> -
>>> - if (!event->attr.exclude_user)
>>> - *br_type |= PERF_SAMPLE_BRANCH_USER;
>>> -
>>> - if (!event->attr.exclude_kernel)
>>> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>> - }
>>> - }
>>> + event->attr.branch_sample_type =
>>> + PERF_SAMPLE_BRANCH_USER |
>>> + PERF_SAMPLE_BRANCH_CALL_STACK;
>>> }
>>>
>>> /*
>>> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>>> return count;
>>> }
>>>
>>> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
>>> + struct device_attribute *attr, char *buf)
>>> +{
>>> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
>>> +}
>>> +
>>> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
>>> + struct device_attribute *attr,
>>> + const char *buf, size_t count)
>>> +{
>>> + unsigned long val = simple_strtoul(buf, NULL, 0);
>>> +
>>> + if (x86_pmu.attr_lbr_callstack != !!val) {
>>> + if (val && !x86_pmu_has_lbr_callstack())
>>> + return -EOPNOTSUPP;
>>> + x86_pmu.attr_lbr_callstack = !!val;
>>> + }
>>> + return count;
>>> +}
>>> +
>>> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
>>> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
>>> + get_attr_lbr_callstack, set_attr_lbr_callstack);
>>> +
>>>
>>> static struct attribute *x86_pmu_attrs[] = {
>>> &dev_attr_rdpmc.attr,
>>> + &dev_attr_lbr_callstack.attr,
>>> NULL,
>>> };
>>>
>>> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
>>> return get_desc_base(desc + idx);
>>> }
>>>
>>> +static inline void
>>> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
>>> + struct perf_sample_data *data)
>>> +{
>>> + struct perf_branch_stack *br_stack = data->br_stack;
>>> +
>>> + if (br_stack && br_stack->user_callstack &&
>>> + x86_pmu.attr_lbr_callstack) {
>>> + int i = 0;
>>> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
>>> + perf_callchain_store(entry, br_stack->entries[i].from);
>>> + i++;
>>> + }
>>> + }
>>> +}
>>> +
>>> #ifdef CONFIG_COMPAT
>>>
>>> #include <asm/compat.h>
>>>
>>> static inline int
>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>> {
>>> /* 32-bit process in 64-bit kernel. */
>>> unsigned long ss_base, cs_base;
>>> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> perf_callchain_store(entry, cs_base + frame.return_address);
>>> fp = compat_ptr(ss_base + frame.next_frame);
>>> }
>>> +
>>> + if (fp == compat_ptr(regs->bp))
>>> + perf_callchain_lbr_callstack(entry, data);
>>> +
>>> return 1;
>>> }
>>> #else
>>> static inline int
>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>> {
>>> return 0;
>>> }
>>> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>> if (!current->mm)
>>> return;
>>>
>>> - if (perf_callchain_user32(regs, entry))
>>> + if (perf_callchain_user32(entry, regs, data))
>>> return;
>>>
>>> while (entry->nr < PERF_MAX_STACK_DEPTH) {
>>> unsigned long bytes;
>>> - frame.next_frame = NULL;
>>> + frame.next_frame = NULL;
>>> frame.return_address = 0;
>>>
>>> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
>>> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>> perf_callchain_store(entry, frame.return_address);
>>> fp = frame.next_frame;
>>> }
>>> +
>>> + /* try LBR callstack if there is no frame pointer */
>>> + if (fp == (void __user *)regs->bp)
>>> + perf_callchain_lbr_callstack(entry, data);
>>> }
>>>
>>> /*
>>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>>> index 0116970..536470d 100644
>>> --- a/arch/x86/kernel/cpu/perf_event.h
>>> +++ b/arch/x86/kernel/cpu/perf_event.h
>>> @@ -390,6 +390,7 @@ struct x86_pmu {
>>> * sysfs attrs
>>> */
>>> int attr_rdpmc;
>>> + int attr_lbr_callstack;
>>> struct attribute **format_attrs;
>>> struct attribute **event_attrs;
>>>
>>> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>>>
>>> extern struct x86_pmu x86_pmu __read_mostly;
>>>
>>> +static inline bool x86_pmu_has_lbr_callstack(void)
>>> +{
>>> + return x86_pmu.lbr_sel_map &&
>>> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
>>> +}
>>> +
>>> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>>>
>>> int x86_perf_event_set_period(struct perf_event *event);
>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>>> index f59b46e..baa8384 100644
>>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>>> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
>>> },
>>> };
>>>
>>> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
>>> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
>>> {
>>> - /* user explicitly requested branch sampling */
>>> - if (has_branch_stack(event))
>>> - return true;
>>> -
>>> - /* implicit branch sampling to correct PEBS skid */
>>> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
>>> - x86_pmu.intel_cap.pebs_format < 2)
>>> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
>>> return true;
>>>
>>> return false;
>>> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
>>> * must disable before any actual event
>>> * because any event may be combined with LBR
>>> */
>>> - if (intel_pmu_needs_lbr_smpl(event))
>>> + if (needs_branch_stack(event))
>>> intel_pmu_lbr_disable(event);
>>>
>>> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
>>> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
>>> * must enabled before any actual event
>>> * because any event may be combined with LBR
>>> */
>>> - if (intel_pmu_needs_lbr_smpl(event))
>>> + if (needs_branch_stack(event))
>>> intel_pmu_lbr_enable(event);
>>>
>>> if (event->attr.exclude_host)
>>> @@ -1237,7 +1232,8 @@ again:
>>>
>>> perf_sample_data_init(&data, 0, event->hw.last_period);
>>>
>>> - if (has_branch_stack(event))
>>> + if (has_branch_stack(event) ||
>>> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
>>> data.br_stack = &cpuc->lbr_stack;
>>>
>>> if (perf_event_overflow(event, &data, regs))
>>> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
>>> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
>>> x86_pmu.pebs_aliases(event);
>>>
>>> - if (intel_pmu_needs_lbr_smpl(event)) {
>>> + if (needs_branch_stack(event)) {
>>> ret = intel_pmu_setup_lbr_filter(event);
>>> if (ret)
>>> return ret;
>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> index 43b16b4..3be2d7b 100644
>>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>>> int i, j, type;
>>> bool compress = false;
>>>
>>> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
>>> +
>>> /* if sampling all branches, then nothing to filter */
>>> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
>>> return;
>>> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>>>
>>> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>>> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>>> + x86_pmu.attr_lbr_callstack = 1;
>>>
>>> pr_cont("16-deep LBR, ");
>>> }
>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>> index fa4c1bf..168e66e 100644
>>> --- a/include/linux/perf_event.h
>>> +++ b/include/linux/perf_event.h
>>> @@ -97,6 +97,7 @@ struct perf_branch_entry {
>>> * recent branch.
>>> */
>>> struct perf_branch_stack {
>>> + unsigned user_callstack:1;
>>> __u64 nr;
>>> struct perf_branch_entry entries[0];
>>> };
>>> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
>>> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
>>> }
>>>
>>> +static inline bool needs_branch_stack(struct perf_event *event)
>>> +{
>>> + return event->attr.branch_sample_type != 0;
>>> +}
>>> +
>>> extern int perf_output_begin(struct perf_output_handle *handle,
>>> struct perf_event *event, unsigned int size);
>>> extern void perf_output_end(struct perf_output_handle *handle);
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index 4aad901..38eaa2b 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>>> if (is_cgroup_event(event))
>>> ctx->nr_cgroups++;
>>>
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> ctx->nr_branch_stack++;
>>>
>>> list_add_rcu(&event->event_entry, &ctx->event_list);
>>> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>>> cpuctx->cgrp = NULL;
>>> }
>>>
>>> - if (has_branch_stack(event)) {
>>> + if (needs_branch_stack(event)) {
>>> if (ctx->is_active)
>>> __get_cpu_var(perf_branch_stack_events)--;
>>> ctx->nr_branch_stack--;
>>> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
>>> static_key_slow_dec_deferred(&perf_sched_events);
>>> }
>>>
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> static_key_slow_dec_deferred(&perf_sched_events);
>>> }
>>>
>>> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
>>> goto done;
>>>
>>> + if (!has_branch_stack(event))
>>> + event->attr.branch_sample_type = 0;
>>> +
>>> pmu = perf_init_event(event);
>>>
>>> done:
>>> @@ -6577,7 +6580,7 @@ done:
>>> return ERR_PTR(err);
>>> }
>>> }
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> static_key_slow_inc(&perf_sched_events.key);
>>> }
>>>
>>> --
>>> 1.8.1.4
>>>
On 06/27/2013 12:48 AM, Stephane Eranian wrote:
> On Wed, Jun 26, 2013 at 1:54 PM, Peter Zijlstra <[email protected]> wrote:
>> On Tue, Jun 25, 2013 at 04:47:12PM +0800, Yan, Zheng wrote:
>>> From: "Yan, Zheng" <[email protected]>
>>>
>>> Haswell has a new feature that utilizes the existing Last Branch Record
>>> facility to record call chains. When the feature is enabled, function
>>> call will be collected as normal, but as return instructions are executed
>>> the last captured branch record is popped from the on-chip LBR registers.
>>> The LBR call stack facility can help perf to get call chains of program
>>> without frame pointer. When perf tool requests PERF_SAMPLE_CALLCHAIN +
>>> PERF_SAMPLE_BRANCH_USER, this feature is dynamically enabled by default.
>>> This feature can be disabled/enabled through an attribute file in the cpu
>>> pmu sysfs directory.
>>>
>>> The LBR call stack has following known limitations
>>> 1. Zero length calls are not filtered out by hardware
>>> 2. Exception handling such as setjmp/longjmp will have calls/returns not
>>> match
>>> 3. Pushing different return address onto the stack will have calls/returns
>>> not match
>>>
>>
>> You fail to mention what happens when the callstack is deeper than the
>> LBR is big -- a rather common issue I'd think.
>>
> LBR is statistical callstack. By nature, it cannot capture the entire chain.
>
>> From what I gather if you push when full, the TOS rotates and eats the
>> tail allowing you to add another entry to the head.
>>
>> If you pop when empty; nothing happens.
>>
> Not sure they know "empty" from "non empty", they just move the LBR_TOS
> by one entry on returns.
When popping, the hardware decreases LBR_TOS by one and clears the popped LBR_FROM/LBR_TO MSRs.
If you pop when the stack is empty, you will get an empty callchain.
Regards
Yan, Zheng
>
>> So on pretty much every program you'd be lucky to get the top of the
>> callstack but can end up with nearly nothing.
>>
> You will get the calls closest to the interrupt.
>
>> Given that, and the other limitations I don't think its a fair
>> replacement for user callchains.
>
> Well, the one advantage I see is that it works on stripped/optimized
> binaries without fp or dwarf info. Compared to dwarf and the stack
> snapshot, it does incur less overhead most likely. But yes, it comes
> with limitations.
>
On Thu, Jun 27, 2013 at 3:40 AM, Yan, Zheng <[email protected]> wrote:
> On 06/26/2013 08:42 PM, Stephane Eranian wrote:
>> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>>> From: "Yan, Zheng" <[email protected]>
>>>
>>> Try enabling the LBR call stack feature if event request recording
>>> callchain. Try utilizing the LBR call stack to get user callchain
>>> in case of there is no frame pointer.
>>>
>>> This patch also adds a cpu pmu attribute to enable/disable this
>>> feature.
>>>
>>> Signed-off-by: Yan, Zheng <[email protected]>
>>> ---
>>> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
>>> arch/x86/kernel/cpu/perf_event.h | 7 ++
>>> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
>>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
>>> include/linux/perf_event.h | 6 ++
>>> kernel/events/core.c | 11 ++-
>>> 6 files changed, 126 insertions(+), 49 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>>> index 639aa4d..a07eb03 100644
>>> --- a/arch/x86/kernel/cpu/perf_event.c
>>> +++ b/arch/x86/kernel/cpu/perf_event.c
>>> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>>>
>>> if (event->attr.precise_ip > precise)
>>> return -EOPNOTSUPP;
>>> + }
>>> + /*
>>> + * check that PEBS LBR correction does not conflict with
>>> + * whatever the user is asking with attr->branch_sample_type
>>> + */
>>> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
>>> + u64 *br_type = &event->attr.branch_sample_type;
>>> +
>>> + if (has_branch_stack(event)) {
>>> + if (!precise_br_compat(event))
>>> + return -EOPNOTSUPP;
>>> +
>>> + /* branch_sample_type is compatible */
>>> +
>>> + } else {
>>> + /*
>>> + * user did not specify branch_sample_type
>>> + *
>>> + * For PEBS fixups, we capture all
>>> + * the branches at the priv level of the
>>> + * event.
>>> + */
>>> + *br_type = PERF_SAMPLE_BRANCH_ANY;
>>> +
>>> + if (!event->attr.exclude_user)
>>> + *br_type |= PERF_SAMPLE_BRANCH_USER;
>>> +
>>> + if (!event->attr.exclude_kernel)
>>> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>> + }
>>> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>> + !has_branch_stack(event) &&
>>> + x86_pmu.attr_lbr_callstack &&
>>> + !event->attr.exclude_user &&
>>> + (event->attach_state & PERF_ATTACH_TASK)) {
>>
>> Yes, that's the test but it is wrong. I can pass the test if
>> I have exclude_user = exclude_kernel = 0.
>
> It's OK to have "exclude_user = exclude_kernel = 0". The LBR callstack feature is only used
> for getting callchain of user program. If "exclude_kernel = 0", we still use frame pointer
> to get the callchain.
>
And what's the point of collecting the user callstack when the counter
overflowed in the kernel? It won't be directly correlated to the IIP.
Now this mode is useful for some other measurements, but I think you
are after correlating
interrupted IP with callstack.
> Regards
> Yan, Zheng
>
>
>>
>> You want:
>> !event->attr.exclude_user && event->attr.exclude_kernel &&
>>
>> I tested that and it works.
>>
>>> /*
>>> - * check that PEBS LBR correction does not conflict with
>>> - * whatever the user is asking with attr->branch_sample_type
>>> + * user did not specify branch_sample_type,
>>> + * try using the LBR call stack facility to
>>> + * record call chains of user program.
>>> */
>>> - if (event->attr.precise_ip > 1 &&
>>> - x86_pmu.intel_cap.pebs_format < 2) {
>>> - u64 *br_type = &event->attr.branch_sample_type;
>>> -
>>> - if (has_branch_stack(event)) {
>>> - if (!precise_br_compat(event))
>>> - return -EOPNOTSUPP;
>>> -
>>> - /* branch_sample_type is compatible */
>>> -
>>> - } else {
>>> - /*
>>> - * user did not specify branch_sample_type
>>> - *
>>> - * For PEBS fixups, we capture all
>>> - * the branches at the priv level of the
>>> - * event.
>>> - */
>>> - *br_type = PERF_SAMPLE_BRANCH_ANY;
>>> -
>>> - if (!event->attr.exclude_user)
>>> - *br_type |= PERF_SAMPLE_BRANCH_USER;
>>> -
>>> - if (!event->attr.exclude_kernel)
>>> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>> - }
>>> - }
>>> + event->attr.branch_sample_type =
>>> + PERF_SAMPLE_BRANCH_USER |
>>> + PERF_SAMPLE_BRANCH_CALL_STACK;
>>> }
>>>
>>> /*
>>> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>>> return count;
>>> }
>>>
>>> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
>>> + struct device_attribute *attr, char *buf)
>>> +{
>>> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
>>> +}
>>> +
>>> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
>>> + struct device_attribute *attr,
>>> + const char *buf, size_t count)
>>> +{
>>> + unsigned long val = simple_strtoul(buf, NULL, 0);
>>> +
>>> + if (x86_pmu.attr_lbr_callstack != !!val) {
>>> + if (val && !x86_pmu_has_lbr_callstack())
>>> + return -EOPNOTSUPP;
>>> + x86_pmu.attr_lbr_callstack = !!val;
>>> + }
>>> + return count;
>>> +}
>>> +
>>> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
>>> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
>>> + get_attr_lbr_callstack, set_attr_lbr_callstack);
>>> +
>>>
>>> static struct attribute *x86_pmu_attrs[] = {
>>> &dev_attr_rdpmc.attr,
>>> + &dev_attr_lbr_callstack.attr,
>>> NULL,
>>> };
>>>
>>> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
>>> return get_desc_base(desc + idx);
>>> }
>>>
>>> +static inline void
>>> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
>>> + struct perf_sample_data *data)
>>> +{
>>> + struct perf_branch_stack *br_stack = data->br_stack;
>>> +
>>> + if (br_stack && br_stack->user_callstack &&
>>> + x86_pmu.attr_lbr_callstack) {
>>> + int i = 0;
>>> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
>>> + perf_callchain_store(entry, br_stack->entries[i].from);
>>> + i++;
>>> + }
>>> + }
>>> +}
>>> +
>>> #ifdef CONFIG_COMPAT
>>>
>>> #include <asm/compat.h>
>>>
>>> static inline int
>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>> {
>>> /* 32-bit process in 64-bit kernel. */
>>> unsigned long ss_base, cs_base;
>>> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> perf_callchain_store(entry, cs_base + frame.return_address);
>>> fp = compat_ptr(ss_base + frame.next_frame);
>>> }
>>> +
>>> + if (fp == compat_ptr(regs->bp))
>>> + perf_callchain_lbr_callstack(entry, data);
>>> +
>>> return 1;
>>> }
>>> #else
>>> static inline int
>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>> {
>>> return 0;
>>> }
>>> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>> if (!current->mm)
>>> return;
>>>
>>> - if (perf_callchain_user32(regs, entry))
>>> + if (perf_callchain_user32(entry, regs, data))
>>> return;
>>>
>>> while (entry->nr < PERF_MAX_STACK_DEPTH) {
>>> unsigned long bytes;
>>> - frame.next_frame = NULL;
>>> + frame.next_frame = NULL;
>>> frame.return_address = 0;
>>>
>>> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
>>> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>> perf_callchain_store(entry, frame.return_address);
>>> fp = frame.next_frame;
>>> }
>>> +
>>> + /* try LBR callstack if there is no frame pointer */
>>> + if (fp == (void __user *)regs->bp)
>>> + perf_callchain_lbr_callstack(entry, data);
>>> }
>>>
>>> /*
>>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>>> index 0116970..536470d 100644
>>> --- a/arch/x86/kernel/cpu/perf_event.h
>>> +++ b/arch/x86/kernel/cpu/perf_event.h
>>> @@ -390,6 +390,7 @@ struct x86_pmu {
>>> * sysfs attrs
>>> */
>>> int attr_rdpmc;
>>> + int attr_lbr_callstack;
>>> struct attribute **format_attrs;
>>> struct attribute **event_attrs;
>>>
>>> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>>>
>>> extern struct x86_pmu x86_pmu __read_mostly;
>>>
>>> +static inline bool x86_pmu_has_lbr_callstack(void)
>>> +{
>>> + return x86_pmu.lbr_sel_map &&
>>> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
>>> +}
>>> +
>>> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>>>
>>> int x86_perf_event_set_period(struct perf_event *event);
>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>>> index f59b46e..baa8384 100644
>>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>>> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
>>> },
>>> };
>>>
>>> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
>>> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
>>> {
>>> - /* user explicitly requested branch sampling */
>>> - if (has_branch_stack(event))
>>> - return true;
>>> -
>>> - /* implicit branch sampling to correct PEBS skid */
>>> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
>>> - x86_pmu.intel_cap.pebs_format < 2)
>>> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
>>> return true;
>>>
>>> return false;
>>> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
>>> * must disable before any actual event
>>> * because any event may be combined with LBR
>>> */
>>> - if (intel_pmu_needs_lbr_smpl(event))
>>> + if (needs_branch_stack(event))
>>> intel_pmu_lbr_disable(event);
>>>
>>> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
>>> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
>>> * must enabled before any actual event
>>> * because any event may be combined with LBR
>>> */
>>> - if (intel_pmu_needs_lbr_smpl(event))
>>> + if (needs_branch_stack(event))
>>> intel_pmu_lbr_enable(event);
>>>
>>> if (event->attr.exclude_host)
>>> @@ -1237,7 +1232,8 @@ again:
>>>
>>> perf_sample_data_init(&data, 0, event->hw.last_period);
>>>
>>> - if (has_branch_stack(event))
>>> + if (has_branch_stack(event) ||
>>> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
>>> data.br_stack = &cpuc->lbr_stack;
>>>
>>> if (perf_event_overflow(event, &data, regs))
>>> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
>>> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
>>> x86_pmu.pebs_aliases(event);
>>>
>>> - if (intel_pmu_needs_lbr_smpl(event)) {
>>> + if (needs_branch_stack(event)) {
>>> ret = intel_pmu_setup_lbr_filter(event);
>>> if (ret)
>>> return ret;
>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> index 43b16b4..3be2d7b 100644
>>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>>> int i, j, type;
>>> bool compress = false;
>>>
>>> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
>>> +
>>> /* if sampling all branches, then nothing to filter */
>>> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
>>> return;
>>> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>>>
>>> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>>> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>>> + x86_pmu.attr_lbr_callstack = 1;
>>>
>>> pr_cont("16-deep LBR, ");
>>> }
>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>> index fa4c1bf..168e66e 100644
>>> --- a/include/linux/perf_event.h
>>> +++ b/include/linux/perf_event.h
>>> @@ -97,6 +97,7 @@ struct perf_branch_entry {
>>> * recent branch.
>>> */
>>> struct perf_branch_stack {
>>> + unsigned user_callstack:1;
>>> __u64 nr;
>>> struct perf_branch_entry entries[0];
>>> };
>>> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
>>> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
>>> }
>>>
>>> +static inline bool needs_branch_stack(struct perf_event *event)
>>> +{
>>> + return event->attr.branch_sample_type != 0;
>>> +}
>>> +
>>> extern int perf_output_begin(struct perf_output_handle *handle,
>>> struct perf_event *event, unsigned int size);
>>> extern void perf_output_end(struct perf_output_handle *handle);
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index 4aad901..38eaa2b 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>>> if (is_cgroup_event(event))
>>> ctx->nr_cgroups++;
>>>
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> ctx->nr_branch_stack++;
>>>
>>> list_add_rcu(&event->event_entry, &ctx->event_list);
>>> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>>> cpuctx->cgrp = NULL;
>>> }
>>>
>>> - if (has_branch_stack(event)) {
>>> + if (needs_branch_stack(event)) {
>>> if (ctx->is_active)
>>> __get_cpu_var(perf_branch_stack_events)--;
>>> ctx->nr_branch_stack--;
>>> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
>>> static_key_slow_dec_deferred(&perf_sched_events);
>>> }
>>>
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> static_key_slow_dec_deferred(&perf_sched_events);
>>> }
>>>
>>> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
>>> goto done;
>>>
>>> + if (!has_branch_stack(event))
>>> + event->attr.branch_sample_type = 0;
>>> +
>>> pmu = perf_init_event(event);
>>>
>>> done:
>>> @@ -6577,7 +6580,7 @@ done:
>>> return ERR_PTR(err);
>>> }
>>> }
>>> - if (has_branch_stack(event))
>>> + if (needs_branch_stack(event))
>>> static_key_slow_inc(&perf_sched_events.key);
>>> }
>>>
>>> --
>>> 1.8.1.4
>>>
>
On 06/27/2013 04:58 PM, Stephane Eranian wrote:
> On Thu, Jun 27, 2013 at 3:40 AM, Yan, Zheng <[email protected]> wrote:
>> On 06/26/2013 08:42 PM, Stephane Eranian wrote:
>>> On Tue, Jun 25, 2013 at 10:47 AM, Yan, Zheng <[email protected]> wrote:
>>>> From: "Yan, Zheng" <[email protected]>
>>>>
>>>> Try enabling the LBR call stack feature if event request recording
>>>> callchain. Try utilizing the LBR call stack to get user callchain
>>>> in case of there is no frame pointer.
>>>>
>>>> This patch also adds a cpu pmu attribute to enable/disable this
>>>> feature.
>>>>
>>>> Signed-off-by: Yan, Zheng <[email protected]>
>>>> ---
>>>> arch/x86/kernel/cpu/perf_event.c | 128 +++++++++++++++++++++--------
>>>> arch/x86/kernel/cpu/perf_event.h | 7 ++
>>>> arch/x86/kernel/cpu/perf_event_intel.c | 20 ++---
>>>> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 +
>>>> include/linux/perf_event.h | 6 ++
>>>> kernel/events/core.c | 11 ++-
>>>> 6 files changed, 126 insertions(+), 49 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>>>> index 639aa4d..a07eb03 100644
>>>> --- a/arch/x86/kernel/cpu/perf_event.c
>>>> +++ b/arch/x86/kernel/cpu/perf_event.c
>>>> @@ -399,37 +399,49 @@ int x86_pmu_hw_config(struct perf_event *event)
>>>>
>>>> if (event->attr.precise_ip > precise)
>>>> return -EOPNOTSUPP;
>>>> + }
>>>> + /*
>>>> + * check that PEBS LBR correction does not conflict with
>>>> + * whatever the user is asking with attr->branch_sample_type
>>>> + */
>>>> + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
>>>> + u64 *br_type = &event->attr.branch_sample_type;
>>>> +
>>>> + if (has_branch_stack(event)) {
>>>> + if (!precise_br_compat(event))
>>>> + return -EOPNOTSUPP;
>>>> +
>>>> + /* branch_sample_type is compatible */
>>>> +
>>>> + } else {
>>>> + /*
>>>> + * user did not specify branch_sample_type
>>>> + *
>>>> + * For PEBS fixups, we capture all
>>>> + * the branches at the priv level of the
>>>> + * event.
>>>> + */
>>>> + *br_type = PERF_SAMPLE_BRANCH_ANY;
>>>> +
>>>> + if (!event->attr.exclude_user)
>>>> + *br_type |= PERF_SAMPLE_BRANCH_USER;
>>>> +
>>>> + if (!event->attr.exclude_kernel)
>>>> + *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>>> + }
>>>> + } else if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>>> + !has_branch_stack(event) &&
>>>> + x86_pmu.attr_lbr_callstack &&
>>>> + !event->attr.exclude_user &&
>>>> + (event->attach_state & PERF_ATTACH_TASK)) {
>>>
>>> Yes, that's the test but it is wrong. I can pass the test if
>>> I have exclude_user = exclude_kernel = 0.
>>
>> It's OK to have "exclude_user = exclude_kernel = 0". The LBR callstack feature is only used
>> for getting callchain of user program. If "exclude_kernel = 0", we still use frame pointer
>> to get the callchain.
>>
> And what's the point of collecting the user callstack in a case when
> the counter overflowed
> in the kernel? It won't be directly correlated to the IIP.
>
> Now this mode is useful for some other measurements, but I think you
> are after correlating
> interrupted IP with callstack.
>
If "exclude_user = exclude_kernel = 0" and "(sample_type & PERF_SAMPLE_CALLCHAIN)" is set,
then when the counter overflows in the kernel, perf_sample_output() writes both the kernel
callchain and the *user callchain* to the sample buffer.
>
>> Regards
>> Yan, Zheng
>>
>>
>>>
>>> You want:
>>> !event->attr.exclude_user && event->attr.exclude_kernel &&
>>>
>>> I tested that and it works.
>>>
>>>> /*
>>>> - * check that PEBS LBR correction does not conflict with
>>>> - * whatever the user is asking with attr->branch_sample_type
>>>> + * user did not specify branch_sample_type,
>>>> + * try using the LBR call stack facility to
>>>> + * record call chains of user program.
>>>> */
>>>> - if (event->attr.precise_ip > 1 &&
>>>> - x86_pmu.intel_cap.pebs_format < 2) {
>>>> - u64 *br_type = &event->attr.branch_sample_type;
>>>> -
>>>> - if (has_branch_stack(event)) {
>>>> - if (!precise_br_compat(event))
>>>> - return -EOPNOTSUPP;
>>>> -
>>>> - /* branch_sample_type is compatible */
>>>> -
>>>> - } else {
>>>> - /*
>>>> - * user did not specify branch_sample_type
>>>> - *
>>>> - * For PEBS fixups, we capture all
>>>> - * the branches at the priv level of the
>>>> - * event.
>>>> - */
>>>> - *br_type = PERF_SAMPLE_BRANCH_ANY;
>>>> -
>>>> - if (!event->attr.exclude_user)
>>>> - *br_type |= PERF_SAMPLE_BRANCH_USER;
>>>> -
>>>> - if (!event->attr.exclude_kernel)
>>>> - *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
>>>> - }
>>>> - }
>>>> + event->attr.branch_sample_type =
>>>> + PERF_SAMPLE_BRANCH_USER |
>>>> + PERF_SAMPLE_BRANCH_CALL_STACK;
branch_sample_type is always (PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK),
which means the LBR stays unchanged while the CPU is in kernel mode.
Regards
Yan, Zheng
>>>> }
>>>>
>>>> /*
>>>> @@ -1825,10 +1837,34 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
>>>> return count;
>>>> }
>>>>
>>>> +static ssize_t get_attr_lbr_callstack(struct device *cdev,
>>>> + struct device_attribute *attr, char *buf)
>>>> +{
>>>> + return snprintf(buf, 40, "%d\n", x86_pmu.attr_lbr_callstack);
>>>> +}
>>>> +
>>>> +static ssize_t set_attr_lbr_callstack(struct device *cdev,
>>>> + struct device_attribute *attr,
>>>> + const char *buf, size_t count)
>>>> +{
>>>> + unsigned long val = simple_strtoul(buf, NULL, 0);
>>>> +
>>>> + if (x86_pmu.attr_lbr_callstack != !!val) {
>>>> + if (val && !x86_pmu_has_lbr_callstack())
>>>> + return -EOPNOTSUPP;
>>>> + x86_pmu.attr_lbr_callstack = !!val;
>>>> + }
>>>> + return count;
>>>> +}
>>>> +
>>>> static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
>>>> +static DEVICE_ATTR(lbr_callstack, S_IRUSR | S_IWUSR,
>>>> + get_attr_lbr_callstack, set_attr_lbr_callstack);
>>>> +
>>>>
>>>> static struct attribute *x86_pmu_attrs[] = {
>>>> &dev_attr_rdpmc.attr,
>>>> + &dev_attr_lbr_callstack.attr,
>>>> NULL,
>>>> };
>>>>
>>>> @@ -1955,12 +1991,29 @@ static unsigned long get_segment_base(unsigned int segment)
>>>> return get_desc_base(desc + idx);
>>>> }
>>>>
>>>> +static inline void
>>>> +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry,
>>>> + struct perf_sample_data *data)
>>>> +{
>>>> + struct perf_branch_stack *br_stack = data->br_stack;
>>>> +
>>>> + if (br_stack && br_stack->user_callstack &&
>>>> + x86_pmu.attr_lbr_callstack) {
>>>> + int i = 0;
>>>> + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) {
>>>> + perf_callchain_store(entry, br_stack->entries[i].from);
>>>> + i++;
>>>> + }
>>>> + }
>>>> +}
>>>> +
>>>> #ifdef CONFIG_COMPAT
>>>>
>>>> #include <asm/compat.h>
>>>>
>>>> static inline int
>>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>>> {
>>>> /* 32-bit process in 64-bit kernel. */
>>>> unsigned long ss_base, cs_base;
>>>> @@ -1989,11 +2042,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>>> perf_callchain_store(entry, cs_base + frame.return_address);
>>>> fp = compat_ptr(ss_base + frame.next_frame);
>>>> }
>>>> +
>>>> + if (fp == compat_ptr(regs->bp))
>>>> + perf_callchain_lbr_callstack(entry, data);
>>>> +
>>>> return 1;
>>>> }
>>>> #else
>>>> static inline int
>>>> -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>>>> +perf_callchain_user32(struct perf_callchain_entry *entry,
>>>> + struct pt_regs *regs, struct perf_sample_data *data)
>>>> {
>>>> return 0;
>>>> }
>>>> @@ -2023,12 +2081,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>>> if (!current->mm)
>>>> return;
>>>>
>>>> - if (perf_callchain_user32(regs, entry))
>>>> + if (perf_callchain_user32(entry, regs, data))
>>>> return;
>>>>
>>>> while (entry->nr < PERF_MAX_STACK_DEPTH) {
>>>> unsigned long bytes;
>>>> - frame.next_frame = NULL;
>>>> + frame.next_frame = NULL;
>>>> frame.return_address = 0;
>>>>
>>>> bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
>>>> @@ -2041,6 +2099,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
>>>> perf_callchain_store(entry, frame.return_address);
>>>> fp = frame.next_frame;
>>>> }
>>>> +
>>>> + /* try LBR callstack if there is no frame pointer */
>>>> + if (fp == (void __user *)regs->bp)
>>>> + perf_callchain_lbr_callstack(entry, data);
>>>> }
>>>>
>>>> /*
>>>> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
>>>> index 0116970..536470d 100644
>>>> --- a/arch/x86/kernel/cpu/perf_event.h
>>>> +++ b/arch/x86/kernel/cpu/perf_event.h
>>>> @@ -390,6 +390,7 @@ struct x86_pmu {
>>>> * sysfs attrs
>>>> */
>>>> int attr_rdpmc;
>>>> + int attr_lbr_callstack;
>>>> struct attribute **format_attrs;
>>>> struct attribute **event_attrs;
>>>>
>>>> @@ -496,6 +497,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
>>>>
>>>> extern struct x86_pmu x86_pmu __read_mostly;
>>>>
>>>> +static inline bool x86_pmu_has_lbr_callstack(void)
>>>> +{
>>>> + return x86_pmu.lbr_sel_map &&
>>>> + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
>>>> +}
>>>> +
>>>> DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
>>>>
>>>> int x86_perf_event_set_period(struct perf_event *event);
>>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>>>> index f59b46e..baa8384 100644
>>>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>>>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>>>> @@ -882,15 +882,10 @@ static __initconst const u64 atom_hw_cache_event_ids
>>>> },
>>>> };
>>>>
>>>> -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
>>>> +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event)
>>>> {
>>>> - /* user explicitly requested branch sampling */
>>>> - if (has_branch_stack(event))
>>>> - return true;
>>>> -
>>>> - /* implicit branch sampling to correct PEBS skid */
>>>> - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
>>>> - x86_pmu.intel_cap.pebs_format < 2)
>>>> + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
>>>> + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK))
>>>> return true;
>>>>
>>>> return false;
>>>> @@ -1054,7 +1049,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
>>>> * must disable before any actual event
>>>> * because any event may be combined with LBR
>>>> */
>>>> - if (intel_pmu_needs_lbr_smpl(event))
>>>> + if (needs_branch_stack(event))
>>>> intel_pmu_lbr_disable(event);
>>>>
>>>> if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
>>>> @@ -1115,7 +1110,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
>>>> * must enabled before any actual event
>>>> * because any event may be combined with LBR
>>>> */
>>>> - if (intel_pmu_needs_lbr_smpl(event))
>>>> + if (needs_branch_stack(event))
>>>> intel_pmu_lbr_enable(event);
>>>>
>>>> if (event->attr.exclude_host)
>>>> @@ -1237,7 +1232,8 @@ again:
>>>>
>>>> perf_sample_data_init(&data, 0, event->hw.last_period);
>>>>
>>>> - if (has_branch_stack(event))
>>>> + if (has_branch_stack(event) ||
>>>> + (event->ctx->task && intel_pmu_needs_lbr_callstack(event)))
>>>> data.br_stack = &cpuc->lbr_stack;
>>>>
>>>> if (perf_event_overflow(event, &data, regs))
>>>> @@ -1568,7 +1564,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
>>>> if (event->attr.precise_ip && x86_pmu.pebs_aliases)
>>>> x86_pmu.pebs_aliases(event);
>>>>
>>>> - if (intel_pmu_needs_lbr_smpl(event)) {
>>>> + if (needs_branch_stack(event)) {
>>>> ret = intel_pmu_setup_lbr_filter(event);
>>>> if (ret)
>>>> return ret;
>>>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>>> index 43b16b4..3be2d7b 100644
>>>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>>>> @@ -709,6 +709,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>>>> int i, j, type;
>>>> bool compress = false;
>>>>
>>>> + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel);
>>>> +
>>>> /* if sampling all branches, then nothing to filter */
>>>> if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
>>>> return;
>>>> @@ -861,6 +863,7 @@ void intel_pmu_lbr_init_hsw(void)
>>>>
>>>> x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
>>>> x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
>>>> + x86_pmu.attr_lbr_callstack = 1;
>>>>
>>>> pr_cont("16-deep LBR, ");
>>>> }
>>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>>> index fa4c1bf..168e66e 100644
>>>> --- a/include/linux/perf_event.h
>>>> +++ b/include/linux/perf_event.h
>>>> @@ -97,6 +97,7 @@ struct perf_branch_entry {
>>>> * recent branch.
>>>> */
>>>> struct perf_branch_stack {
>>>> + unsigned user_callstack:1;
>>>> __u64 nr;
>>>> struct perf_branch_entry entries[0];
>>>> };
>>>> @@ -759,6 +760,11 @@ static inline bool has_branch_stack(struct perf_event *event)
>>>> return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
>>>> }
>>>>
>>>> +static inline bool needs_branch_stack(struct perf_event *event)
>>>> +{
>>>> + return event->attr.branch_sample_type != 0;
>>>> +}
>>>> +
>>>> extern int perf_output_begin(struct perf_output_handle *handle,
>>>> struct perf_event *event, unsigned int size);
>>>> extern void perf_output_end(struct perf_output_handle *handle);
>>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>>> index 4aad901..38eaa2b 100644
>>>> --- a/kernel/events/core.c
>>>> +++ b/kernel/events/core.c
>>>> @@ -1117,7 +1117,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>>>> if (is_cgroup_event(event))
>>>> ctx->nr_cgroups++;
>>>>
>>>> - if (has_branch_stack(event))
>>>> + if (needs_branch_stack(event))
>>>> ctx->nr_branch_stack++;
>>>>
>>>> list_add_rcu(&event->event_entry, &ctx->event_list);
>>>> @@ -1274,7 +1274,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>>>> cpuctx->cgrp = NULL;
>>>> }
>>>>
>>>> - if (has_branch_stack(event)) {
>>>> + if (needs_branch_stack(event)) {
>>>> if (ctx->is_active)
>>>> __get_cpu_var(perf_branch_stack_events)--;
>>>> ctx->nr_branch_stack--;
>>>> @@ -3155,7 +3155,7 @@ static void free_event(struct perf_event *event)
>>>> static_key_slow_dec_deferred(&perf_sched_events);
>>>> }
>>>>
>>>> - if (has_branch_stack(event))
>>>> + if (needs_branch_stack(event))
>>>> static_key_slow_dec_deferred(&perf_sched_events);
>>>> }
>>>>
>>>> @@ -6545,6 +6545,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>>> if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
>>>> goto done;
>>>>
>>>> + if (!has_branch_stack(event))
>>>> + event->attr.branch_sample_type = 0;
>>>> +
>>>> pmu = perf_init_event(event);
>>>>
>>>> done:
>>>> @@ -6577,7 +6580,7 @@ done:
>>>> return ERR_PTR(err);
>>>> }
>>>> }
>>>> - if (has_branch_stack(event))
>>>> + if (needs_branch_stack(event))
>>>> static_key_slow_inc(&perf_sched_events.key);
>>>> }
>>>>
>>>> --
>>>> 1.8.1.4
>>>>
>>