2010-11-11 16:15:33

by Andi Kleen

Subject: [PATCH 1/3] perf-events: Add support for supplementary event registers

From: Andi Kleen <[email protected]>

Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event
that can be used to monitor any offcore accesses from a core.
This is a very useful event for various tunings, and it's
also needed to implement the generic LLC-* events correctly.

Unfortunately this event requires programming a mask in a separate
register. And worse this separate register is per core, not per
CPU thread.

This patch:
- Teaches perf_events that OFFCORE_RESPONSE needs extra parameters.
- Adds a new field to the user interface to pass the extra mask.
This reuses one of the unused fields for perf events. The change
is ABI neutral because no one is likely to have used OFFCORE_RESPONSE
before (with a zero mask it wouldn't count anything).
- Adds support to the Intel perf_event core to schedule the per
core resource. I tried to add generic infrastructure for this
that could also be used for other core resources.
The basic code is patterned after the similar AMD northbridge
constraints code.

Thanks to Stephane Eranian who pointed out some problems
in the original version and suggested improvements.
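
As an illustration (not part of the patch), a minimal userspace sketch
of how the new field is meant to be used, assuming headers from this
series; the event code and the mask 0xf601 (demand data reads that
miss) are examples only:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr = {
			.type = PERF_TYPE_RAW,
			.size = sizeof(attr),
			.config = 0xb7,		/* OFFCORE_RESPONSE_0 */
			.event_extra = 0xf601,	/* kernel writes this to MSR 0x1a6 */
		};
		/* count on the current task, any CPU */
		int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		return fd < 0;
	}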

Cc: [email protected]
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 56 +++++++++++++++
arch/x86/kernel/cpu/perf_event_intel.c | 120 ++++++++++++++++++++++++++++++++
include/linux/perf_event.h | 7 ++-
3 files changed, 182 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed63101..97133ec 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -93,6 +93,8 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

+struct intel_percore;
+
#define MAX_LBR_ENTRIES 16

struct cpu_hw_events {
@@ -126,6 +128,8 @@ struct cpu_hw_events {
void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ int percore_used;
+ struct intel_percore *per_core;

/*
* AMD specific bits
@@ -175,6 +179,24 @@ struct cpu_hw_events {
#define for_each_event_constraint(e, c) \
for ((e) = (c); (e)->weight; (e)++)

+/*
+ * Extra registers for specific events.
+ * Some events need large masks and require external MSRs.
+ * Define a mapping to these extra registers.
+ */
+
+struct extra_reg {
+ unsigned event;
+ unsigned msr;
+ u64 config_mask;
+ u64 valid_mask;
+};
+
+#define EVENT_EXTRA_REG(event, msr, m, vm) { event, msr, m, vm }
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
+#define EVENT_EXTRA_END {}
+
union perf_capabilities {
struct {
u64 lbr_format : 6;
@@ -219,6 +241,7 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
+ struct event_constraint *percore_constraints;
void (*quirks)(void);
int perfctr_second_write;

@@ -247,6 +270,11 @@ struct x86_pmu {
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
+
+ /*
+ * Extra registers for events
+ */
+ struct extra_reg *extra_regs;
};

static struct x86_pmu x86_pmu __read_mostly;
@@ -530,6 +558,28 @@ static int x86_pmu_hw_config(struct perf_event *event)
}

/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(struct perf_event *event)
+{
+ struct extra_reg *er;
+
+ if (!x86_pmu.extra_regs)
+ return 0;
+
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ if (er->event != (event->attr.config & er->config_mask))
+ continue;
+ if (event->attr.event_extra & ~er->valid_mask)
+ return -EINVAL;
+ event->hw.extra_reg = er->msr;
+ event->hw.extra_config = event->attr.event_extra;
+ break;
+ }
+ return 0;
+}
+
+/*
* Setup the hardware configuration for a given attr_type
*/
static int __x86_pmu_event_init(struct perf_event *event)
@@ -561,6 +611,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;

+ err = x86_pmu_extra_regs(event);
+ if (err)
+ return err;
+
return x86_pmu.hw_config(event);
}

@@ -876,6 +930,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
+ if (hwc->extra_reg)
+ wrmsrl(hwc->extra_reg, hwc->extra_config);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c8f5c08..bbe7fba 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,14 @@
#ifdef CONFIG_CPU_SUP_INTEL

+struct intel_percore {
+ raw_spinlock_t lock;
+ int ref;
+ u64 config;
+ unsigned extra_reg;
+ u64 extra_config;
+};
+static DEFINE_PER_CPU(struct intel_percore, intel_percore);
+
/*
* Intel PerfMon, used on Core and later.
*/
@@ -64,6 +73,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
EVENT_CONSTRAINT_END
};

+static struct extra_reg intel_nehalem_extra_regs[] =
+{
+ INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff), /* OFFCORE_RESPONSE1 */
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_nehalem_percore_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xb7, 0),
+ EVENT_CONSTRAINT_END
+};
+
static struct event_constraint intel_westmere_event_constraints[] =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +97,20 @@ static struct event_constraint intel_westmere_event_constraints[] =
EVENT_CONSTRAINT_END
};

+static struct extra_reg intel_westmere_extra_regs[] =
+{
+ INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff), /* OFFCORE_RESPONSE1 */
+ INTEL_EVENT_EXTRA_REG(0xbb, 0x1a7, 0xffff), /* OFFCORE_RESPONSE2 */
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_percore_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xb7, 0),
+ INTEL_EVENT_CONSTRAINT(0xbb, 0),
+ EVENT_CONSTRAINT_END
+};
+
static struct event_constraint intel_gen_event_constraints[] =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -794,6 +829,56 @@ intel_bts_constraints(struct perf_event *event)
}

static struct event_constraint *
+intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
+ struct event_constraint *c;
+ struct intel_percore *pc;
+
+ if (!x86_pmu.percore_constraints)
+ return NULL;
+
+ for (c = x86_pmu.percore_constraints; c->cmask; c++) {
+ if (e != c->code)
+ continue;
+
+ c = NULL;
+
+ /*
+ * Allocate resource per core.
+ * Currently only one such per core resource can be allocated.
+ */
+ pc = cpuc->per_core;
+ if (!pc)
+ break;
+ raw_spin_lock(&pc->lock);
+ if (pc->ref > 0) {
+ /* Allow identical settings */
+ if (hwc->config == pc->config &&
+ hwc->extra_reg == pc->extra_reg &&
+ hwc->extra_config == pc->extra_config) {
+ pc->ref++;
+ cpuc->percore_used = 1;
+ } else {
+ /* Deny due to conflict */
+ c = &emptyconstraint;
+ }
+ } else {
+ pc->config = hwc->config;
+ pc->extra_reg = hwc->extra_reg;
+ pc->extra_config = hwc->extra_config;
+ pc->ref = 1;
+ cpuc->percore_used = 1;
+ }
+ raw_spin_unlock(&pc->lock);
+ return c;
+ }
+
+ return NULL;
+}
+
+static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
struct event_constraint *c;
@@ -806,9 +891,29 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;

+ c = intel_percore_constraints(cpuc, event);
+ if (c)
+ return c;
+
return x86_get_event_constraints(cpuc, event);
}

+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct intel_percore *pc;
+
+ if (!cpuc->percore_used)
+ return;
+
+ pc = cpuc->per_core;
+ raw_spin_lock(&pc->lock);
+ pc->ref--;
+ BUG_ON(pc->ref < 0);
+ raw_spin_unlock(&pc->lock);
+ cpuc->percore_used = 0;
+}
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -854,6 +959,7 @@ static __initconst const struct x86_pmu core_pmu = {
*/
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
.event_constraints = intel_core_event_constraints,
};

@@ -929,6 +1035,7 @@ static __init int intel_pmu_init(void)
union cpuid10_eax eax;
unsigned int unused;
unsigned int ebx;
+ int cpu;
int version;

if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1010,7 +1117,10 @@ static __init int intel_pmu_init(void)
intel_pmu_lbr_init_nhm();

x86_pmu.event_constraints = intel_nehalem_event_constraints;
+ x86_pmu.percore_constraints =
+ intel_nehalem_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.extra_regs = intel_nehalem_extra_regs;
pr_cont("Nehalem events, ");
break;

@@ -1032,7 +1142,10 @@ static __init int intel_pmu_init(void)
intel_pmu_lbr_init_nhm();

x86_pmu.event_constraints = intel_westmere_event_constraints;
+ x86_pmu.percore_constraints =
+ intel_westmere_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.extra_regs = intel_westmere_extra_regs;
pr_cont("Westmere events, ");
break;

@@ -1043,6 +1156,13 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("generic architected perfmon, ");
}
+
+ for_each_possible_cpu(cpu) {
+ raw_spin_lock_init(&per_cpu(intel_percore, cpu).lock);
+ per_cpu(cpu_hw_events, cpu).per_core =
+ &per_cpu(intel_percore,
+ cpumask_first(topology_core_cpumask(cpu)));
+ }
return 0;
}

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 057bf22..a353594 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -224,7 +224,10 @@ struct perf_event_attr {
};

__u32 bp_type;
- __u64 bp_addr;
+ union {
+ __u64 bp_addr;
+ __u64 event_extra; /* Extra for some events */
+ };
__u64 bp_len;
};

@@ -529,6 +532,8 @@ struct hw_perf_event {
unsigned long event_base;
int idx;
int last_cpu;
+ unsigned extra_reg;
+ u64 extra_config;
};
struct { /* software */
struct hrtimer hrtimer;
--
1.7.1


2010-11-11 16:15:38

by Andi Kleen

Subject: [PATCH 2/3] perf: Add support for extra parameters for raw events

From: Andi Kleen <[email protected]>

Add support to the perf tool to specify extra values for raw events
and pass them to the kernel.

The new format is -e rXXXX[,YYYY]
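
For example (mask value illustrative), counting raw event 0xb7 with an
extra mask of 0xf601 for a workload:

	perf stat -e rb7,f601 ./workload

The value after the comma is parsed as hex and passed to the kernel in
the new attr.event_extra field.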

Signed-off-by: Andi Kleen <[email protected]>
---
tools/perf/Documentation/perf-list.txt | 6 ++++++
tools/perf/builtin-report.c | 7 ++++---
tools/perf/util/parse-events.c | 26 ++++++++++++++++++++------
tools/perf/util/parse-events.h | 2 +-
tools/perf/util/ui/browsers/hists.c | 3 ++-
5 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 399751b..c398390 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -61,6 +61,12 @@ raw encoding of 0x1A8 can be used:
You should refer to the processor specific documentation for getting these
details. Some of them are referenced in the SEE ALSO section below.

+Some special events like the Intel OFFCORE_RESPONSE events require
+extra parameters. These can be specified by appending a comma and
+the extra parameter as a hex number. Since the bitmasks can be
+complicated to compute, it is recommended to use a suitable mapping file
+with --event-map.
+
OPTIONS
-------
None
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 5de405d..7f6bcfb 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -187,7 +187,8 @@ static int process_read_event(event_t *event, struct perf_session *session __use
attr = perf_header__find_attr(event->read.id, &session->header);

if (show_threads) {
- const char *name = attr ? __event_name(attr->type, attr->config)
+ const char *name = attr ? __event_name(attr->type, attr->config,
+ 0)
: "unknown";
perf_read_values_add_value(&show_threads_values,
event->read.pid, event->read.tid,
@@ -197,7 +198,7 @@ static int process_read_event(event_t *event, struct perf_session *session __use
}

dump_printf(": %d %d %s %Lu\n", event->read.pid, event->read.tid,
- attr ? __event_name(attr->type, attr->config) : "FAIL",
+ attr ? __event_name(attr->type, attr->config, 0) : "FAIL",
event->read.value);

return 0;
@@ -275,7 +276,7 @@ static int hists__tty_browse_tree(struct rb_root *tree, const char *help)
const char *evname = NULL;

if (rb_first(&hists->entries) != rb_last(&hists->entries))
- evname = __event_name(hists->type, hists->config);
+ evname = __event_name(hists->type, hists->config, 0);

hists__fprintf_nr_sample_events(hists, evname, stdout);
hists__fprintf(hists, NULL, false, stdout);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 2cc7b3d..0fd3b8f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -279,15 +279,18 @@ const char *event_name(int counter)
u64 config = attrs[counter].config;
int type = attrs[counter].type;

- return __event_name(type, config);
+ return __event_name(type, config, attrs[counter].event_extra);
}

-const char *__event_name(int type, u64 config)
+const char *__event_name(int type, u64 config, u64 extra)
{
- static char buf[32];
+ static char buf[64];
+ int n;

if (type == PERF_TYPE_RAW) {
- sprintf(buf, "raw 0x%llx", config);
+ n = sprintf(buf, "raw 0x%llx", config);
+ if (extra)
+ sprintf(buf + n, ",%#llx", extra);
return buf;
}

@@ -669,9 +672,20 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
return EVT_FAILED;
n = hex2u64(str + 1, &config);
if (n > 0) {
- *strp = str + n + 1;
+ str += n + 1;
+ *strp = str;
attr->type = PERF_TYPE_RAW;
attr->config = config;
+
+ if (*str == ',') {
+ n = hex2u64(str + 1, &config);
+ if (n > 0) {
+ attr->event_extra = config;
+ str += n + 1;
+ *strp = str;
+ }
+ }
+
return EVT_HANDLED;
}
return EVT_FAILED;
@@ -983,7 +997,7 @@ void print_events(void)

printf("\n");
printf(" %-42s [%s]\n",
- "rNNN (see 'perf list --help' on how to encode it)",
+ "rNNN[,EEE] (see 'perf list --help' on how to encode it)",
event_type_descriptors[PERF_TYPE_RAW]);
printf("\n");

diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 1d6df9c..a4e20ed 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -21,7 +21,7 @@ extern struct perf_event_attr attrs[MAX_COUNTERS];
extern char *filters[MAX_COUNTERS];

extern const char *event_name(int ctr);
-extern const char *__event_name(int type, u64 config);
+extern const char *__event_name(int type, u64 config, u64 extra);

extern int parse_events(const struct option *opt, const char *str, int unset);
extern int parse_filter(const struct option *opt, const char *str, int unset);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index ebda8c3..fdbdfe1 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -991,7 +991,8 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)

while (nd) {
struct hists *hists = rb_entry(nd, struct hists, rb_node);
- const char *ev_name = __event_name(hists->type, hists->config);
+ const char *ev_name = __event_name(hists->type, hists->config,
+ 0);

key = hists__browse(hists, help, ev_name);
switch (key) {
--
1.7.1

2010-11-11 16:15:47

by Andi Kleen

Subject: [PATCH 3/3] perf-events: Fix LLC-* events on Intel Nehalem/Westmere

From: Andi Kleen <[email protected]>

The generic perf LLC-* events count the L2 caches, not the real
L3 LLC, on Intel Nehalem and Westmere. This has led to quite some confusion.

Fixing this properly requires use of the special OFFCORE_RESPONSE
events, which need a separate mask register. This has been implemented
in an earlier patch.

Now use this infrastructure to set the correct events for LLC-*
on Nehalem and Westmere.
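
For example, with the table below, LLC-load-misses on Nehalem gets the
extra mask DMND_DATA_RD|RESP_MISS, i.e. 0x1|0xf600 = 0xf601, written to
the OFFCORE_RESPONSE mask register.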

Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 7 +++++-
arch/x86/kernel/cpu/perf_event_intel.c | 37 ++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 97133ec..3e429a1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -299,6 +299,10 @@ static u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
+static u64 __read_mostly hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
* Propagate event elapsed time into the generic event.
@@ -455,7 +459,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return -EINVAL;

hwc->config |= val;
-
+ hwc->extra_config =
+ hw_cache_extra_regs[cache_type][cache_op][cache_result];
return 0;
}

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index bbe7fba..a06dae3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -215,6 +215,39 @@ static __initconst const u64 westmere_hw_cache_event_ids
},
};

+/*
+ * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
+ */
+
+#define DMND_DATA_RD (1 << 0)
+#define DMND_RFO (1 << 1)
+#define DMND_WB (1 << 3)
+#define PF_DATA_RD (1 << 4)
+#define PF_DATA_RFO (1 << 5)
+#define RESP_UNCORE_HIT (1 << 8)
+#define RESP_MISS (0xf600) /* non uncore hit */
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
+ [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
+ [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
+ [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
+ },
+ }
+};
+
static __initconst const u64 nehalem_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1113,6 +1146,8 @@ static __init int intel_pmu_init(void)
case 46: /* 45 nm nehalem-ex, "Beckton" */
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));

intel_pmu_lbr_init_nhm();

@@ -1138,6 +1173,8 @@ static __init int intel_pmu_init(void)
case 44: /* 32 nm nehalem, "Gulftown" */
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));

intel_pmu_lbr_init_nhm();

--
1.7.1

2010-11-11 17:35:22

by Andi Kleen

Subject: [PATCH/FIX] perf-events: Put the per cpu state for intel_pmu too


Small bug fix for the earlier offcore_response patch:
need to put the events for intel_pmu too, not only core_pmu.

Signed-off-by: Andi Kleen <[email protected]>

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a06dae3..9a71ffd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1031,6 +1031,7 @@ static __initconst const struct x86_pmu intel_pmu = {
*/
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,

.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,

--
[email protected] -- Speaking for myself only.

2010-11-11 17:54:44

by Corey Ashford

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On 11/11/2010 08:15 AM, Andi Kleen wrote:
> From: Andi Kleen<[email protected]>
>
> Add support to the perf tool to specify extra values for raw events
> and pass them to the kernel.
>
> The new format is -e rXXXX[,YYYY]

It's not clear to me that a comma is the best character to use in this
instance, because multiple events can be listed after the -e separated
by commas as well, for example: -e rXXXX,cycles.
-e rXXXX,YYYY,cycles would be confusing to read.

How about -e rXXXX[+YYYY]
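
For example (values illustrative), -e rb7+f601,cycles would read
unambiguously, while -e rb7,f601,cycles is hard to tell apart from a
list of three events.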

- Corey

2010-11-11 17:54:49

by Stephane Eranian

Subject: Re: [PATCH/FIX] perf-events: Put the per cpu state for intel_pmu too

On Thu, Nov 11, 2010 at 6:35 PM, Andi Kleen <[email protected]> wrote:
>
> Small bug fix for the earlier offcore_response patch:
> need to put the events for intel_pmu too, not only core_pmu.
>

You don't need it for core pmu, this is for Core Duo/Solo
if I recall correctly.


> Signed-off-by: Andi Kleen <[email protected]>
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index a06dae3..9a71ffd 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1031,6 +1031,7 @@ static __initconst const struct x86_pmu intel_pmu = {
>         */
>        .max_period             = (1ULL << 31) - 1,
>        .get_event_constraints  = intel_get_event_constraints,
> +       .put_event_constraints  = intel_put_event_constraints,
>
>        .cpu_starting           = intel_pmu_cpu_starting,
>        .cpu_dying              = intel_pmu_cpu_dying,
>
> --
> [email protected] -- Speaking for myself only.
>

2010-11-11 18:06:58

by Stephane Eranian

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

Andi,

Thanks for creating this patch. It was on my TODO list for a while.
OFFCORE_RESPONSE is indeed a very useful event.

One thing I noticed in your patch is that you don't special
case the configuration where HT is off. In that case, the
sharing problem goes away. I think you could override
either way during init.

Some more tidbits:
- OFFCORE_RESPONSE_0 is 0x01b7
- OFFCORE_RESPONSE_1 is 0x01bb

The umask is not zero but 1. Don't know if you get
something meaningful if you pass a umask of zero.
But that's the user's responsibility to get right.

An alternative approach could have been to stash the
extra MSR value in the upper 32 bits of the config;
the mask is only 16 bits wide today. OFFCORE_RESPONSE is a
model-specific event. There is no guarantee it will be
there in future CPUs, so it would be safe to do that as well.
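
A sketch of that encoding (mask value illustrative):

	attr.config = 0x01b7ULL | ((u64)extra_mask << 32); /* OFFCORE_RESPONSE_0 */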

On Thu, Nov 11, 2010 at 5:15 PM, Andi Kleen <[email protected]> wrote:
> From: Andi Kleen <[email protected]>
>
> Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event
> that can be used to monitor any offcore accesses from a core.
> This is a very useful event for various tunings, and it's
> also needed to implement the generic LLC-* events correctly.
>
> Unfortunately this event requires programming a mask in a separate
> register. And worse this separate register is per core, not per
> CPU thread.
>
> This patch:
> - Teaches perf_events that OFFCORE_RESPONSE needs extra parameters.
> - Adds a new field to the user interface to pass the extra mask.
> This reuses one of the unused fields for perf events. The change
> is ABI neutral because no one is likely to have used OFFCORE_RESPONSE
> before (with a zero mask it wouldn't count anything).
> - Adds support to the Intel perf_event core to schedule the per
> core resource. I tried to add generic infrastructure for this
> that could also be used for other core resources.
> The basic code is patterned after the similar AMD northbridge
> constraints code.
>
> Thanks to Stephane Eranian who pointed out some problems
> in the original version and suggested improvements.
>
> Cc: [email protected]
> Signed-off-by: Andi Kleen <[email protected]>

2010-11-11 18:39:07

by Andi Kleen

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events


> It's not clear to me that a comma is the best character to use in this
> instance, because multiple events can be listed after the -e separated
> by commas as well, for example: -e rXXXX,cycles.
> -e rXXXX,YYYY,cycles would be confusing to read.
>
> How about -e rXXXX[+YYYY]

That's a good point. The comma seems to work in my testing for some
reason, but I guess I broke the multiple events case then. I'll switch
over to '+'.

-Andi

2010-11-11 18:42:08

by Andi Kleen

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On 11/11/2010 7:06 PM, Stephane Eranian wrote:

Thanks for the review.

> One thing I noticed in your patch is that you don't special
> case the configuration where HT is off. In that case, the
> sharing problem goes away. I think you could override
> either way during init.

The allocator should handle that transparently. With HT off the resource is
always free. Or do you see any concrete problems with the current code path?

> Some more tidbits:
> - OFFCORE_RESPONSE_0 is 0x01b7
> - OFFCORE_RESPONSE_1 is 0x01bb
>
> The umask is not zero but 1. Dont' know if you get
> something meaningful is you pass a umask of zero.

Hmm, I seem to get events that look meaningful with 0, but you're right, 1
is better. I used that in manual tests, but it wasn't in the cache event
mappings. I'll fix that for the next version.

-Andi

2010-11-11 18:44:08

by Andi Kleen

Subject: Re: [PATCH/FIX] perf-events: Put the per cpu state for intel_pmu too

On 11/11/2010 6:54 PM, Stephane Eranian wrote:
> On Thu, Nov 11, 2010 at 6:35 PM, Andi Kleen<[email protected]> wrote:
>> Small bug fix for the earlier offcore_response patch:
>> need to put the events for intel_pmu too, not only core_pmu.
>>
> You don't need it for core pmu, this is for Core Duo/Solo
> if I recall correctly.

It shouldn't matter because Core Duo doesn't have any per CPU resources,
but it seems more symmetric to call it in any case.

It may make a difference if other users of the per CPU resource code get
added.

-Andi

2010-11-11 19:09:14

by Peter Zijlstra

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On Thu, 2010-11-11 at 17:15 +0100, Andi Kleen wrote:

> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index ed63101..97133ec 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -93,6 +93,8 @@ struct amd_nb {
> struct event_constraint event_constraints[X86_PMC_IDX_MAX];
> };
>
> +struct intel_percore;
> +
> #define MAX_LBR_ENTRIES 16
>
> struct cpu_hw_events {
> @@ -126,6 +128,8 @@ struct cpu_hw_events {
> void *lbr_context;
> struct perf_branch_stack lbr_stack;
> struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];

There wants to be a comment here, this is definitely not LBR stuff.

> + int percore_used;
> + struct intel_percore *per_core;

Either per_core != NULL implies percore_used or it should be state
inside the struct.

>
> /*
> * AMD specific bits
> @@ -175,6 +179,24 @@ struct cpu_hw_events {
> #define for_each_event_constraint(e, c) \
> for ((e) = (c); (e)->weight; (e)++)
>
> +/*
> + * Extra registers for specific events.
> + * Some events need large masks and require external MSRs.
> + * Define a mapping to these extra registers.
> + */
> +
> +struct extra_reg {
> + unsigned event;
> + unsigned msr;
> + u64 config_mask;
> + u64 valid_mask;
> +};

s/unsigned/unsigned int/ (also later in the patch) and then align the
variable names.

> +
> +#define EVENT_EXTRA_REG(event, msr, m, vm) { event, msr, m, vm }

C99 initializers please
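
Something along these lines (sketch only):

	#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
		.event		= (e),		\
		.msr		= (ms),		\
		.config_mask	= (m),		\
		.valid_mask	= (vm),		\
		}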

> +#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
> + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
> +#define EVENT_EXTRA_END {}

Does that imply a zero filled struct?

> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index c8f5c08..bbe7fba 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1,5 +1,14 @@
> #ifdef CONFIG_CPU_SUP_INTEL
>
> +struct intel_percore {
> + raw_spinlock_t lock;
> + int ref;
> + u64 config;
> + unsigned extra_reg;
> + u64 extra_config;
> +};
> +static DEFINE_PER_CPU(struct intel_percore, intel_percore);

Please dynamically allocate these when needed, just like the AMD
north-bridge structure.

> /*
> * Intel PerfMon, used on Core and later.
> */

> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 057bf22..a353594 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -224,7 +224,10 @@ struct perf_event_attr {
> };
>
> __u32 bp_type;
> - __u64 bp_addr;
> + union {
> + __u64 bp_addr;
> + __u64 event_extra; /* Extra for some events */
> + };
> __u64 bp_len;
> };

I think I like Stephane's suggestion better: frob them into the existing
u64 word. Since it's model specific and we still have 33 empty bits in
the control register, there's plenty of space.


2010-11-11 19:25:54

by Andi Kleen

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On Thu, Nov 11, 2010 at 08:09:14PM +0100, Peter Zijlstra wrote:

Thanks for the review. I discovered another problem on my own too.

> > + int percore_used;
> > + struct intel_percore *per_core;
>
> Either per_core != NULL implies percore_used or it should be state
> inside the struct.

It does not, I'll clarify.

> > +#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
> > + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
> > +#define EVENT_EXTRA_END {}
>
> Does that imply a zero filled struct?

Yes.

> > +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> > @@ -1,5 +1,14 @@
> > #ifdef CONFIG_CPU_SUP_INTEL
> >
> > +struct intel_percore {
> > + raw_spinlock_t lock;
> > + int ref;
> > + u64 config;
> > + unsigned extra_reg;
> > + u64 extra_config;
> > +};
> > +static DEFINE_PER_CPU(struct intel_percore, intel_percore);
>
> Please dynamically allocate these when needed, just like the AMD
> north-bridge structure.

Fully dynamic is difficult because the topology discovery does not
really handle that nicely.

I can allocate at boot, but it will not save a lot of memory
(just one entry per core).

To be honest I would prefer not to make that change; are you sure
you want it?

> I think I like Stephane's suggestion better, frob them into the existing
> u64 word, since its model specific and we still have 33 empty bits in
> the control register there's plenty space.

Ok. I'll see how many changes that needs.

-Andi

--
[email protected] -- Speaking for myself only.

2010-11-11 19:36:40

by Peter Zijlstra

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On Thu, 2010-11-11 at 20:25 +0100, Andi Kleen wrote:
> On Thu, Nov 11, 2010 at 08:09:14PM +0100, Peter Zijlstra wrote:

> > > + int percore_used;
> > > + struct intel_percore *per_core;
> >
> > Either per_core != NULL implies percore_used or it should be state
> > inside the struct.
>
> It does not, I'll clarify.

Can we make it so, that is, move that int inside the intel_percore
struct?

> > > +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> > > @@ -1,5 +1,14 @@
> > > #ifdef CONFIG_CPU_SUP_INTEL
> > >
> > > +struct intel_percore {
> > > + raw_spinlock_t lock;
> > > + int ref;
> > > + u64 config;
> > > + unsigned extra_reg;
> > > + u64 extra_config;
> > > +};
> > > +static DEFINE_PER_CPU(struct intel_percore, intel_percore);
> >
> > Please dynamically allocate these when needed, just like the AMD
> > north-bridge structure.
>
> Fully dynamic is difficult because the topology discovery does not
> really handle that nicely.
>
> I can allocate at boot, but it will not save a lot of memory
> (just one entry per core)
>
> To be honest I would prefer not to do that change, are you sure
> you want it?

Hrm... but we only need to do this on nhm/wsm init; we already have code
detecting the cpu model. And I'm quite sure you know where to poke to
see if HT is enabled.
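
(Sketch of that approach; the HT check is an assumption: at nhm/wsm
init, only allocate and wire up the intel_percore structures when
smp_num_siblings > 1, and leave cpuc->per_core NULL otherwise, so
intel_percore_constraints() falls through without taking the lock.)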

2010-11-11 19:38:35

by Corey Ashford

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On 11/11/2010 10:39 AM, Andi Kleen wrote:
>
>> It's not clear to me that a comma is the best character to use in this
>> instance, because multiple events can be listed after the -e separated
>> by commas as well, for example: -e rXXXX,cycles.
>> -e rXXXX,YYYY,cycles would be confusing to read.
>>
>> How about -e rXXXX[+YYYY]
>
> That's a good point. The comma seems to work in my testing though for
> some reason, but I guess
> I broke the multiple events case then. I'll switch over to +
>
> -Andi
>


I think it would have parsed OK, but it is a confusing syntax to look
at. I should point out that there are also the u,k,h modifiers which
can be specified, even on raw events, so the full syntax looks something
like:

-e rXXXX[+YYYY][:u|:k|:h]

- Corey

2010-11-11 19:45:13

by Andi Kleen

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On 11/11/2010 8:36 PM, Peter Zijlstra wrote:
>
> Can we make it so, that is, move that int inside the intel_percore
> struct?
>

It's needed per cpu. Basically the flag says "bother to look into the
percore structure". This way most of this is kept out of the fast paths.
But one CPU in a core may need it and the other not.

In principle it could be merged into some other cpuc flag word if you
have any suggestions.

>
> Hrm,.. but we only need to do this on nhm/wsm init, we already have code
> detecting the cpu model. And I'm quite sure you know where to poke to
> see if HT is enabled.

Ok I guess I can do a dynamic per cpu allocation for this case.

-Andi

2010-11-11 19:49:13

by Peter Zijlstra

Subject: Re: [PATCH 1/3] perf-events: Add support for supplementary event registers

On Thu, 2010-11-11 at 20:45 +0100, Andi Kleen wrote:
> > Can we make it so, that is, move that int inside the intel_percore
> > struct?
> >
>
> It's needed per cpu. Basically the flag says "bother to look into the
> percore structure"
> This way most of this is kept out of the fast paths. But one CPU in a core
> may need it and the other not.
>
> In principle it could be merged into some other cpuc flag word if you
> have any suggestions.

Nah, I missed that detail, having it as is seems fine.

2010-11-11 19:49:41

by Andi Kleen

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

> I think it would have parsed OK, but it is a confusing syntax to
> look at. I should point out that there are also the u,k,h modifiers
> which can be specified, even on raw events, so the full syntax looks
> something like:
>
> -e rXXXX[+YYYY][:u|:k|:h]

Ok I can fix the help text for that.

-Andi

--
[email protected] -- Speaking for myself only.

2010-11-11 19:59:44

by Peter Zijlstra

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, 2010-11-11 at 20:49 +0100, Andi Kleen wrote:
> > I think it would have parsed OK, but it is a confusing syntax to
> > look at. I should point out that there are also the u,k,h modifiers
> > which can be specified, even on raw events, so the full syntax looks
> > something like:
> >
> > -e rXXXX[+YYYY][:u|:k|:h]
>
> Ok I can fix the help text for that.

If we're frobbing it in the existing config qword, do we still need
special syntax?

Also, I think we can use the same mechanism to program the
PEBS-load-latency MSR, right?

2010-11-11 20:13:08

by Andi Kleen

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, Nov 11, 2010 at 08:59:47PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-11 at 20:49 +0100, Andi Kleen wrote:
> > > I think it would have parsed OK, but it is a confusing syntax to
> > > look at. I should point out that there are also the u,k,h modifiers
> > > which can be specified, even on raw events, so the full syntax looks
> > > something like:
> > >
> > > -e rXXXX[+YYYY][:u|:k|:h]
> >
> > Ok I can fix the help text for that.
>
> If we're frobbing it in the existing config qword, do we still need
> special syntax?

No, it could all be merged then.

>
> Also, I think we can use the same mechanism to program the
> PEBS-load-latency MSR, right?

I haven't looked at that, but if it's a per core resource
the infrastructure can be reused at least.

-Andi
--
[email protected] -- Speaking for myself only.

2010-11-11 20:37:49

by Peter Zijlstra

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, 2010-11-11 at 21:12 +0100, Andi Kleen wrote:
> > Also, I think we can use the same mechanism to program the
> > PEBS-load-latency MSR, right?
>
> I haven't looked at that, but if it's a per core resource
> the infrastructure can be reused at least.

Can't remember if the load-latency msr is per-core, but it's an extra reg
that needs to be set.

The nhm/wsm LBR config msr is per-core though, so that could use your
infrastructure too.

2010-11-11 21:29:34

by Stephane Eranian

Subject: Re: [PATCH/FIX] perf-events: Put the per cpu state for intel_pmu too

On Thu, Nov 11, 2010 at 7:44 PM, Andi Kleen <[email protected]> wrote:
> On 11/11/2010 6:54 PM, Stephane Eranian wrote:
>>
>> On Thu, Nov 11, 2010 at 6:35 PM, Andi Kleen<[email protected]>  wrote:
>>>
>>> Small bug fix for the earlier offcore_response patch:
>>> need to put the events for intel_pmu too, not only core_pmu.
>>>
>> You don't need it for core pmu, this is for Core Duo/Solo
>> if I recall correctly.
>
> It shouldn't matter because Core Duo doesn't have any per CPU resources,
> but it seems more symmetric to call it in any case.
>
There is no HT on these processors. I don't know of any valuable HW
feature that is not supported on them.

2010-11-12 10:27:59

by Andi Kleen

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, Nov 11, 2010 at 09:37:38PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-11 at 21:12 +0100, Andi Kleen wrote:
> > > Also, I think we can use the same mechanism to program the
> > > PEBS-load-latency MSR, right?
> >
> > I haven't looked at that, but if it's a per core resource
> > the infrastructure can be reused at least.
>
> Can't remember if the load-latency msr is per-core, but it's an extra reg
> that needs to be set.

Ok. I guess it shouldn't be too hard to add.

Load-latency is pretty useful too.

>
> The nhm/wsm LBR config msr is per-core though, so that could use your
> infrastructure too.

Stephane mentioned that too. Not right now, but perhaps later I'll
look at using this.

-Andi


--
[email protected] -- Speaking for myself only.

2010-11-12 10:39:27

by Stephane Eranian

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, Nov 11, 2010 at 8:59 PM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2010-11-11 at 20:49 +0100, Andi Kleen wrote:
>> > I think it would have parsed OK, but it is a confusing syntax to
>> > look at.  I should point out that there are also the u,k,h modifiers
>> > which can be specified, even on raw events, so the full syntax looks
>> > something like:
>> >
>> > -e rXXXX[+YYYY][:u|:k|:h]
>>
>> Ok I can fix the help text for that.
>
> If we're frobbing it in the existing config qword, do we still need
> special syntax?
>
I don't think you need special syntax. You can simply come up with
the 64-bit raw hex value. Corey recently added a small utility to do
this via libpfm4: perf stat -e `evt2raw unhalted_core_cycles:u` ....

With libpfm4, I'll expose the event as a normal one. Therefore
you would do:

perf stat -e OFFCORE_RESPONSE_0:PF_DATA_RD:REMOTE_DRAM ...

This will encode to:
attr.config = 0x01b7ULL | ((1ULL << 4) | (1ULL << 13)) << 32


> Also, I think we can use the same mechanism to program the
> PEBS-load-latency MSR, right?
>
Yes, we could hardcode the latency the same way.

2010-11-12 10:41:31

by Stephane Eranian

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Thu, Nov 11, 2010 at 9:37 PM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2010-11-11 at 21:12 +0100, Andi Kleen wrote:
>> > Also, I think we can use the same mechanism to program the
>> > PEBS-load-latency MSR, right?
>>
>> I haven't looked at that, but if it's a per core resource
>> the infrastructure can be reused at least.
>
> Can't remember if the load-latency msr is per-core, but it's an extra reg
> that needs to be set.

Load latency is luckily per thread. Nothing special to do to handle this one.
We can restore its value from attr->config.

>
> The nhm/wsm LBR config msr is per-core though, so that could use your
> infrastructure too.
>
Exactly.

2010-11-12 10:48:07

by Andi Kleen

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

> I don't think you need special syntax. You can simply come up with
> the 64-bit raw hex value. Corey recently added a small utility to do
> this via libpfm4: perf stat -e `evt2raw unhalted_core_cycles:u` ....

I added a small patch to perf to read event mappings from a flat mapping file.

> > Also, I think we can use the same mechanism to program the
> > PEBS-load-latency MSR, right?
> >
> Yes, we could hardcode the latency the same way.

At least right now it would still need a constraint, because only
one counter can use it.

-Andi

--
[email protected] -- Speaking for myself only.

2010-11-12 10:48:57

by Peter Zijlstra

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, 2010-11-12 at 11:41 +0100, Stephane Eranian wrote:
> > Can't remember if the load-latency msr is per-core, but it's an extra reg
> > that needs to be set.
>
> Load latency is luckily per thread. Nothing special to do to handle this one.
> We can restore its value from attr->config.

Right, but in effect Andi adds two pieces of infrastructure:

- extra_reg, which determines if a particular cntr value needs extra
data, this is useful for both the offcore msr and the pebs-ll msr.

- per-core constraints, which is useful for shared msrs like the
offcore and lbr config things.

So while his current patch set only uses either piece once -- to provide
offcore support -- both pieces have in fact at least one more potential
user (for future patches).

2010-11-12 10:49:21

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 11:27 AM, Andi Kleen <[email protected]> wrote:
> On Thu, Nov 11, 2010 at 09:37:38PM +0100, Peter Zijlstra wrote:
>> On Thu, 2010-11-11 at 21:12 +0100, Andi Kleen wrote:
>> > > Also, I think we can use the same mechanism to program the
>> > > PEBS-load-latency MSR, right?
>> >
>> > I haven't looked at that, but if it's a per core resource
>> > the infrastructure can be reused at least.
>>
>> Can't remember if the load-latency msr is per-core, but its an extra reg
>> that needs to be set.
>
> Ok. I guess it shouldn't be too hard to add.
>
> Load-latency is pretty useful too.
>
Yes, it is to figure out where cache misses are.

>>
>> The nhm/wsm LBR config msr is per-core though, so that could use your
>> infrastructure too.
>
> Stephane mentioned that too. Not right now, but perhaps later I'll
> look at using this.
>
The difficulty with PEBS-LL (load latency) is not so much the encoding of the
latency. It is more how to expose the data back to the user. You get a full
PEBS record for each miss. So you get the PEBS machine state + data addr, miss
latency, and data source (where did the line come from). We have already
discussed how to expose machine state in general. I'll work on a patch for
this, so we can get the general PEBS machine state out. However, the question
is how do we expose data addr, latency, data source? We can reuse
SAMPLE_ADDR for the data address. Sample IP would point to the load
instruction (with help from LBR to correct the off-by-one issue). For the
latency and data source, I proposed using pseudo regs and leveraging the
sampled machine state mechanism. An alternative may be to define a new
record type that would be generic enough to be reusable, for instance
{ instr_addr, data_addr, latency, data_src, flags; }.
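
A sketch of such a record (all names hypothetical):

	struct perf_ll_record {		/* hypothetical layout */
		u64 instr_addr;		/* load instruction, LBR-corrected */
		u64 data_addr;		/* sampled data address */
		u64 latency;		/* miss latency in cycles */
		u64 data_src;		/* where the line came from */
		u64 flags;
	};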

But let's fix OFFCORE_RESPONSE_* first.

2010-11-12 10:52:23

by Stephane Eranian

Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 11:48 AM, Andi Kleen <[email protected]> wrote:
>> I don't think you need special syntax. You can simply come up with
>> the 64-bit raw hex value. Corey recently added a small utility to do
>> this via libpfm4: perf stat -e `evt2raw unhalted_core_cycles:u` ....
>
> I added a small patch to perf to read event mappings from a flat mapping file.
>
>> > Also, I think we can use the same mechanism to program the
>> > PEBS-load-latency MSR, right?
>> >
>> Yes, we could hardcode the latency the same way.
>
> At least right now it would still need a constraint, because only
> one counter can use it.
>

True. But there are already constraint tables for PEBS. For instance,
if I use INST_RETIRED (0xc0) with PEBS, then it cannot use a fixed
counter. So you can do a constraint for MEM_LOAD_RETIRED:LAT_ABOVE_THRES
the same way (include the umask).

2010-11-12 10:56:13

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 11:52 AM, Stephane Eranian <[email protected]> wrote:
> On Fri, Nov 12, 2010 at 11:48 AM, Andi Kleen <[email protected]> wrote:
>>> I don't think you need special syntax. You can simply come up with
>>> the 64-bit raw hex value. Corey recently added a small utility to do
>>> this via libpfm4: perf stat -e `evt2raw unhalted_core_cycles:u` ....
>>
>> I added a small patch to perf to read event mappings from a flat mapping file.
>>
>>> > Also, I think we can use the same mechanism to program the
>>> > PEBS-load-latency MSR, right?
>>> >
>>> Yes, we could hardcode the latency the same way.
>>
>> At least right now it would still need a constraint, because only
>> one counter can use it.
>>
>
> True. But there are already constraint tables for PEBS. For instance,
> if I use INST_RETIRED (0xc0) with PEBS, then it cannot use a fixed
> counter. So you can do a constraint for MEM_LOAD_RETIRED:LAT_ABOVE_THRES
> the same way (include the umask).
>
On second thought, there is no constraint for the event
MEM_LOAD_RETIRED:LAT_ABOVE_THRES. It can be measured in any
of the 4 generic counters. It's just that it can only be measured once,
given that the extra MSR would then be shared. One way to implement this
constraint is indeed to mark the event as being able to run only in one
counter. That may be the simplest way to implement this. In that case I
would use a counter which does not have many other constraints, e.g. 3 or 4.
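
A sketch of such a constraint entry, assuming the EVENT_CONSTRAINT()
helper from perf_event.c (the 0x10cb encoding -- event 0xcb, umask 0x10
for MEM_LOAD_RETIRED:LAT_ABOVE_THRES -- and the choice of counter 3 are
illustrative, not from any posted patch):

/* pin the load-latency event to generic counter 3 only (mask 0x8) */
static struct event_constraint pebs_ll_constraint =
	EVENT_CONSTRAINT(0x10cb, 0x8, INTEL_ARCH_EVENT_MASK);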

2010-11-12 11:37:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, 2010-11-12 at 11:49 +0100, Stephane Eranian wrote:
> The difficulty with PEBS-LL (load latency) is not so much the encoding of the
> latency. It is more how to expose the data back to the user. You get a full PEBS
> record for each miss. So you get the PEBS machine state + data addr, miss
> latency, and data source (where did the line come from). We have already
> discussed how to expose machine state in general. I'll work on a patch for this.

Frederic was working on this PERF_SAMPLE_REGS stuff as well for his
copy-the-stack-top-and-let-dwarfs-go-wild-at-it patches.

> So we can get the general PEBS machine state out. However, the question is
> how do we expose data addr, latency, data source? We can reuse the
> SAMPLE_ADDR for the data address. Sample IP would point to the load
> instruction (with help from LBR to correct the off by one issue). For
> the latency

Right, PERF_SAMPLE_IP and PERF_SAMPLE_ADDR

> and data source, I proposed using pseudo regs and leveraging the sampled machine
> state mechanism. An alternative may be to define a new record type that would be
> generic enough to be reusable, for instance { instr_addr, data_addr,
> latency, data_src, flags }.

I'm not sure I like the idea of pseudo regs.. I'm afraid it'll get messy
quite quickly. Load-latency is a bit like IBS that way, terribly messy.

2010-11-12 13:00:48

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 12:36 PM, Peter Zijlstra <[email protected]> wrote:
> On Fri, 2010-11-12 at 11:49 +0100, Stephane Eranian wrote:
>> The difficulty with PEBS-LL (load latency) is not so much the encoding of the
>> latency. It is more how to expose the data back to the user. You get a full PEBS
>> record for each miss. So you get the PEBS machine state + data addr, miss
>> latency, and data source (where did the line come from). We have already
>> discussed how to expose machine state in general. I'll work on a patch for this.
>
> Frederic was working on this PERF_SAMPLE_REGS stuff as well for his
> copy-the-stack-top-and-let-dwarfs-go-wild-at-it patches.
>
Ok, I'll talk to him again about this then.

>> So we can get the general PEBS machine state out. However, the question is
>> how do we expose data addr, latency, data source? We can reuse the
>> SAMPLE_ADDR for the data address. Sample IP would point to the load
>> instruction (with help from LBR to correct the off by one issue). For
>> the latency
>
> Right, PERF_SAMPLE_IP and PERF_SAMPLE_ADDR
>
>> and data source, I proposed using pseudo regs and leveraging the sampled machine
>> state mechanism. An alternative may be to define a new record type that would be
>> generic enough to be reusable, for instance { instr_addr, data_addr,
>> latency, data_src, flags }.
>
> I'm not sure I like the idea of pseudo regs.. I'm afraid it'll get messy
> quite quickly. Load-latency is a bit like IBS that way, terribly messy.
>

I don't understand what aspect you think is messy. When you are sampling
cache misses, you expect to get the tuple (instr addr, data addr, latency,
data source). That is what you get with AMD IBS, Nehalem PEBS-LL and
also Itanium D-EAR. I am sure IBM Power has something similar as well.
To collect this, you can either store the info in registers (AMD, Itanium)
or in a buffer (PEBS). But regardless of that you will always have to expose
the tuple. We have a solution for two out of 4 fields that reuses the existing
infrastructure. We need something else for the other two.

We should expect that in the future PMUs will collect more than code addresses.

2010-11-12 13:22:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, 2010-11-12 at 14:00 +0100, Stephane Eranian wrote:
> I don't understand what aspect you think is messy. When you are sampling
> cache misses, you expect to get the tuple (instr addr, data addr, latency,
> data source).

It's the data source thing I have most trouble with -- see below. The
latency isn't immediately clear either; I mean, the larger the bubble the
more hits the instruction will get, so there should be a correlation
between samples and latency.

> That is what you get with AMD IBS, Nehalem PEBS-LL and
> also Itanium D-EAR. I am sure IBM Power has something similar as well.
> To collect this, you can either store the info in registers (AMD, Itanium)
> or in a buffer (PEBS). But regardless of that you will always have to expose
> the tuple. We have a solution for two out of 4 fields that reuses the existing
> infrastructure. We need something else for the other two.

Well, if Intel PEBS, IA64 and PPC64 all have a data source thing, we can
simply add PERF_SAMPLE_SOURCE or somesuch and use that.

Do IA64/PPC64 have latency fields as well? PERF_SAMPLE_LATENCY would
seem to be the thing to use in that case.
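
If we go that route, the ABI addition would be a couple of new sample_type
bits along these lines (bit positions are assumptions; they would take
whatever bits are actually still free):

enum perf_event_sample_format {
	/* ... existing PERF_SAMPLE_* bits up to PERF_SAMPLE_RAW ... */
	PERF_SAMPLE_SOURCE	= 1U << 11,	/* assumed free */
	PERF_SAMPLE_LATENCY	= 1U << 12,	/* assumed free */
};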

BTW, what's the status of perf on IA64? And do we really still care
about that platform? It's pretty much dead, isn't it?

> We should expect that in the future PMUs will collect more than code addresses.

Sure, but I hate stuff that counts multiple events on a single base like
IBS does, and LL is similar to that: it's a fetch retire counter and then
you report where the fetch was satisfied from. So in effect you're measuring
l1/l2/l3/dram hit/miss all at the same time, but on a fetch basis.

Note that we need proper userspace for such crap as well, and libpfm
doesn't count; we need a full analysis tool in perf itself.

2010-11-12 13:30:29

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 02:00:43PM +0100, Stephane Eranian wrote:
> On Fri, Nov 12, 2010 at 12:36 PM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, 2010-11-12 at 11:49 +0100, Stephane Eranian wrote:
> >> The difficulty with PEBS-LL (load latency) is not so much the encoding of the
> >> latency. It is more how to expose the data back to the user. You get a full PEBS
> >> record for each miss. So you get the PEBS machine state + data addr, miss
> >> latency, and data source (where did the line come from). We have already
> >> discussed how to expose machine state in general. I'll work on a patch for this.
> >
> > Frederic was working on this PERF_SAMPLE_REGS stuff as well for his
> > copy-the-stack-top-and-let-dwarfs-go-wild-at-it patches.
> >
> Ok, I'll talk to him again about this then.


Would you like me to respin this isolated chunk of my dwarf cfi unwinding
patchset, so that we can discuss the ABI?

Thanks.

2010-11-12 14:03:36

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

Peter,

On Fri, Nov 12, 2010 at 2:21 PM, Peter Zijlstra <[email protected]> wrote:
> On Fri, 2010-11-12 at 14:00 +0100, Stephane Eranian wrote:
>> I don't understand what aspect you think is messy. When you are sampling
>> cache misses, you expect to get the tuple (instr addr, data addr, latency,
>> data source).
>
> It's the data source thing I have most trouble with -- see below. The
> latency isn't immediately clear either; I mean, the larger the bubble the
> more hits the instruction will get, so there should be a correlation
> between samples and latency.
>

The latency is the miss latency, i.e., the time to bring the cache line
back, measured from the point where the miss is detected. Seeing a latency
of 20 cycles does not mean you can assume the line came from the LLC: it
may already have been in flight by the time the load was issued. In other
words, the latency alone may not be enough to figure out where the line
actually came from.

As for the correlation to cycle sampling, they don't point to the same
location. With cycles, you point to the stalled instructions, i.e., where
you wait for the data to arrive. With PEBS-LL (and its variations on the
other archs), you point to the missing load instructions. Sometimes those
can be far apart; it depends on the code flow, instruction scheduling by
the compiler and so on. Backtracing from the stalled instruction to the
missing load is tricky business, especially with branches, interrupts and
such. Some people have tried that in the past.

What you are really after here is identifying the load misses that incur
serious stalls in your program. No single HW feature provides that. But by
combining cache miss and cycle profiles, I think you can get a good handle
on this.

Although the latency is a good hint for potential stalls, there is no
guarantee. A miss latency could be completely overlapped with execution,
and PEBS-LL (or its variations on the other archs) won't report the
overlap. You have to correlate this with a cycle profile. However, if you
get latencies of 40 cycles or more, it is highly unlikely the compiler was
able to hide that, so those are good candidates for prefetching of some
sort (assuming you get lots of samples like these).
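
As a sketch of that combined workflow (the raw encoding r10cb for
MEM_LOAD_RETIRED:LAT_ABOVE_THRES is an assumption, and it presumes the
kernel support discussed in this thread):

  # cycle profile: where the pipeline stalls
  perf record -e cycles ./app
  # load-miss profile: where the long-latency loads are issued
  perf record -e r10cb ./app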

>> That is what you get with AMD IBS, Nehalem PEBS-LL and
>> also Itanium D-EAR. I am sure IBM Power has something similar as well.
>> To collect this, you can either store the info in registers (AMD, Itanium)
>> or in a buffer (PEBS). But regardless of that you will always have to expose
>> the tuple. We have a solution for two out of 4 fields that reuses the existing
>> infrastructure. We need something else for the other two.
>
> Well, if Intel PEBS, IA64 and PPC64 all have a data source thing, we can
> simply add PERF_SAMPLE_SOURCE or somesuch and use that.
>

Itanium definitely does have a data source, and so does IBS. I don't know
about PPC64.

> Do IA64/PPC64 have latency fields as well? PERF_SAMPLE_LATENCY would
> seem to be the thing to use in that case.
>
That's fine too.

> BTW, what's the status of perf on IA64? And do we really still care
> about that platform? It's pretty much dead, isn't it?
>

It is not dead; there is one more CPU in the making, if I recall.
I did touch base with Tony Luck last week on this. I think adding
support for the basic counting stuff should be possible. You have
4 counters, with event constraints. Getting the constraints right
for some events is a bit tricky, and the constraint may depend on
the other events being measured. I have the code to do this at
the user level. If somebody wants to tackle it, I am willing to help.
Otherwise, it will have to wait until I get some more spare time
and access to Itanium hardware again.

>> We should expect that in the future PMUs will collect more than code addresses.
>
> Sure, but I hate stuff that counts multiple events on a single base like
> IBS does, and LL is similar to that: it's a fetch retire counter and then
> you report where the fetch was satisfied from. So in effect you're measuring
> l1/l2/l3/dram hit/miss all at the same time, but on a fetch basis.
>

PEBS-LL is different. You are counting on a single event, which is
MEM_LOAD_RETIRED. The threshold is a refinement to filter out
useless misses (the threshold can be as low as 4 cycles, the L1D latency).
When you sample on this you are only looking at explicit data load
misses. You ignore the code side and prefetches.

You need to wait until the instruction retires to be sure about the
miss latency. So associating this with LLC_MISSES instead would
be harder. By construction, you can also only track one load at a time.
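
For what it's worth, arming the threshold should boil down to a single MSR
write, on the order of this sketch (MSR 0x3f6 is the Nehalem load-latency
threshold register; the helper name is made up):

#define MSR_PEBS_LD_LAT_THRESHOLD	0x3f6

/* sketch: a threshold of 4 cycles (L1D latency) captures every load miss */
static void pebs_ll_set_threshold(u64 cycles)
{
	wrmsrl(MSR_PEBS_LD_LAT_THRESHOLD, cycles);
}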

> Note that we need proper userspace for such crap as well, and libpfm
> doesn't count; we need a full analysis tool in perf itself.

I understand that.

2010-11-12 15:00:16

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf: Add support for extra parameters for raw events

On Fri, Nov 12, 2010 at 2:30 PM, Frederic Weisbecker <[email protected]> wrote:
> On Fri, Nov 12, 2010 at 02:00:43PM +0100, Stephane Eranian wrote:
>> On Fri, Nov 12, 2010 at 12:36 PM, Peter Zijlstra <[email protected]> wrote:
>> > On Fri, 2010-11-12 at 11:49 +0100, Stephane Eranian wrote:
>> >> The difficulty with PEBS-LL (load latency) is not so much the encoding of the
>> >> latency. It is more how to expose the data back to the user. You get a full PEBS
>> >> record for each miss. So you get the PEBS machine state + data addr, miss
>> >> latency, and data source (where did the line come from). We have already
>> >> discussed how to expose machine state in general. I'll work on a patch for this.
>> >
>> > Frederic was working on this PERF_SAMPLE_REGS stuff as well for his
>> > copy-the-stack-top-and-let-dwarfs-go-wild-at-it patches.
>> >
>> Ok, I'll talk to him again about this then.
>
>
> Would you like me to respin this isolated chunk of my dwarf cfi unwinding
> patchset, so that we can discuss the ABI?
>
I'd like to see your proposed kernel API to expose machine state.

> Thanks.
>
>