This patch series introduces model specific events and implements AMD
IBS (Instruction Based Sampling) for perf_events.
IBS is documented here:
BIOS and Kernel Developer’s Guide (BKDG) For AMD Family 10h Processors
http://support.amd.com/us/Processor_TechDocs/31116.pdf
AMD64 Architecture Programmer’s Manual Volume 2: System Programming
http://support.amd.com/us/Processor_TechDocs/24593.pdf
The general approach is to introduce a flag to mark an event as model
specific. With that flag set, a model specific ibs (raw) config value
can be passed to the pmu for setup. When ibs samples are available,
they are sent back to userland as raw data samples. So we have a raw
config value and raw sampling data. This requires userland to set up
ibs and to extract and process the sampling data.
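For illustration, the raw payload of an IBS_FETCH sample is simply the
block of IBS fetch MSRs read by the interrupt handler later in this
series, so user-side decoding can be a plain struct overlay. A rough
sketch (the struct and field names are mine, not part of the patches):

struct ibs_fetch_sample {
	unsigned long long ibs_fetch_ctl;	/* MSR_AMD64_IBSFETCHCTL */
	unsigned long long ibs_fetch_lin_ad;	/* MSR_AMD64_IBSFETCHLINAD */
	unsigned long long ibs_fetch_phys_ad;	/* MSR_AMD64_IBSFETCHPHYSAD */
};

An IBS_OP sample carries seven u64 values accordingly, starting with
the image of MSR_AMD64_IBSOPCTL.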
Patches 1-8 rework and refactor the code to prepare for the ibs
implementation, which is done in patches 9-12.
I will add ibs example code to libpfm4.
-Robert
There may be constraints with a cmask set to zero. In this case
for_each_event_constraint() does not work properly. Now the weight is
used instead of the cmask for loop exit detection. The weight is
always non-zero, since by default it contains the HWEIGHT of the
counter mask, and in the other cases a weight of zero would not make
sense either.
This is in preparation for ibs event constraints that won't have a
cmask.
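For reference, the constraint macros this relies on look roughly like
this in perf_event.c (abridged sketch, not part of the diff below):

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))

So the default weight is HWEIGHT(n) of the counter bit mask and is
therefore non-zero even when cmask is zero.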
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f66f52a..feda380 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -170,7 +170,7 @@ struct cpu_hw_events {
EVENT_CONSTRAINT(0, 0, 0)
#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->cmask; (e)++)
+ for ((e) = (c); (e)->weight; (e)++)
union perf_capabilities {
struct {
--
1.7.0.3
The new code checks whether IBS is available on the cpu. It implements
only basic detection that should later be extended to read the ibs
cpuid feature flags.
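A possible later extension along those lines, as a sketch (the CPUID
leaf 0x8000001b and its bit layout are taken from the BKDG and should
be double-checked there):

static u32 get_ibs_caps(void)
{
	u32 caps;

	if (!boot_cpu_has(X86_FEATURE_IBS))
		return 0;

	/* IBS capability flags, CPUID Fn8000_001B_EAX */
	caps = cpuid_eax(0x8000001b);
	if (!(caps & 1))
		/* feature flags not valid: only basic fetch/op sampling */
		caps = 1;

	return caps;
}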
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 5 +++++
arch/x86/kernel/cpu/perf_event_amd.c | 2 ++
2 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 67b99a9..a42d033 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -242,6 +242,11 @@ struct x86_pmu {
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
+
+ /*
+ * AMD IBS
+ */
+ int ibs; /* cpuid flags */
};
static struct x86_pmu x86_pmu __read_mostly;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 611df11..246304d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -402,6 +402,8 @@ static __init int amd_pmu_init(void)
return -ENODEV;
x86_pmu = amd_pmu;
+ if (boot_cpu_has(X86_FEATURE_IBS))
+ x86_pmu.ibs = 1;
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
--
1.7.0.3
This patch implements code to handle ibs interrupts. If ibs data is
available, a raw perf_event data sample is created and sent back to
userland. Currently the data is stored only in the raw sample, but
this could be extended in a later patch by generating generic event
data such as the rip from the ibs sampling data.
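A possible shape of that extension, sketched against the handler added
below (not part of this patch):

	/*
	 * Sketch only: report the precise instruction pointer from
	 * the IBS registers instead of the interrupted context.
	 */
	u64 ip;

	if (idx == X86_PMC_IDX_SPECIAL_IBS_OP)
		rdmsrl(MSR_AMD64_IBSOPRIP, ip);
	else
		rdmsrl(MSR_AMD64_IBSFETCHLINAD, ip);
	regs.ip = ip;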
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/include/asm/msr-index.h | 3 ++
arch/x86/kernel/cpu/perf_event_amd.c | 65 +++++++++++++++++++++++++++++++++-
2 files changed, 67 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473ac..a7e4aa5 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -113,6 +113,7 @@
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSFETCH_SIZE 3
#define MSR_AMD64_IBSOPCTL 0xc0011033
#define MSR_AMD64_IBSOPRIP 0xc0011034
#define MSR_AMD64_IBSOPDATA 0xc0011035
@@ -120,7 +121,9 @@
#define MSR_AMD64_IBSOPDATA3 0xc0011037
#define MSR_AMD64_IBSDCLINAD 0xc0011038
#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSOP_SIZE 7
#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBS_SIZE_MAX MSR_AMD64_IBSOP_SIZE
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 3dc327c..78b0b34 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -283,6 +283,69 @@ static inline void __amd_pmu_enable_ibs_event(struct hw_perf_event *hwc)
__x86_pmu_enable_event(hwc, IBS_OP_ENABLE);
}
+static int amd_pmu_check_ibs(int idx, unsigned int msr, u64 valid,
+ u64 reenable, int size, struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event = cpuc->events[idx];
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ u64 buffer[MSR_AMD64_IBS_SIZE_MAX];
+ u64 *buf = buffer;
+ int i;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ return 0;
+
+ rdmsrl(msr++, *buf);
+ if (!(*buf++ & valid))
+ return 0;
+
+ perf_sample_data_init(&data, 0);
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ for (i = 1; i < size; i++)
+ rdmsrl(msr++, *buf++);
+ raw.size = sizeof(u64) * size;
+ raw.data = buffer;
+ data.raw = &raw;
+ }
+
+ regs = *iregs; /* later: update ip from ibs sample */
+
+ if (perf_event_overflow(event, 1, &data, &regs))
+ x86_pmu_stop(event);
+ else
+ __x86_pmu_enable_event(&event->hw, reenable);
+
+ return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ int handled, handled2;
+
+ handled = x86_pmu_handle_irq(regs);
+
+ if (!x86_pmu.ibs)
+ return handled;
+
+ handled2 = 0;
+ handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_FETCH,
+ MSR_AMD64_IBSFETCHCTL, IBS_FETCH_VAL,
+ IBS_FETCH_ENABLE, MSR_AMD64_IBSFETCH_SIZE,
+ regs);
+ handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_OP,
+ MSR_AMD64_IBSOPCTL, IBS_OP_VAL,
+ IBS_OP_ENABLE, MSR_AMD64_IBSOP_SIZE,
+ regs);
+
+ if (handled2)
+ inc_irq_stat(apic_perf_irqs);
+
+ return (handled || handled2);
+}
+
static void amd_pmu_disable_all(void)
{
x86_pmu_disable_all();
@@ -608,7 +671,7 @@ static void amd_pmu_cpu_dead(int cpu)
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
- .handle_irq = x86_pmu_handle_irq,
+ .handle_irq = amd_pmu_handle_irq,
.disable_all = amd_pmu_disable_all,
.enable_all = amd_pmu_enable_all,
.enable = amd_pmu_enable_event,
--
1.7.0.3
The changes are needed to introduce ibs events that are implemented as
special x86 events.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 18 +++++++++---------
1 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2a7c2fc..67b99a9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -281,7 +281,7 @@ x86_perf_event_update(struct perf_event *event)
int idx = hwc->idx;
s64 delta;
- if (idx == X86_PMC_IDX_SPECIAL_BTS)
+ if (idx >= X86_PMC_IDX_SPECIAL)
return 0;
/*
@@ -758,10 +758,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
- if (hwc->idx == X86_PMC_IDX_SPECIAL_BTS) {
- hwc->config_base = 0;
- hwc->event_base = 0;
- } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ if (hwc->idx < X86_PMC_IDX_FIXED) {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ } else if (hwc->idx < X86_PMC_IDX_SPECIAL) {
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
/*
* We set it so that event_base + idx in wrmsr/rdmsr maps to
@@ -769,9 +769,9 @@ static inline void x86_assign_hw_event(struct perf_event *event,
*/
hwc->event_base =
MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- } else {
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
+ } else if (hwc->idx == X86_PMC_IDX_SPECIAL_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
}
}
@@ -874,7 +874,7 @@ x86_perf_event_set_period(struct perf_event *event)
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
- if (idx == X86_PMC_IDX_SPECIAL_BTS)
+ if (idx >= X86_PMC_IDX_SPECIAL_BTS)
return 0;
/*
--
1.7.0.3
There are some pmu events such as Intel BTS or AMD IBS that do not fit
in the generic or fixed performance counter scheme. The upper bits
starting at bit 48 of the 64 bit counter mask are reserved for such
events and can be used to handle them. An event can be identified by
its index in the bit mask.
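For reference, the resulting index layout (values from the
perf_event.h hunk below):

/*
 *   0 .. 31   generic counters   (X86_PMC_IDX_GENERIC)
 *  32 .. 47   fixed counters     (X86_PMC_IDX_FIXED)
 *  48 .. 63   special events     (X86_PMC_IDX_SPECIAL),
 *             e.g. X86_PMC_IDX_SPECIAL_BTS == X86_PMC_IDX_SPECIAL + 0
 */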
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/include/asm/perf_event.h | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 6 +++---
arch/x86/kernel/cpu/perf_event_intel.c | 10 +++++-----
arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++--
4 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f6d43db..9f10215 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -10,6 +10,7 @@
#define X86_PMC_IDX_GENERIC 0
#define X86_PMC_IDX_FIXED 32
+#define X86_PMC_IDX_SPECIAL 48
#define X86_PMC_IDX_MAX 64
#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
@@ -107,7 +108,7 @@ union cpuid10_edx {
* values are used by actual fixed events and higher values are used
* to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
*/
-#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
+#define X86_PMC_IDX_SPECIAL_BTS (X86_PMC_IDX_SPECIAL + 0)
/* IbsFetchCtl bits/masks */
#define IBS_FETCH_RAND_EN (1ULL<<57)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index feda380..2a7c2fc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -281,7 +281,7 @@ x86_perf_event_update(struct perf_event *event)
int idx = hwc->idx;
s64 delta;
- if (idx == X86_PMC_IDX_FIXED_BTS)
+ if (idx == X86_PMC_IDX_SPECIAL_BTS)
return 0;
/*
@@ -758,7 +758,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
- if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ if (hwc->idx == X86_PMC_IDX_SPECIAL_BTS) {
hwc->config_base = 0;
hwc->event_base = 0;
} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
@@ -874,7 +874,7 @@ x86_perf_event_set_period(struct perf_event *event)
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
- if (idx == X86_PMC_IDX_FIXED_BTS)
+ if (idx == X86_PMC_IDX_SPECIAL_BTS)
return 0;
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a4b56ac..c7e6145 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -458,7 +458,7 @@ static void intel_pmu_disable_all(void)
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ if (test_bit(X86_PMC_IDX_SPECIAL_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
intel_pmu_pebs_disable_all();
@@ -473,9 +473,9 @@ static void intel_pmu_enable_all(int added)
intel_pmu_lbr_enable_all();
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ if (test_bit(X86_PMC_IDX_SPECIAL_BTS, cpuc->active_mask)) {
struct perf_event *event =
- cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ cpuc->events[X86_PMC_IDX_SPECIAL_BTS];
if (WARN_ON_ONCE(!event))
return;
@@ -550,7 +550,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (unlikely(hwc->idx == X86_PMC_IDX_SPECIAL_BTS)) {
intel_pmu_disable_bts();
intel_pmu_drain_bts_buffer();
return;
@@ -602,7 +602,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (unlikely(hwc->idx == X86_PMC_IDX_SPECIAL_BTS)) {
if (!__get_cpu_var(cpu_hw_events).enabled)
return;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ec8b2e1..e49a68a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -176,7 +176,7 @@ static int reserve_ds_buffers(void)
*/
static struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+ EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_BTS, 0);
static void intel_pmu_enable_bts(u64 config)
{
@@ -223,7 +223,7 @@ static void intel_pmu_drain_bts_buffer(void)
u64 to;
u64 flags;
};
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_SPECIAL_BTS];
struct bts_record *at, *top;
struct perf_output_handle handle;
struct perf_event_header header;
--
1.7.0.3
Moving x86_setup_perfctr(), no other changes made.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 120 +++++++++++++++++++-------------------
1 files changed, 59 insertions(+), 61 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index d275277..84b6107 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,7 +426,65 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
-static int x86_setup_perfctr(struct perf_event *event);
+static int x86_setup_perfctr(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+ } else {
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ if (!x86_pmu.apic)
+ return -EOPNOTSUPP;
+ }
+
+ if (attr->type == PERF_TYPE_RAW)
+ return 0;
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, attr);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+
+ /*
+ * The generic map:
+ */
+ config = x86_pmu.event_map(attr->config);
+
+ if (config == 0)
+ return -ENOENT;
+
+ if (config == -1LL)
+ return -EINVAL;
+
+ /*
+ * Branch tracing:
+ */
+ if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+ (hwc->sample_period == 1)) {
+ /* BTS is not supported by this architecture. */
+ if (!x86_pmu.bts)
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (!attr->exclude_kernel)
+ return -EOPNOTSUPP;
+ }
+
+ hwc->config |= config;
+
+ return 0;
+}
static int x86_pmu_hw_config(struct perf_event *event)
{
@@ -493,66 +551,6 @@ static int __hw_perf_event_init(struct perf_event *event)
return x86_setup_perfctr(event);
}
-static int x86_setup_perfctr(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- if (!hwc->sample_period) {
- hwc->sample_period = x86_pmu.max_period;
- hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
- }
-
- if (attr->type == PERF_TYPE_RAW)
- return 0;
-
- if (attr->type == PERF_TYPE_HW_CACHE)
- return set_ext_hw_attr(hwc, attr);
-
- if (attr->config >= x86_pmu.max_events)
- return -EINVAL;
-
- /*
- * The generic map:
- */
- config = x86_pmu.event_map(attr->config);
-
- if (config == 0)
- return -ENOENT;
-
- if (config == -1LL)
- return -EINVAL;
-
- /*
- * Branch tracing:
- */
- if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
- (hwc->sample_period == 1)) {
- /* BTS is not supported by this architecture. */
- if (!x86_pmu.bts)
- return -EOPNOTSUPP;
-
- /* BTS is currently only allowed for user-mode. */
- if (!attr->exclude_kernel)
- return -EOPNOTSUPP;
- }
-
- hwc->config |= config;
-
- return 0;
-}
-
static void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
--
1.7.0.3
The perfctr setup calls are in the corresponding .hw_config()
functions now. This makes it possible to introduce config functions
for other pmu events that are not perfctr specific.
Also, all of a sudden the code looks much nicer.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 9 ++-------
arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 84b6107..5945128 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -505,7 +505,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
- return 0;
+ return x86_setup_perfctr(event);
}
/*
@@ -543,12 +543,7 @@ static int __hw_perf_event_init(struct perf_event *event)
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
- /* Processor specifics */
- err = x86_pmu.hw_config(event);
- if (err)
- return err;
-
- return x86_setup_perfctr(event);
+ return x86_pmu.hw_config(event);
}
static void x86_pmu_disable_all(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 15367cc..9e00205 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -455,7 +455,7 @@ static int p4_hw_config(struct perf_event *event)
(p4_config_pack_escr(P4_ESCR_MASK_HT) |
p4_config_pack_cccr(P4_CCCR_MASK_HT));
- return 0;
+ return x86_setup_perfctr(event);
}
static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
--
1.7.0.3
Split __hw_perf_event_init() to configure pmu events other than
perfctrs. Perfctr code is moved to a separate function
x86_setup_perfctr(). This and the following patches refactor the code;
the rework is split into multiple patches for easier review.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
1 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 626154a..d275277 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,6 +426,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
+static int x86_setup_perfctr(struct perf_event *event);
+
static int x86_pmu_hw_config(struct perf_event *event)
{
/*
@@ -453,9 +455,6 @@ static int x86_pmu_hw_config(struct perf_event *event)
*/
static int __hw_perf_event_init(struct perf_event *event)
{
- struct perf_event_attr *attr = &event->attr;
- struct hw_perf_event *hwc = &event->hw;
- u64 config;
int err;
if (!x86_pmu_initialized())
@@ -482,15 +481,24 @@ static int __hw_perf_event_init(struct perf_event *event)
event->destroy = hw_perf_event_destroy;
- hwc->idx = -1;
- hwc->last_cpu = -1;
- hwc->last_tag = ~0ULL;
+ event->hw.idx = -1;
+ event->hw.last_cpu = -1;
+ event->hw.last_tag = ~0ULL;
/* Processor specifics */
err = x86_pmu.hw_config(event);
if (err)
return err;
+ return x86_setup_perfctr(event);
+}
+
+static int x86_setup_perfctr(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
--
1.7.0.3
This implements the perf nmi handler for ibs interrupts. The code was
copied from oprofile and should be merged at some point.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 10 ++++
arch/x86/kernel/cpu/perf_event_amd.c | 87 ++++++++++++++++++++++++++++++++++
2 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a42d033..8f9674f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -383,12 +383,15 @@ static void release_pmc_hardware(void) {}
static int reserve_ds_buffers(void);
static void release_ds_buffers(void);
+static int reserve_ibs_hardware(void);
+static void release_ibs_hardware(void);
static void hw_perf_event_destroy(struct perf_event *event)
{
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_ds_buffers();
+ release_ibs_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -537,6 +540,13 @@ static int __hw_perf_event_init(struct perf_event *event)
if (err)
release_pmc_hardware();
}
+ if (!err) {
+ err = reserve_ibs_hardware();
+ if (err) {
+ release_ds_buffers();
+ release_pmc_hardware();
+ }
+ }
}
if (!err)
atomic_inc(&active_events);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 246304d..27daead 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,5 +1,7 @@
#ifdef CONFIG_CPU_SUP_AMD
+#include <linux/pci.h>
+
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
static __initconst const u64 amd_hw_cache_event_ids
@@ -106,6 +108,91 @@ static const u64 amd_perfmon_event_map[] =
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
};
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* IBS - apic initialization, taken from oprofile, should be unified */
+
+static u8 ibs_eilvt_off;
+
+static inline void apic_init_ibs_nmi_per_cpu(void *arg)
+{
+ ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
+{
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int init_ibs_nmi(void)
+{
+#define IBSCTL_LVTOFFSETVAL (1 << 8)
+#define IBSCTL 0x1cc
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+
+ /* per CPU setup */
+ on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVTOFFSETVAL);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ pci_dev_put(cpu_cfg);
+ printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
+ "IBSCTL = 0x%08x", value);
+ return 1;
+ }
+ } while (1);
+
+ if (!nodes) {
+ printk(KERN_DEBUG "No CPU node configured for IBS");
+ return 1;
+ }
+
+ return 0;
+}
+
+/* uninitialize the APIC for the IBS interrupts if needed */
+static void clear_ibs_nmi(void)
+{
+ on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+}
+
+#else
+
+static inline int init_ibs_nmi(void) { return 1; }
+static inline void clear_ibs_nmi(void) { }
+
+#endif
+
+static int reserve_ibs_hardware(void)
+{
+ if (!x86_pmu.ibs)
+ return 0;
+ if (init_ibs_nmi())
+ /* something went wrong, disable ibs */
+ x86_pmu.ibs = 0;
+ return 0;
+}
+
+static void release_ibs_hardware(void)
+{
+ if (!x86_pmu.ibs)
+ return;
+ clear_ibs_nmi();
+}
+
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
--
1.7.0.3
This patch implements IBS for perf_event. It extends the AMD pmu to
handle model specific IBS events.
With the attr.model_spec bit set we can set up hardware events other
than generic performance counters. A special PMU 64 bit config value
can be passed through the perf_event interface. The concept of PMU
model-specific arguments was already used in Perfmon2. The type of
event (8 bits) is also determined from the config value; bits 32-39
are reserved for this.
There are two types of IBS events for instruction fetch (IBS_FETCH)
and instruction execution (IBS_OP). Both events are implemented as
special x86 events. The event allocation is implemented by using
special event constraints for ibs. This way, the generic event
scheduler can be used to allocate ibs events.
IBS can only be set up with raw (model specific) config values and raw
data samples. The event attributes for the syscall should be
programmed like this (IBS_FETCH):
memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_RAW;
attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
attr.config = IBS_FETCH_CONFIG_DEFAULT;
attr.config |=
((unsigned long long)MODEL_SPEC_TYPE_IBS_FETCH << 32)
& MODEL_SPEC_TYPE_MASK;
attr.model_spec = 1;
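The event could then be opened as usual, continuing the fragment above
(a sketch, error handling omitted; MODEL_SPEC_TYPE_IBS_FETCH is 0 in
this series, so userland has to mirror that value):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

fd = syscall(__NR_perf_event_open, &attr, 0 /* pid: current */,
	     -1 /* cpu: any */, -1 /* group_fd */, 0 /* flags */);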
The whole ibs example will be part of libpfm4.
The next patch implements the ibs interrupt handler.
Cc: Stephane Eranian <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/include/asm/perf_event.h | 4 +-
arch/x86/kernel/cpu/perf_event.c | 20 ++++
arch/x86/kernel/cpu/perf_event_amd.c | 161 ++++++++++++++++++++++++++++++++-
3 files changed, 179 insertions(+), 6 deletions(-)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 9f10215..fd5c326 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -102,13 +102,15 @@ union cpuid10_edx {
#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
/*
- * We model BTS tracing as another fixed-mode PMC.
+ * Masks for special PMU features
*
* We choose a value in the middle of the fixed event range, since lower
* values are used by actual fixed events and higher values are used
* to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
*/
#define X86_PMC_IDX_SPECIAL_BTS (X86_PMC_IDX_SPECIAL + 0)
+#define X86_PMC_IDX_SPECIAL_IBS_FETCH (X86_PMC_IDX_SPECIAL + 1)
+#define X86_PMC_IDX_SPECIAL_IBS_OP (X86_PMC_IDX_SPECIAL + 2)
/* IbsFetchCtl bits/masks */
#define IBS_FETCH_RAND_EN (1ULL<<57)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8f9674f..e2328f4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -184,6 +184,26 @@ union perf_capabilities {
};
/*
+ * Model specific hardware events
+ *
+ * With the attr.model_spec bit set we can set up hardware events
+ * other than generic performance counters. A special PMU 64 bit
+ * config value can be passed through the perf_event interface. The
+ * concept of PMU model-specific arguments was already used in
+ * Perfmon2. The type of event (8 bits) is determined from the config
+ * value too; bits 32-39 are reserved for this.
+ */
+#define MODEL_SPEC_TYPE_IBS_FETCH 0
+#define MODEL_SPEC_TYPE_IBS_OP 1
+
+#define MODEL_SPEC_TYPE_MASK (0xFFULL << 32)
+
+static inline int get_model_spec_type(u64 config)
+{
+ return (config & MODEL_SPEC_TYPE_MASK) >> 32;
+}
+
+/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 27daead..3dc327c 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,6 +2,10 @@
#include <linux/pci.h>
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK (IBS_OP_CNT_CTL | IBS_OP_MAX_CNT)
+#define AMD64_NUM_COUNTERS 4
+
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
static __initconst const u64 amd_hw_cache_event_ids
@@ -193,6 +197,118 @@ static void release_ibs_hardware(void)
clear_ibs_nmi();
}
+static inline void amd_pmu_disable_ibs(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val;
+
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ val &= ~IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSOPCTL, val);
+ val &= ~IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static inline void amd_pmu_enable_ibs(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val;
+
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ val |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSOPCTL, val);
+ val |= IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static int amd_pmu_ibs_config(struct perf_event *event)
+{
+ int type;
+
+ if (!x86_pmu.ibs)
+ return -ENODEV;
+
+ if (event->hw.sample_period)
+ /*
+ * The usage of the sample period attribute to
+ * calculate the IBS max count value is not yet
+ * supported, the max count must be in the raw config
+ * value.
+ */
+ return -ENOSYS;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ /* only raw sample types are supported */
+ return -EINVAL;
+
+ type = get_model_spec_type(event->attr.config);
+ switch (type) {
+ case MODEL_SPEC_TYPE_IBS_FETCH:
+ event->hw.config = IBS_FETCH_CONFIG_MASK & event->attr.config;
+ event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_FETCH;
+ /*
+ * dirty hack, needed for __x86_pmu_enable_event(), we
+ * should better change event->hw.config_base into
+ * event->hw.config_msr that already includes the index
+ */
+ event->hw.config_base = MSR_AMD64_IBSFETCHCTL - event->hw.idx;
+ break;
+ case MODEL_SPEC_TYPE_IBS_OP:
+ event->hw.config = IBS_OP_CONFIG_MASK & event->attr.config;
+ event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_OP;
+ event->hw.config_base = MSR_AMD64_IBSOPCTL - event->hw.idx;
+ break;
+ default:
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static inline void __amd_pmu_enable_ibs_event(struct hw_perf_event *hwc)
+{
+ if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_FETCH)
+ __x86_pmu_enable_event(hwc, IBS_FETCH_ENABLE);
+ else if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_OP)
+ __x86_pmu_enable_event(hwc, IBS_OP_ENABLE);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ x86_pmu_disable_all();
+ amd_pmu_disable_ibs();
+}
+
+static void amd_pmu_enable_all(int added)
+{
+ x86_pmu_enable_all(added);
+ amd_pmu_enable_ibs();
+}
+
+static void amd_pmu_enable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!cpuc->enabled)
+ return;
+
+ if (hwc->idx < X86_PMC_IDX_SPECIAL)
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ else
+ __amd_pmu_enable_ibs_event(hwc);
+}
+
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
@@ -200,7 +316,12 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
+ int ret;
+
+ if (event->attr.model_spec)
+ return amd_pmu_ibs_config(event);
+
+ ret = x86_pmu_hw_config(event);
if (ret)
return ret;
@@ -214,6 +335,23 @@ static int amd_pmu_hw_config(struct perf_event *event)
}
/*
+ * AMD64 events - list of special events (IBS)
+ */
+static struct event_constraint amd_event_constraints[] =
+{
+ /*
+ * The value for the weight of these constraints is higher
+ * than in the unconstrained case to process ibs after the
+ * generic counters in x86_schedule_events().
+ */
+ __EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_FETCH, 0,
+ AMD64_NUM_COUNTERS + 1),
+ __EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_OP, 0,
+ AMD64_NUM_COUNTERS + 1),
+ EVENT_CONSTRAINT_END
+};
+
+/*
* AMD64 events are detected based on their event codes.
*/
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -299,10 +437,23 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old = NULL;
+ struct event_constraint *c;
int max = x86_pmu.num_counters;
int i, j, k = -1;
/*
+ * The index of special events must be set in
+ * hw_perf_event_init(). The constraints are used for resource
+ * allocation by the event scheduler.
+ */
+ if (hwc->idx >= X86_PMC_IDX_SPECIAL) {
+ for_each_event_constraint(c, amd_event_constraints) {
+ if (test_bit(hwc->idx, c->idxmsk))
+ return c;
+ }
+ }
+
+ /*
* if not NB event or no NB, then no constraints
*/
if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
@@ -458,9 +609,9 @@ static void amd_pmu_cpu_dead(int cpu)
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_event,
.disable = x86_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
@@ -468,7 +619,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.perfctr = MSR_K7_PERFCTR0,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = 4,
+ .num_counters = AMD64_NUM_COUNTERS,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
--
1.7.0.3
This patch introduces a flag to mark events as model specific. The
flag can be used to set up hardware events other than generic
performance counters by passing special config data to the pmu. The
data can be interpreted differently than for generic events and can be
used for other purposes.
The concept of PMU model-specific arguments was already used in
Perfmon2, which provides an interface to pass model specific config
values to the pmu and to receive event specific samples back. The
model specific flag extends the perf_event interface with this feature
as well.
The userland program must be aware of the cpu model to create model
specific events. This could be done, for example, by checking the cpu
vendor and family.
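For illustration, such a check could look like this in userland (a
sketch using GCC's <cpuid.h>; the IBS bit in CPUID 8000_0001h ECX is
taken from the BKDG and should be verified there):

#include <cpuid.h>
#include <string.h>

static int cpu_has_amd_ibs(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return 0;
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
	if (strcmp(vendor, "AuthenticAMD"))
		return 0;
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		return 0;
	return !!(ecx & (1 << 10));	/* IBS feature flag */
}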
The model_spec flag can be arbitrarily reused on other models or
architectures. For backward compatibility, all architectures must
return an error if the bit is set but model_spec events are not
supported.
E.g., this flag is used to set up IBS on an AMD cpu. IBS is not common
to pmu features of other vendors or architectures. The pmu must be set
up with a special config value, and sample data is returned to
userland in a certain format. An IBS event is passed in the config
field of the event attributes with the model_spec and raw event flags
enabled. The samples are then passed back as raw samples.
Signed-off-by: Robert Richter <[email protected]>
---
arch/powerpc/kernel/perf_event.c | 3 +++
arch/sh/kernel/perf_event.c | 3 +++
arch/sparc/kernel/perf_event.c | 3 +++
arch/x86/kernel/cpu/perf_event.c | 3 +++
include/linux/perf_event.h | 3 ++-
5 files changed, 14 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 08460a2..8e68e15 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -1037,6 +1037,9 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
event->hw.config_base = ev;
event->hw.idx = 0;
+ if (attr->model_spec)
+ return ERR_PTR(-EOPNOTSUPP);
+
/*
* If we are not running on a hypervisor, force the
* exclude_hv bit to 0 so that we don't care what
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 81b6de4..eef545a 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -109,6 +109,9 @@ static int __hw_perf_event_init(struct perf_event *event)
if (!sh_pmu_initialized())
return -ENODEV;
+ if (attr->model_spec)
+ return -EOPNOTSUPP;
+
/*
* All of the on-chip counters are "limited", in that they have
* no interrupts, and are therefore unable to do sampling without
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index e277193..5c1d2a4 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1083,6 +1083,9 @@ static int __hw_perf_event_init(struct perf_event *event)
} else
return -EOPNOTSUPP;
+ if (attr->model_spec)
+ return -EOPNOTSUPP;
+
/* We save the enable bits in the config_base. */
hwc->config_base = sparc_pmu->irq_bit;
if (!attr->exclude_user)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5945128..9eeffad 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -432,6 +432,9 @@ static int x86_setup_perfctr(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
u64 config;
+ if (attr->model_spec)
+ return -EOPNOTSUPP;
+
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6e96cc8..e90ba6e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -204,8 +204,9 @@ struct perf_event_attr {
task : 1, /* trace fork/exit */
watermark : 1, /* wakeup_watermark */
precise : 1, /* OoO invariant counter */
+ model_spec : 1, /* model specific hw event */
- __reserved_1 : 48;
+ __reserved_1 : 47;
union {
__u32 wakeup_events; /* wakeup every n events */
--
1.7.0.3
To reuse this function for events with different enable bit masks,
the mask is now part of the function's argument list.
The function will be used later to control ibs events too.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 9 +++++----
arch/x86/kernel/cpu/perf_event_intel.c | 5 +++--
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9eeffad..f66f52a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -847,10 +847,10 @@ void hw_perf_enable(void)
x86_pmu.enable_all(added);
}
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+ u64 enable_mask)
{
- wrmsrl(hwc->config_base + hwc->idx,
- hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
}
static inline void x86_pmu_disable_event(struct perf_event *event)
@@ -922,7 +922,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (cpuc->enabled)
- __x86_pmu_enable_event(&event->hw);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a099df9..a4b56ac 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -513,7 +513,8 @@ static void intel_pmu_nhm_enable_all(int added)
if (!event)
continue;
- __x86_pmu_enable_event(&event->hw);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
}
}
intel_pmu_enable_all(added);
@@ -617,7 +618,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
if (unlikely(event->attr.precise))
intel_pmu_pebs_enable(event);
- __x86_pmu_enable_event(hwc);
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
--
1.7.0.3
On 13.04.10 22:23:09, Robert Richter wrote:
> This patch series introduces model specific events and implements AMD
> IBS (Instruction Based Sampling) for perf_events.
The patch set triggers this warning in perf_prepare_sample():
WARN_ON_ONCE(size & (sizeof(u64)-1))
Should a raw data sample be padded to 64 bit?
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 13.04.10 22:45:19, Robert Richter wrote:
> On 13.04.10 22:23:09, Robert Richter wrote:
> > This patch series introduces model specific events and implements AMD
> > IBS (Instruction Based Sampling) for perf_events.
>
> The patch set triggers this warning in perf_prepare_sample():
>
> WARN_ON_ONCE(size & (sizeof(u64)-1))
Just found Stephane's patch on the mailing list that removes the warning.
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On Tue, 2010-04-13 at 22:45 +0200, Robert Richter wrote:
> On 13.04.10 22:23:09, Robert Richter wrote:
> > This patch series introduces model specific events and implements AMD
> > IBS (Instruction Based Sampling) for perf_events.
>
> The patch set triggers this warning in perf_prepare_sample():
>
> WARN_ON_ONCE(size & (sizeof(u64)-1))
>
> Should a raw data sample be padded to 64 bit?
Yes it should.
On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
> This patch series introduces model specific events and implements AMD
> IBS (Instruction Based Sampling) for perf_events.
I would much rather it uses the ->precise thing PEBS also uses,
otherwise we keep growing funny arch extensions and end up with a
totally fragmented trainwreck of an ABI.
> The general approach is to introduce a flag to mark an event as model
> specific. With that flag set, a model specific ibs (raw) config value
> can be passed to the pmu for setup. When ibs samples are available,
> they are sent back to userland as raw data samples. So we have a raw
> config value and raw sampling data. This requires userland to set up
> ibs and to extract and process the sampling data.
>
> Patches 1-8 rework and refactor the code to prepare for the ibs
> implementation, which is done in patches 9-12.
>
> I will add ibs example code to libpfm4.
Please add a valid usecase to tools/perf/ instead.
On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
> This implements the perf nmi handler for ibs interrupts. The code was
> copied from oprofile and should be merged at some point.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_event.c | 10 ++++
> arch/x86/kernel/cpu/perf_event_amd.c | 87 ++++++++++++++++++++++++++++++++++
> 2 files changed, 97 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index a42d033..8f9674f 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> +static int init_ibs_nmi(void)
> +{
> +#define IBSCTL_LVTOFFSETVAL (1 << 8)
> +#define IBSCTL 0x1cc
> + struct pci_dev *cpu_cfg;
> + int nodes;
> + u32 value = 0;
> +
> + /* per CPU setup */
> + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
> +
> + nodes = 0;
> + cpu_cfg = NULL;
> + do {
> + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
> + PCI_DEVICE_ID_AMD_10H_NB_MISC,
> + cpu_cfg);
> + if (!cpu_cfg)
> + break;
> + ++nodes;
> + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
> + | IBSCTL_LVTOFFSETVAL);
> + pci_read_config_dword(cpu_cfg, IBSCTL, &value);
> + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
> + pci_dev_put(cpu_cfg);
> + printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
> + "IBSCTL = 0x%08x", value);
> + return 1;
> + }
> + } while (1);
> +
> + if (!nodes) {
> + printk(KERN_DEBUG "No CPU node configured for IBS");
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> +/* uninitialize the APIC for the IBS interrupts if needed */
> +static void clear_ibs_nmi(void)
> +{
> + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
> +}
That on_each_cpu() looks wonky, why isn't this in the hotplug hooks?
On 15.04.10 14:57:35, Peter Zijlstra wrote:
> > +/* uninitialize the APIC for the IBS interrupts if needed */
> > +static void clear_ibs_nmi(void)
> > +{
> > + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
> > +}
>
>
> That on_each_cpu() looks wonky, why isn't this in the hotplug hooks?
Right, it should be there. Will update this patch.
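Roughly what I have in mind, as a sketch (the setup_APIC_eilvt_ibs()
calls are the ones from the patch, the hook wiring is omitted):

static void amd_pmu_cpu_starting_ibs(int cpu)
{
	if (x86_pmu.ibs)
		ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
}

static void amd_pmu_cpu_dying_ibs(int cpu)
{
	if (x86_pmu.ibs)
		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
}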
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 15.04.10 09:44:21, Peter Zijlstra wrote:
> On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
> > This patch series introduces model specific events and implements AMD
> > IBS (Instruction Based Sampling) for perf_events.
>
> I would much rather it uses the ->precise thing PEBS also uses,
> otherwise we keep growing funny arch extensions and end up with a
> totally fragmented trainwreck of an ABI.
I agree that an existing flag could be reused. But the naming 'precise'
could be misleading. Maybe we rename it to 'model_spec' or something
else that underlines the idea of having model specific setups.
> > The general approach is to introduce a flag to mark an event as model
> > specific. With that flag set, a model specific ibs (raw) config value
> > can be passed to the pmu for setup. When ibs samples are available,
> > they are sent back to userland as raw data samples. So we have a raw
> > config value and raw sampling data. This requires userland to set up
> > ibs and to extract and process the sampling data.
> >
> > Patches 1-8 rework and refactor the code to prepare for the ibs
> > implementation, which is done in patches 9-12.
> >
> > I will add ibs example code to libpfm4.
>
> Please add a valid usecase to tools/perf/ instead.
I will also provide an example for tools/perf, but reusing libpfm4 was
a quick first solution for me.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
Robert,
Some comments below.
On Tue, Apr 13, 2010 at 10:23 PM, Robert Richter <[email protected]> wrote:
> This patch implements code to handle ibs interrupts. If ibs data is
> available, a raw perf_event data sample is created and sent back to
> userland. Currently the data is stored only in the raw sample, but
> this could be extended in a later patch by generating generic event
> data such as the rip from the ibs sampling data.
>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> arch/x86/include/asm/msr-index.h | 3 ++
> arch/x86/kernel/cpu/perf_event_amd.c | 65 +++++++++++++++++++++++++++++++++-
> 2 files changed, 67 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index bc473ac..a7e4aa5 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -113,6 +113,7 @@
> #define MSR_AMD64_IBSFETCHCTL 0xc0011030
> #define MSR_AMD64_IBSFETCHLINAD 0xc0011031
> #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
> +#define MSR_AMD64_IBSFETCH_SIZE 3
I would use COUNT instead of SIZE, given the unit is registers, not bytes.
> #define MSR_AMD64_IBSOPCTL 0xc0011033
> #define MSR_AMD64_IBSOPRIP 0xc0011034
> #define MSR_AMD64_IBSOPDATA 0xc0011035
> @@ -120,7 +121,9 @@
> #define MSR_AMD64_IBSOPDATA3 0xc0011037
> #define MSR_AMD64_IBSDCLINAD 0xc0011038
> #define MSR_AMD64_IBSDCPHYSAD 0xc0011039
> +#define MSR_AMD64_IBSOP_SIZE 7
Idem here for IBSOP.
> #define MSR_AMD64_IBSCTL 0xc001103a
> +#define MSR_AMD64_IBS_SIZE_MAX MSR_AMD64_IBSOP_SIZE
>
Idem.
> /* Fam 10h MSRs */
> #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
> diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
> index 3dc327c..78b0b34 100644
> --- a/arch/x86/kernel/cpu/perf_event_amd.c
> +++ b/arch/x86/kernel/cpu/perf_event_amd.c
> @@ -283,6 +283,69 @@ static inline void __amd_pmu_enable_ibs_event(struct hw_perf_event *hwc)
> __x86_pmu_enable_event(hwc, IBS_OP_ENABLE);
> }
>
> +static int amd_pmu_check_ibs(int idx, unsigned int msr, u64 valid,
> + u64 reenable, int size, struct pt_regs *iregs)
> +{
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> + struct perf_event *event = cpuc->events[idx];
> + struct perf_sample_data data;
> + struct perf_raw_record raw;
> + struct pt_regs regs;
> + u64 buffer[MSR_AMD64_IBS_SIZE_MAX];
> + u64 *buf = buffer;
> + int i;
> +
> + if (!test_bit(idx, cpuc->active_mask))
> + return 0;
> +
> + rdmsrl(msr++, *buf);
> + if (!(*buf++ & valid))
> + return 0;
> +
> + perf_sample_data_init(&data, 0);
> + if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> + for (i = 1; i < size; i++)
> + rdmsrl(msr++, *buf++);
> + raw.size = sizeof(u64) * size;
> + raw.data = buffer;
> + data.raw = &raw;
> + }
> +
Need to add the padding: raw.size = sizeof(u64) * size + sizeof(u32);
> + regs = *iregs; /* later: update ip from ibs sample */
> +
> + if (perf_event_overflow(event, 1, &data, &regs))
> + x86_pmu_stop(event);
> + else
> + __x86_pmu_enable_event(&event->hw, reenable);
> +
> + return 1;
> +}
> +
> +static int amd_pmu_handle_irq(struct pt_regs *regs)
> +{
> + int handled, handled2;
> +
> + handled = x86_pmu_handle_irq(regs);
> +
> + if (!x86_pmu.ibs)
> + return handled;
> +
> + handled2 = 0;
> + handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_FETCH,
> + MSR_AMD64_IBSFETCHCTL, IBS_FETCH_VAL,
> + IBS_FETCH_ENABLE, MSR_AMD64_IBSFETCH_SIZE,
> + regs);
> + handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_OP,
> + MSR_AMD64_IBSOPCTL, IBS_OP_VAL,
> + IBS_OP_ENABLE, MSR_AMD64_IBSOP_SIZE,
> + regs);
> +
> + if (handled2)
> + inc_irq_stat(apic_perf_irqs);
> +
If you have both regular counter intr + IBS you will double-count
apic_perf_irqs.
I would do: if (handled2 && !handled) inc_irq_stat().
Comments below:
On Tue, Apr 13, 2010 at 10:23 PM, Robert Richter <[email protected]> wrote:
> This patch implements IBS for perf_event. It extends the AMD pmu to
> handle model specific IBS events.
>
> With the attr.model_spec bit set we can set up hardware events other
> than generic performance counters. A special PMU 64 bit config value
> can be passed through the perf_event interface. The concept of PMU
> model-specific arguments was already used in Perfmon2. The type of
> event (8 bits) is also determined from the config value; bits 32-39
> are reserved for this.
>
> There are two types of IBS events for instruction fetch (IBS_FETCH)
> and instruction execution (IBS_OP). Both events are implemented as
> special x86 events. The event allocation is implemented by using
> special event constraints for ibs. This way, the generic event
> scheduler can be used to allocate ibs events.
>
> IBS can only be set up with raw (model specific) config values and raw
> data samples. The event attributes for the syscall should be
> programmed like this (IBS_FETCH):
>
> memset(&attr, 0, sizeof(attr));
> attr.type = PERF_TYPE_RAW;
> attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
> attr.config = IBS_FETCH_CONFIG_DEFAULT;
> attr.config |=
> ((unsigned long long)MODEL_SPEC_TYPE_IBS_FETCH << 32)
> & MODEL_SPEC_TYPE_MASK;
> attr.model_spec = 1;
>
> The whole ibs example will be part of libpfm4.
>
> The next patch implements the ibs interrupt handler.
>
> Cc: Stephane Eranian <[email protected]>
> Signed-off-by: Robert Richter <[email protected]>
> ---
> arch/x86/include/asm/perf_event.h | 4 +-
> arch/x86/kernel/cpu/perf_event.c | 20 ++++
> arch/x86/kernel/cpu/perf_event_amd.c | 161 ++++++++++++++++++++++++++++++++-
> 3 files changed, 179 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 9f10215..fd5c326 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -102,13 +102,15 @@ union cpuid10_edx {
> #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
>
> /*
> - * We model BTS tracing as another fixed-mode PMC.
> + * Masks for special PMU features
> *
> * We choose a value in the middle of the fixed event range, since lower
> * values are used by actual fixed events and higher values are used
> * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
> */
> #define X86_PMC_IDX_SPECIAL_BTS (X86_PMC_IDX_SPECIAL + 0)
> +#define X86_PMC_IDX_SPECIAL_IBS_FETCH (X86_PMC_IDX_SPECIAL + 1)
> +#define X86_PMC_IDX_SPECIAL_IBS_OP (X86_PMC_IDX_SPECIAL + 2)
>
> /* IbsFetchCtl bits/masks */
> #define IBS_FETCH_RAND_EN (1ULL<<57)
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 8f9674f..e2328f4 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -184,6 +184,26 @@ union perf_capabilities {
> };
>
> /*
> + * Model specific hardware events
> + *
> + * With the attr.model_spec bit set we can set up hardware events
> + * other than generic performance counters. A special PMU 64 bit
> + * config value can be passed through the perf_event interface. The
> + * concept of PMU model-specific arguments was already used in
> + * Perfmon2. The type of event (8 bits) is determined from the config
> + * value too; bits 32-39 are reserved for this.
> + */
> +#define MODEL_SPEC_TYPE_IBS_FETCH 0
> +#define MODEL_SPEC_TYPE_IBS_OP 1
> +
> +#define MODEL_SPEC_TYPE_MASK (0xFFULL << 32)
> +
> +static inline int get_model_spec_type(u64 config)
> +{
> + return (config & MODEL_SPEC_TYPE_MASK) >> 32;
> +}
> +
> +/*
> * struct x86_pmu - generic x86 pmu
> */
> struct x86_pmu {
> diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
> index 27daead..3dc327c 100644
> --- a/arch/x86/kernel/cpu/perf_event_amd.c
> +++ b/arch/x86/kernel/cpu/perf_event_amd.c
> @@ -2,6 +2,10 @@
>
> #include <linux/pci.h>
>
> +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
> +#define IBS_OP_CONFIG_MASK (IBS_OP_CNT_CTL | IBS_OP_MAX_CNT)
> +#define AMD64_NUM_COUNTERS 4
> +
> static DEFINE_RAW_SPINLOCK(amd_nb_lock);
>
> static __initconst const u64 amd_hw_cache_event_ids
> @@ -193,6 +197,118 @@ static void release_ibs_hardware(void)
> clear_ibs_nmi();
> }
>
> +static inline void amd_pmu_disable_ibs(void)
> +{
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> + u64 val;
> +
> + if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
> + rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
> + val &= ~IBS_FETCH_ENABLE;
> + wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
> + }
> + if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
> + rdmsrl(MSR_AMD64_IBSOPCTL, val);
> + val &= ~IBS_OP_ENABLE;
> + wrmsrl(MSR_AMD64_IBSOPCTL, val);
> + }
> +}
> +
> +static inline void amd_pmu_enable_ibs(void)
> +{
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> + u64 val;
> +
> + if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
> + rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
> + val |= IBS_FETCH_ENABLE;
> + wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
> + }
> + if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
> + rdmsrl(MSR_AMD64_IBSOPCTL, val);
> + val |= IBS_OP_ENABLE;
> + wrmsrl(MSR_AMD64_IBSOPCTL, val);
> + }
> +}
Aren't you picking up stale state by using read-modify-write here?
> +
> +static int amd_pmu_ibs_config(struct perf_event *event)
> +{
> + int type;
> +
> + if (!x86_pmu.ibs)
> + return -ENODEV;
> +
> + if (event->hw.sample_period)
> + /*
> + * The usage of the sample period attribute to
> + * calculate the IBS max count value is not yet
> + * supported, the max count must be in the raw config
> + * value.
> + */
> + return -ENOSYS;
> +
What is the problem with directly using the period here and rejecting
any value that is out of range or has the bottom 4 bits set?
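Something along these lines (a sketch only; it assumes MaxCnt is the
low 16 bits of the ctl register and is programmed in units of 16, so
the bottom 4 bits of a period are not representable):

	if (event->hw.sample_period) {
		if (event->hw.sample_period & 0xf)
			return -EINVAL;	/* bottom 4 bits set */
		if ((event->hw.sample_period >> 4) > 0xffff)
			return -EINVAL;	/* out of range */
		event->hw.config |= event->hw.sample_period >> 4;
	}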
> + if (event->attr.type != PERF_TYPE_RAW)
> + /* only raw sample types are supported */
> + return -EINVAL;
> +
> + type = get_model_spec_type(event->attr.config);
> + switch (type) {
> + case MODEL_SPEC_TYPE_IBS_FETCH:
> + event->hw.config = IBS_FETCH_CONFIG_MASK & event->attr.config;
> + event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_FETCH;
> + /*
> + * dirty hack, needed for __x86_pmu_enable_event(), we
> + * should better change event->hw.config_base into
> + * event->hw.config_msr that already includes the index
> + */
> + event->hw.config_base = MSR_AMD64_IBSFETCHCTL - event->hw.idx;
> + break;
> + case MODEL_SPEC_TYPE_IBS_OP:
> + event->hw.config = IBS_OP_CONFIG_MASK & event->attr.config;
> + event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_OP;
> + event->hw.config_base = MSR_AMD64_IBSOPCTL - event->hw.idx;
> + break;
IBSOP.cnt_ctl is only available from RevC on; this needs to be checked and rejected on older revisions.
> + default:
> + return -ENODEV;
> + }
> +
> + return 0;
> +}
> +
> +static inline void __amd_pmu_enable_ibs_event(struct hw_perf_event *hwc)
> +{
> + if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_FETCH)
> + __x86_pmu_enable_event(hwc, IBS_FETCH_ENABLE);
> + else if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_OP)
> + __x86_pmu_enable_event(hwc, IBS_OP_ENABLE);
> +}
> +
> +static void amd_pmu_disable_all(void)
> +{
> + x86_pmu_disable_all();
> + amd_pmu_disable_ibs();
> +}
> +
> +static void amd_pmu_enable_all(int added)
> +{
> + x86_pmu_enable_all(added);
> + amd_pmu_enable_ibs();
> +}
> +
> +static void amd_pmu_enable_event(struct perf_event *event)
> +{
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> + struct hw_perf_event *hwc = &event->hw;
> +
> + if (!cpuc->enabled)
> + return;
> +
> + if (hwc->idx < X86_PMC_IDX_SPECIAL)
> + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
> + else
> + __amd_pmu_enable_ibs_event(hwc);
> +}
> +
> static u64 amd_pmu_event_map(int hw_event)
> {
> return amd_perfmon_event_map[hw_event];
> @@ -200,7 +316,12 @@ static u64 amd_pmu_event_map(int hw_event)
>
> static int amd_pmu_hw_config(struct perf_event *event)
> {
> - int ret = x86_pmu_hw_config(event);
> + int ret;
> +
> + if (event->attr.model_spec)
> + return amd_pmu_ibs_config(event);
> +
> + ret = x86_pmu_hw_config(event);
>
> if (ret)
> return ret;
> @@ -214,6 +335,23 @@ static int amd_pmu_hw_config(struct perf_event *event)
> }
>
> /*
> + * AMD64 events - list of special events (IBS)
> + */
> +static struct event_constraint amd_event_constraints[] =
> +{
> + /*
> + * The value for the weight of these constraints is higher
> +	 * than in the unconstrained case to process ibs after the
> + * generic counters in x86_schedule_events().
> + */
> + __EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_FETCH, 0,
> + AMD64_NUM_COUNTERS + 1),
> + __EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_OP, 0,
> + AMD64_NUM_COUNTERS + 1),
> + EVENT_CONSTRAINT_END
> +};
> +
I think you could define EVENT_IBS_CONSTRAINT() and shorten
the definitions here. You could pass FETCH or OP and the macro
would do the bit shifting. This is how it's done for fixed counters on Intel.
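For illustration, such a helper could look roughly like the sketch below; it token-pastes FETCH or OP into the index name, in the spirit of the Intel fixed-counter macros, and the updated patch further down ends up with the very similar AMD_IBS_EVENT_CONSTRAINT(idx):
#define EVENT_IBS_CONSTRAINT(type)					\
	__EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_##type,	\
			   0, AMD64_NUM_COUNTERS + 1)
/* in the constraint table: EVENT_IBS_CONSTRAINT(FETCH), EVENT_IBS_CONSTRAINT(OP), */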
On 15.04.10 15:11:30, Robert Richter wrote:
> On 15.04.10 14:57:35, Peter Zijlstra wrote:
> > > +/* uninitialize the APIC for the IBS interrupts if needed */
> > > +static void clear_ibs_nmi(void)
> > > +{
> > > + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
> > > +}
> >
> >
> > That on_each_cpu() looks wonky, why isn't this in the hotplug hooks?
>
> Right, it should be there. Will update this patch.
Peter,
please see my updated version below.
-Robert
---
>From bcb2c4aec6bf09fc7c05f31dde9dacd57b9c679c Mon Sep 17 00:00:00 2001
From: Robert Richter <[email protected]>
Date: Mon, 19 Apr 2010 15:32:37 +0200
Subject: [PATCH] perf, x86: setup NMI handler for IBS
This implements the perf nmi handler for ibs interrupts. The code was
copied from oprofile and should be merged with it at some point.
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 5 ++
arch/x86/kernel/cpu/perf_event_amd.c | 85 ++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 940107f..0ad8c45 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -516,6 +516,8 @@ static int x86_pmu_hw_config(struct perf_event *event)
return x86_setup_perfctr(event);
}
+static inline void init_ibs_nmi(void);
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -537,6 +539,8 @@ static int __hw_perf_event_init(struct perf_event *event)
if (err)
release_pmc_hardware();
}
+ if (!err)
+ init_ibs_nmi();
}
if (!err)
atomic_inc(&active_events);
@@ -1381,6 +1385,7 @@ static void __init pmu_check_apic(void)
return;
x86_pmu.apic = 0;
+ x86_pmu.ibs = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 246304d..a6ce6f8 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,5 +1,7 @@
#ifdef CONFIG_CPU_SUP_AMD
+#include <linux/pci.h>
+
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
static __initconst const u64 amd_hw_cache_event_ids
@@ -106,6 +108,85 @@ static const u64 amd_perfmon_event_map[] =
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
};
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* IBS - apic initialization, taken from oprofile, should be unified */
+
+/*
+ * Currently there is no early pci ecs access implemented, so this
+ * can't be put into amd_pmu_init(). For now we initialize it in
+ * __hw_perf_event_init().
+ */
+
+static int __init_ibs_nmi(void)
+{
+#define IBSCTL_LVTOFFSETVAL (1 << 8)
+#define IBSCTL 0x1cc
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+ u8 ibs_eilvt_off;
+
+ if (!x86_pmu.ibs)
+ return 0;
+
+ ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 0);
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVTOFFSETVAL);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ pci_dev_put(cpu_cfg);
+			printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
+			       "IBSCTL = 0x%08x\n", value);
+ return 1;
+ }
+ } while (1);
+
+ if (!nodes) {
+		printk(KERN_DEBUG "No CPU node configured for IBS\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline void init_ibs_nmi(void)
+{
+ if (__init_ibs_nmi())
+ /* something went wrong, disable ibs */
+ x86_pmu.ibs = 0;
+}
+
+static inline void apic_init_ibs(void)
+{
+ if (x86_pmu.ibs)
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static inline void apic_clear_ibs(void)
+{
+ if (x86_pmu.ibs)
+ setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+}
+
+#else
+
+static inline void init_ibs_nmi(void) { }
+static inline void apic_init_ibs(void) { }
+static inline void apic_clear_ibs(void) { }
+
+#endif
+
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
@@ -343,6 +424,8 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->refcnt++;
raw_spin_unlock(&amd_nb_lock);
+
+ apic_init_ibs();
}
static void amd_pmu_cpu_dead(int cpu)
@@ -366,6 +449,8 @@ static void amd_pmu_cpu_dead(int cpu)
}
raw_spin_unlock(&amd_nb_lock);
+
+ apic_clear_ibs();
}
static __initconst const struct x86_pmu amd_pmu = {
--
1.7.0.3
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 19.04.10 15:46:29, Stephane Eranian wrote:
> > +static inline void amd_pmu_disable_ibs(void)
> > +{
> > +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> > +	u64 val;
> > +
> > +	if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
> > +		rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
> > +		val &= ~IBS_FETCH_ENABLE;
> > +		wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
> > +	}
> > +	if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
> > +		rdmsrl(MSR_AMD64_IBSOPCTL, val);
> > +		val &= ~IBS_OP_ENABLE;
> > +		wrmsrl(MSR_AMD64_IBSOPCTL, val);
> > +	}
> > +}
> > +
> > +static inline void amd_pmu_enable_ibs(void)
> > +{
> > +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> > +	u64 val;
> > +
> > +	if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
> > +		rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
> > +		val |= IBS_FETCH_ENABLE;
> > +		wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
> > +	}
> > +	if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
> > +		rdmsrl(MSR_AMD64_IBSOPCTL, val);
> > +		val |= IBS_OP_ENABLE;
> > +		wrmsrl(MSR_AMD64_IBSOPCTL, val);
> > +	}
> > +}
>
> Aren't you picking up stale state by using read-modify-write here?
Hmm, there is a reason for this implementation. Both functions are
only used in .enable_all() and .disable_all(), which were originally
used in atomic sections to disable NMIs during event scheduling and
setup. For this brief switch-off of the pmu the register state is not
saved. On Intel this is implemented by only writing to
MSR_CORE_PERF_GLOBAL_CTRL. The proper way to enable events is to use
amd_pmu_enable_event(), which is x86_pmu.enable(event).
Looking at the latest sources this might have changed a little in the
meantime, and we have to check that the functions above are not used
to enable events. So I am not really sure whether the register can be
set up wrongly. But if this happens, it only affects the first sample
after enabling or re-enabling the whole pmu, since the interrupt
handler uses __x86_pmu_enable_event() to re-enable ibs. Thus I would
prefer the implementation above and instead reimplement the nmi
handling (see below).
Also, we should generally avoid a global pmu disable, since it is
expensive and the pmu state may not be restored properly for some
events on some hw revisions. But the code must then be atomic.
An alternative solution for disabling NMIs on AMD could be to mask the
nmi by writing to the lapic instead of the counter msrs. This could be
more efficient, and the pmu would go on sampling without the need to
restore its state. Or this way: the interrupt handler does not handle
events and only clears the reason if some 'atomic' flag is set?
IMHO the implementation of x86_pmu_enable_all() and
x86_pmu_disable_all() is not accurate either, since the state is not
stored when all counters are disabled and they are then re-enabled
with the init value. On Intel this has no impact since the global ctrl
msr is used, but in all other cases x86_pmu_enable_all() does not
restore the previous counter state.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
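As a side note on the read-modify-write question above, a variant that rebuilds the control register from the cached event configuration instead of re-reading the MSR might look roughly like the sketch below (FETCH side only; it assumes cpuc->events[] is populated for the special IBS indexes just as for the ordinary counters, which is not part of the posted patches):
static inline void amd_pmu_enable_ibs_fetch(struct cpu_hw_events *cpuc)
{
	struct perf_event *event;

	if (!test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask))
		return;

	/* no rdmsrl(): start from the configured value, not from hw state */
	event = cpuc->events[X86_PMC_IDX_SPECIAL_IBS_FETCH];
	wrmsrl(MSR_AMD64_IBSFETCHCTL, event->hw.config | IBS_FETCH_ENABLE);
}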
On 19.04.10 14:19:57, Stephane Eranian wrote:
> > diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> > index bc473ac..a7e4aa5 100644
> > --- a/arch/x86/include/asm/msr-index.h
> > +++ b/arch/x86/include/asm/msr-index.h
> > @@ -113,6 +113,7 @@
> >  #define MSR_AMD64_IBSFETCHCTL		0xc0011030
> >  #define MSR_AMD64_IBSFETCHLINAD		0xc0011031
> >  #define MSR_AMD64_IBSFETCHPHYSAD	0xc0011032
> > +#define MSR_AMD64_IBSFETCH_SIZE		3
>
> I would use COUNT instead of size given the unit is registers not bytes.
I will change the naming for all macros.
> > +static int amd_pmu_check_ibs(int idx, unsigned int msr, u64 valid,
> > +			     u64 reenable, int size, struct pt_regs *iregs)
> > +{
> > +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> > +	struct perf_event *event = cpuc->events[idx];
> > +	struct perf_sample_data data;
> > +	struct perf_raw_record raw;
> > +	struct pt_regs regs;
> > +	u64 buffer[MSR_AMD64_IBS_SIZE_MAX];
> > +	u64 *buf = buffer;
> > +	int i;
> > +
> > +	if (!test_bit(idx, cpuc->active_mask))
> > +		return 0;
> > +
> > +	rdmsrl(msr++, *buf);
> > +	if (!(*buf++ & valid))
> > +		return 0;
> > +
> > +	perf_sample_data_init(&data, 0);
> > +	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> > +		for (i = 1; i < size; i++)
> > +			rdmsrl(msr++, *buf++);
> > +		raw.size = sizeof(u64) * size;
> > +		raw.data = buffer;
> > +		data.raw = &raw;
> > +	}
> > +
>
> Need to add the padding: raw.size = sizeof(u64) * size + sizeof(u32);
Yes, this triggers a warning. I will change it and add 4 padding bytes
at the start of the buffer (so that it is also 8 byte aligned).
> > +static int amd_pmu_handle_irq(struct pt_regs *regs)
> > +{
> > +	int handled, handled2;
> > +
> > +	handled = x86_pmu_handle_irq(regs);
> > +
> > +	if (!x86_pmu.ibs)
> > +		return handled;
> > +
> > +	handled2 = 0;
> > +	handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_FETCH,
> > +				      MSR_AMD64_IBSFETCHCTL, IBS_FETCH_VAL,
> > +				      IBS_FETCH_ENABLE, MSR_AMD64_IBSFETCH_SIZE,
> > +				      regs);
> > +	handled2 += amd_pmu_check_ibs(X86_PMC_IDX_SPECIAL_IBS_OP,
> > +				      MSR_AMD64_IBSOPCTL, IBS_OP_VAL,
> > +				      IBS_OP_ENABLE, MSR_AMD64_IBSOP_SIZE,
> > +				      regs);
> > +
> > +	if (handled2)
> > +		inc_irq_stat(apic_perf_irqs);
> > +
>
> If you have both regular counter intr + IBS you will double-count
> apic_perf_irqs.
> I would do: if (handled2 && !handled) inc_irq_stat().
>
True, will change this.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 19.04.10 15:46:29, Stephane Eranian wrote:
> > +	if (event->hw.sample_period)
> > +		/*
> > +		 * The usage of the sample period attribute to
> > +		 * calculate the IBS max count value is not yet
> > +		 * supported, the max count must be in the raw config
> > +		 * value.
> > +		 */
> > +		return -ENOSYS;
> > +
> What is the problem with directly using the period here, rejecting
> any value that is off range or with bottom 4 bits set?
Yes, I will create an updated version of this patch.
> > +	if (event->attr.type != PERF_TYPE_RAW)
> > +		/* only raw sample types are supported */
> > +		return -EINVAL;
> > +
> > +	type = get_model_spec_type(event->attr.config);
> > +	switch (type) {
> > +	case MODEL_SPEC_TYPE_IBS_FETCH:
> > +		event->hw.config = IBS_FETCH_CONFIG_MASK & event->attr.config;
> > +		event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_FETCH;
> > +		/*
> > +		 * dirty hack, needed for __x86_pmu_enable_event(), we
> > +		 * should better change event->hw.config_base into
> > +		 * event->hw.config_msr that already includes the index
> > +		 */
> > +		event->hw.config_base = MSR_AMD64_IBSFETCHCTL - event->hw.idx;
> > +		break;
> > +	case MODEL_SPEC_TYPE_IBS_OP:
> > +		event->hw.config = IBS_OP_CONFIG_MASK & event->attr.config;
> > +		event->hw.idx = X86_PMC_IDX_SPECIAL_IBS_OP;
> > +		event->hw.config_base = MSR_AMD64_IBSOPCTL - event->hw.idx;
> > +		break;
>
> IBSOP.cnt_ctl only available from RevC, need to check and reject if older.
Right, for this patch I will modify IBS_OP_CONFIG_MASK to RevB only
bits, later I will add cpuid detection and pmu revision checks in a
separate patch.
> > +static struct event_constraint amd_event_constraints[] =
> > +{
> > +	/*
> > +	 * The value for the weight of these constraints is higher
> > +	 * than in the unconstrained case to process ibs after the
> > +	 * generic counters in x86_schedule_events().
> > +	 */
> > +	__EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_FETCH, 0,
> > +			   AMD64_NUM_COUNTERS + 1),
> > +	__EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_SPECIAL_IBS_OP, 0,
> > +			   AMD64_NUM_COUNTERS + 1),
> > +	EVENT_CONSTRAINT_END
> > +};
> > +
> I think you could define EVENT_IBS_CONSTRAINT() and shorten
> the definitions here. You could pass FETCH or OP and the macro
> would do the bit shifting. This is how it's done for fixed counters on Intel.
Ok, I will update this.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 20.04.10 18:05:57, Robert Richter wrote:
> > What is the problem with directly using the period here, rejecting
> > any value that is off range or with bottom 4 bits set?
>
> Yes, I will create an updated version of this patch.
Stephane, do you think having the lower 4 bits set is worth an -EINVAL?
I would rather ignore them, since that accuracy is not really necessary
for a sample period in the range of, let's say, 100000 cycles. Otherwise
this will make the setup of ibs much more complicated. The check could
be moved to userland and generate a warning or so.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On Wed, Apr 21, 2010 at 10:47 AM, Robert Richter <[email protected]> wrote:
> On 20.04.10 18:05:57, Robert Richter wrote:
>> > What is the problem with directly using the period here, rejecting
>> > any value that is off range or with bottom 4 bits set?
>>
>> Yes, I will create an updated version of this patch.
>
> Stephane, do you think having the lower 4 bits set is worth an -EINVAL?
> I would rather ignore them, since that accuracy is not really necessary
> for a sample period in the range of, let's say, 100000 cycles. Otherwise
> this will make the setup of ibs much more complicated. The check could
> be moved to userland and generate a warning or so.
Explain why you think it would be more complicated? If I recall, there
is already a function to validate the attrs: amd_pmu_hw_config(). But
maybe you are talking about userland setup.
Here is one argument why this might be important. Some people like to
know exactly the sampling period because they use a particular value,
like a prime number. Chopping off the bottom 4 bits could break this
logic silently.
On 21.04.10 11:02:42, Stephane Eranian wrote:
> On Wed, Apr 21, 2010 at 10:47 AM, Robert Richter <[email protected]> wrote:
> > On 20.04.10 18:05:57, Robert Richter wrote:
> >> > What is the problem with directly using the period here, rejecting
> >> > any value that is off range or with bottom 4 bits set?
> >>
> >> Yes, I will create an updated version of this patch.
> >
> > Stephane, do you think having the lower 4 bits set is worth an -EINVAL?
> > I would rather ignore them, since that accuracy is not really necessary
> > for a sample period in the range of, let's say, 100000 cycles. Otherwise
> > this will make the setup of ibs much more complicated. The check could
> > be moved to userland and generate a warning or so.
>
> Explain why you think it would be more complicated? If I recall, there
> is already a function to validate the attrs: amd_pmu_hw_config(). But
> maybe you are talking about userland setup.
>
> Here is one argument why this might be important. Some people like to
> know exactly the sampling period because they use a particular value,
> like a prime number. Chopping off the bottom 4 bits could break this
> logic silently.
Ok, I see your point. I was thinking of some decimal value used to set
the sample period. You would then have to check whether the lower 4
bits are set by doing a dec to hex conversion and so on. But I realized
that multiples of 10000 can be divided by 16 and thus all lower 4 bits
are always cleared.
So, I will check the lower 4 bits and return an error if they are set.
Thanks,
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
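To make the agreed-upon check concrete, a userspace helper validating a candidate sample period might look like the sketch below. The 16-bit MaxCnt width is an assumption for illustration; the authoritative limits are the kernel's IBS_FETCH_MAX_CNT/IBS_OP_MAX_CNT masks and the BKDG:
#include <stdint.h>

#define IBS_MAX_CNT_BITS	16	/* assumed width of the MaxCnt field */

/* returns non-zero if 'period' can be programmed as an IBS max count */
static int ibs_period_valid(uint64_t period)
{
	if (period == 0)
		return 0;
	if (period & 0xfULL)			/* lower 4 bits must be clear */
		return 0;
	if ((period >> 4) >> IBS_MAX_CNT_BITS)	/* does not fit into MaxCnt */
		return 0;
	return 1;
}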
On 21.04.10 11:21:45, Robert Richter wrote:
> On 21.04.10 11:02:42, Stephane Eranian wrote:
> > On Wed, Apr 21, 2010 at 10:47 AM, Robert Richter <[email protected]> wrote:
> > > On 20.04.10 18:05:57, Robert Richter wrote:
> > >> > What is the problem with directly using the period here, rejecting
> > >> > any value that is off range or with bottom 4 bits set?
> > >>
> > >> Yes, I will create an updated version of this patch.
Stephane,
please see my updated version below.
-Robert
--
>From 79738167383838eb93f0388f8b7983fe822b36c4 Mon Sep 17 00:00:00 2001
From: Robert Richter <[email protected]>
Date: Wed, 21 Apr 2010 12:43:20 +0200
Subject: [PATCH] perf, x86: implement AMD IBS event configuration
This patch implements IBS for perf_event. It extends the AMD pmu to
handle model specific IBS events.
With the attr.model_spec bit set we can set up hardware events other
than generic performance counters. A special PMU 64 bit config value
can be passed through the perf_event interface. The concept of PMU
model-specific arguments was already practiced in Perfmon2. The type
of event (8 bits) is determined from the config value too, bits 32-39
are reserved for this.
There are two types of IBS events for instruction fetch (IBS_FETCH)
and instruction execution (IBS_OP). Both events are implemented as
special x86 events. The event allocation is implemented by using
special event constraints for ibs. This way, the generic event
scheduler can be used to allocate ibs events.
Except for the sample period IBS can only be set up with raw (model
specific) config values and raw data samples. The event attributes for
the syscall should be programmed like this (IBS_FETCH):
memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_RAW;
attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
attr.config = IBS_FETCH_CONFIG_DEFAULT;
attr.config |=
((unsigned long long)MODEL_SPEC_TYPE_IBS_FETCH << 32)
& MODEL_SPEC_TYPE_MASK;
attr.model_spec = 1;
The whole ibs example will be part of libpfm4.
The ibs interrupt handler is implemented in the next patch.
Cc: Stephane Eranian <[email protected]>
Signed-off-by: Robert Richter <[email protected]>
---
arch/x86/include/asm/perf_event.h | 11 ++-
arch/x86/kernel/cpu/perf_event.c | 23 ++++
arch/x86/kernel/cpu/perf_event_amd.c | 189 +++++++++++++++++++++++++++++++++-
3 files changed, 214 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7e51c75..dace4e2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -44,14 +44,15 @@
#define AMD64_RAW_EVENT_MASK \
(X86_RAW_EVENT_MASK | \
AMD64_EVENTSEL_EVENT)
+#define AMD64_NUM_COUNTERS 4
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
/*
* Intel "Architectural Performance Monitoring" CPUID
@@ -102,13 +103,15 @@ union cpuid10_edx {
#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
/*
- * We model BTS tracing as another fixed-mode PMC.
+ * Masks for special PMU features
*
* We choose a value in the middle of the fixed event range, since lower
* values are used by actual fixed events and higher values are used
* to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
*/
#define X86_PMC_IDX_SPECIAL_BTS (X86_PMC_IDX_SPECIAL + 0)
+#define X86_PMC_IDX_SPECIAL_IBS_FETCH (X86_PMC_IDX_SPECIAL + 1)
+#define X86_PMC_IDX_SPECIAL_IBS_OP (X86_PMC_IDX_SPECIAL + 2)
/* IbsFetchCtl bits/masks */
#define IBS_FETCH_RAND_EN (1ULL<<57)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0ad8c45..cc0fd61 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -146,6 +146,9 @@ struct cpu_hw_events {
#define INTEL_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+#define AMD_IBS_EVENT_CONSTRAINT(idx) \
+ __EVENT_CONSTRAINT(0, 1ULL << (idx), 0, AMD64_NUM_COUNTERS + 1)
+
/*
* Constraint on the Event code + UMask + fixed-mask
*
@@ -184,6 +187,26 @@ union perf_capabilities {
};
/*
+ * Model specific hardware events
+ *
+ * With the attr.model_spec bit set we can set up hardware events
+ * other than generic performance counters. A special PMU 64 bit
+ * config value can be passed through the perf_event interface. The
+ * concept of PMU model-specific arguments was already practiced in
+ * Perfmon2. The type of event (8 bits) is determined from the config
+ * value too, bits 32-39 are reserved for this.
+ */
+#define MODEL_SPEC_TYPE_IBS_FETCH 0
+#define MODEL_SPEC_TYPE_IBS_OP 1
+
+#define MODEL_SPEC_TYPE_MASK (0xFFULL << 32)
+
+static inline u8 get_model_spec_type(u64 config)
+{
+ return (config & MODEL_SPEC_TYPE_MASK) >> 32;
+}
+
+/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index a6ce6f8..0093a52 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,6 +2,31 @@
#include <linux/pci.h>
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+struct ibs_map {
+ int idx;
+ u64 cnt_mask;
+ u64 valid_mask;
+ unsigned long msr;
+};
+
+static struct ibs_map ibs_map[] = {
+ [MODEL_SPEC_TYPE_IBS_FETCH] = {
+ .idx = X86_PMC_IDX_SPECIAL_IBS_FETCH,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .valid_mask = IBS_FETCH_CONFIG_MASK,
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ },
+ [MODEL_SPEC_TYPE_IBS_OP] = {
+ .idx = X86_PMC_IDX_SPECIAL_IBS_OP,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .valid_mask = IBS_OP_CONFIG_MASK,
+ .msr = MSR_AMD64_IBSOPCTL,
+ },
+};
+
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
static __initconst const u64 amd_hw_cache_event_ids
@@ -187,6 +212,127 @@ static inline void apic_clear_ibs(void) { }
#endif
+static inline void amd_pmu_disable_ibs(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val;
+
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ val &= ~IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSOPCTL, val);
+ val &= ~IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static inline void amd_pmu_enable_ibs(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ u64 val;
+
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_FETCH, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ val |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+ if (test_bit(X86_PMC_IDX_SPECIAL_IBS_OP, cpuc->active_mask)) {
+ rdmsrl(MSR_AMD64_IBSOPCTL, val);
+ val |= IBS_OP_ENABLE;
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static int amd_pmu_ibs_config(struct perf_event *event)
+{
+ u8 type;
+ u64 max_cnt, config;
+ struct ibs_map *map;
+
+ if (!x86_pmu.ibs)
+ return -ENODEV;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ /* only raw sample types are supported */
+ return -EINVAL;
+
+ type = get_model_spec_type(event->attr.config);
+ if (type >= ARRAY_SIZE(ibs_map))
+ return -ENODEV;
+
+ map = &ibs_map[type];
+ config = event->attr.config;
+ if (event->hw.sample_period) {
+ if (config & map->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (event->hw.sample_period & 0x0f)
+ /* lower 4 bits can not be set in ibs max cnt */
+ return -EINVAL;
+ max_cnt = event->hw.sample_period >> 4;
+ if (max_cnt & ~map->cnt_mask)
+ /* out of range */
+ return -EINVAL;
+ config |= max_cnt;
+ } else {
+ max_cnt = event->attr.config & map->cnt_mask;
+ }
+
+ if (!max_cnt)
+ return -EINVAL;
+
+ if (config & ~map->valid_mask)
+ return -EINVAL;
+
+ event->hw.config = config;
+ event->hw.idx = map->idx;
+ /*
+ * dirty hack, needed for __x86_pmu_enable_event(), we
+ * should better change event->hw.config_base into
+ * event->hw.config_msr that already includes the index
+ */
+ event->hw.config_base = map->msr - event->hw.idx;
+
+ return 0;
+}
+
+static inline void __amd_pmu_enable_ibs_event(struct hw_perf_event *hwc)
+{
+ if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_FETCH)
+ __x86_pmu_enable_event(hwc, IBS_FETCH_ENABLE);
+ else if (hwc->idx == X86_PMC_IDX_SPECIAL_IBS_OP)
+ __x86_pmu_enable_event(hwc, IBS_OP_ENABLE);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ x86_pmu_disable_all();
+ amd_pmu_disable_ibs();
+}
+
+static void amd_pmu_enable_all(int added)
+{
+ x86_pmu_enable_all(added);
+ amd_pmu_enable_ibs();
+}
+
+static void amd_pmu_enable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!cpuc->enabled)
+ return;
+
+ if (hwc->idx < X86_PMC_IDX_SPECIAL)
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ else
+ __amd_pmu_enable_ibs_event(hwc);
+}
+
static u64 amd_pmu_event_map(int hw_event)
{
return amd_perfmon_event_map[hw_event];
@@ -194,7 +340,12 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
+ int ret;
+
+ if (event->attr.model_spec)
+ return amd_pmu_ibs_config(event);
+
+ ret = x86_pmu_hw_config(event);
if (ret)
return ret;
@@ -208,6 +359,21 @@ static int amd_pmu_hw_config(struct perf_event *event)
}
/*
+ * AMD64 events - list of special events (IBS)
+ */
+static struct event_constraint amd_event_constraints[] =
+{
+ /*
+ * The value for the weight of these constraints is higher
+	 * than in the unconstrained case to process ibs after the
+ * generic counters in x86_schedule_events().
+ */
+ AMD_IBS_EVENT_CONSTRAINT(X86_PMC_IDX_SPECIAL_IBS_FETCH),
+ AMD_IBS_EVENT_CONSTRAINT(X86_PMC_IDX_SPECIAL_IBS_OP),
+ EVENT_CONSTRAINT_END
+};
+
+/*
* AMD64 events are detected based on their event codes.
*/
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -293,10 +459,23 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old = NULL;
+ struct event_constraint *c;
int max = x86_pmu.num_counters;
int i, j, k = -1;
/*
+ * The index of special events must be set in
+ * hw_perf_event_init(). The constraints are used for resource
+ * allocation by the event scheduler.
+ */
+ if (hwc->idx >= X86_PMC_IDX_SPECIAL) {
+ for_each_event_constraint(c, amd_event_constraints) {
+ if (test_bit(hwc->idx, c->idxmsk))
+ return c;
+ }
+ }
+
+ /*
* if not NB event or no NB, then no constraints
*/
if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
@@ -456,9 +635,9 @@ static void amd_pmu_cpu_dead(int cpu)
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_event,
.disable = x86_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
@@ -466,7 +645,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.perfctr = MSR_K7_PERFCTR0,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = 4,
+ .num_counters = AMD64_NUM_COUNTERS,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
--
1.7.0.3
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
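For context, the attribute snippet in the commit message above expands into roughly the following userspace sketch. It only builds against headers carrying the model_spec bit and MODEL_SPEC_TYPE_* encoding introduced by this series, and IBS_FETCH_CONFIG_DEFAULT is a made-up placeholder for a raw IbsFetchCtl value:
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#define MODEL_SPEC_TYPE_IBS_FETCH	0
#define MODEL_SPEC_TYPE_MASK		(0xFFULL << 32)
#define IBS_FETCH_CONFIG_DEFAULT	0x1000ULL	/* placeholder max count */

/* open one system-wide IBS fetch event on the given cpu (sketch only) */
static int open_ibs_fetch(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size        = sizeof(attr);
	attr.type        = PERF_TYPE_RAW;
	attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
	attr.config      = IBS_FETCH_CONFIG_DEFAULT;
	attr.config     |= ((unsigned long long)MODEL_SPEC_TYPE_IBS_FETCH << 32)
			   & MODEL_SPEC_TYPE_MASK;
	attr.model_spec  = 1;	/* attribute bit added by this series */

	/* pid = -1, group_fd = -1: per-cpu event, no group */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}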
Robert,
Some more comments about model_spec.
> Except for the sample period IBS can only be set up with raw (model
> specific) config values and raw data samples. The event attributes for
> the syscall should be programmed like this (IBS_FETCH):
>
> memset(&attr, 0, sizeof(attr));
> attr.type = PERF_TYPE_RAW;
> attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW;
> attr.config = IBS_FETCH_CONFIG_DEFAULT;
> attr.config |=
> ((unsigned long long)MODEL_SPEC_TYPE_IBS_FETCH << 32)
> & MODEL_SPEC_TYPE_MASK;
> attr.model_spec = 1;
>
Why do you need model_spec, in addition to your special encoding?
> /*
> + * Model specific hardware events
> + *
> + * With the attr.model_spec bit set we can set up hardware events
> + * other than generic performance counters. A special PMU 64 bit
> + * config value can be passed through the perf_event interface. The
> + * concept of PMU model-specific arguments was already practiced in
> + * Perfmon2. The type of event (8 bits) is determined from the config
> + * value too, bits 32-39 are reserved for this.
> + */
Isn't the config field big enough to encode all the information you need?
In the kernel, you could check bit 32-39 and based on host CPU determine
whether it refers to IBS or is a bogus value. I am trying to figure out what
model_spec buys you. I believe RAW does not mean the final value as
accepted by HW but a value that must be interpreted by the model-specific
code to eventually come up with a raw HW value. In the current code, the
RAW value is never passed as is, it is assembled from various bits and
pieces incl. attr.config of course.
On Thu, 2010-04-15 at 17:16 +0200, Robert Richter wrote:
> On 15.04.10 09:44:21, Peter Zijlstra wrote:
> > On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
> > > This patch series introduces model specific events and impments AMD
> > > IBS (Instruction Based Sampling) for perf_events.
> >
> > I would much rather it uses the ->precise thing PEBS also uses,
> > otherwise we keep growing funny arch extensions and end up with a
> > totally fragmented trainwreck of an ABI.
>
> I agree that an exiting flag could be reused. But the naming 'precise'
> could be misleading. Maybe we rename it to 'model_spec' or something
> else that underlines the idea of having model specific setups.
Right, so I really hate PERF_SAMPLE_RAW, and I'm considering simply
removing that for PEBS as well, its just too ugly. If we want the
register set we need to work on getting PERF_SAMPLE_REGS in a sensible
shape.
As to the meaning for ->precise, its meant to convey the counters are
not affected by skid and the like, I thought IBS provided exact IPs as
well (/me should re-read the IBS docs).
The thing with something like ->model_spec and PERF_SAMPLE_RAW is that
it doesn't provide a clear model, the user doesn't know what to expect
of it, it could be anything.
We want the ABI to express clear concepts, and things like lets bypass
everything and just dump stuff out to userspace really are to be avoided
at all costs.
Sadly IBS seems to be an utter trainwreck in the concept department (I'm
still struggling how to make a sensible interpretation of the data it
gathers).
The thing I absolutely want to avoid is the ABI becoming a fragmented
trainwreck like oprofile is.
Also not using sample_period for the sample period is of course utterly
unacceptable.
On Wed, Apr 21, 2010 at 2:11 PM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2010-04-15 at 17:16 +0200, Robert Richter wrote:
>> On 15.04.10 09:44:21, Peter Zijlstra wrote:
>> > On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
>> > > This patch series introduces model specific events and impments AMD
>> > > IBS (Instruction Based Sampling) for perf_events.
>> >
>> > I would much rather it uses the ->precise thing PEBS also uses,
>> > otherwise we keep growing funny arch extensions and end up with a
>> > totally fragmented trainwreck of an ABI.
>>
>> I agree that an exiting flag could be reused. But the naming 'precise'
>> could be misleading. Maybe we rename it to 'model_spec' or something
>> else that underlines the idea of having model specific setups.
>
> Right, so I really hate PERF_SAMPLE_RAW, and I'm considering simply
> removing that for PEBS as well, its just too ugly. If we want the
> register set we need to work on getting PERF_SAMPLE_REGS in a sensible
> shape.
>
I wonder why SAMPLE_RAW went in in the first place, then. What was the
justification for it, traces?
Okay, so you're suggesting everything is exposed via PERF_SAMPLE_REGS.
PEBS does capture machine state which is easily mapped onto
PERF_SAMPLE_REGS. Well, that's until you look at PEBS-LL on Nehalem,
where it captures latencies and data addresses.
IBS does not capture machine state in the sense of general purpose
registers. IBS captures micro-architectural info about an instruction
and stores this into a handful of IBS registers. Those could be
funneled through PERF_SAMPLE_REGS as well, I believe. But that means
PERF_SAMPLE_REGS would need some configuration bitmask to name the
registers of interest, e.g. EAX, EDX, IBSOP_DATA, IBSOP_PHYSAD, and
so on.
As you pointed out a long time ago, IBS returns too much information
to be abstracted easily unless you're willing to drop part of the
information it returns, e.g., concentrate on cache miss info only. But
this goes back to a point I made early on: there are many usage
models, and different metrics need different data. You don't want to
prevent any usage model.
> As to the meaning for ->precise, its meant to convey the counters are
> not affected by skid and the like, I thought IBS provided exact IPs as
> well (/me should re-read the IBS docs).
>
By construction it is given that it tracks an instruction. Thus, the
IP is always that of the monitored instruction: no skid. The
difference, though, is that it is not associated with an actual
counter+event. IBS keeps its own counter. You can easily create a
pseudo event (which is what Robert does in his patch).
> The thing with something like ->model_spec and PERF_SAMPLE_RAW is that
> it doesn't provide a clear model, the user doesn't know what to expect
> of it, it could be anything.
>
I think we can avoid model_spec.
> We want the ABI to express clear concepts, and things like lets bypass
> everything and just dump stuff out to userspace really are to be avoided
> at all costs.
>
> Sadly IBS seems to be an utter trainwreck in the concept department (I'm
> still struggling how to make a sensible interpretation of the data it
> gathers).
>
The concept is simple: track an instruction (uop) as it traverses the
pipeline and gather all sorts of micro-architectural info. When the
instruction retires, interrupt the CPU and deliver the info via
registers. The current implementation is not without problems, but you
can already gather useful info such as instruction latencies and data
cache misses.
Speaking of data cache misses, I believe there may be a way to
abstract sampling on cache misses, i.e., capture where they occur,
that would work with both IBS and PEBS-LL. That may be a good way to
start exposing some of the IBS features.
> Also not using sample_period for the sample period is of course utterly
> unacceptable.
>
That is fixed now.
On 21.04.10 14:11:05, Peter Zijlstra wrote:
> On Thu, 2010-04-15 at 17:16 +0200, Robert Richter wrote:
> > On 15.04.10 09:44:21, Peter Zijlstra wrote:
> > > On Tue, 2010-04-13 at 22:23 +0200, Robert Richter wrote:
> > > > This patch series introduces model specific events and impments AMD
> > > > IBS (Instruction Based Sampling) for perf_events.
> > >
> > > I would much rather it uses the ->precise thing PEBS also uses,
> > > otherwise we keep growing funny arch extensions and end up with a
> > > totally fragmented trainwreck of an ABI.
> >
> > I agree that an exiting flag could be reused. But the naming 'precise'
> > could be misleading. Maybe we rename it to 'model_spec' or something
> > else that underlines the idea of having model specific setups.
>
> Right, so I really hate PERF_SAMPLE_RAW, and I'm considering simply
> removing that for PEBS as well, its just too ugly. If we want the
> register set we need to work on getting PERF_SAMPLE_REGS in a sensible
> shape.
The ABI should provide hw access to all pmu features. As it is not
possible to abstract these features for all models and architectures
in the same way, and a new feature may work completely differently,
the only solution I see is to provide a way to write to pmu registers
and receive sampling data back unfiltered. For IBS, for example, the
data in a sample does not fit into existing generic events. Making IBS
generic also does not help much, since it will not be available on
different models or architectures. Event types that will never be
reused do not need to be generalized; it is better to generalize the
way this kind of event is handled. This is the reason I like the
model_spec/raw_config/raw_sample approach.
> As to the meaning for ->precise, its meant to convey the counters are
> not affected by skid and the like, I thought IBS provided exact IPs as
> well (/me should re-read the IBS docs).
Yes, the real rip is in the sample, but there is much more data than
that. So the rip is only a subset.
> The thing with something like ->model_spec and PERF_SAMPLE_RAW is that
> it doesn't provide a clear model, the user doesn't know what to expect
> of it, it could be anything.
>
> We want the ABI to express clear concepts, and things like lets bypass
> everything and just dump stuff out to userspace really are to be avoided
> at all costs.
Of course, it could be anything. But this is not the intention. If
configs and buffers are close to or exactly the hw interface, then it
is spec'ed and well defined. The user only has to know how to
configure a certain hw feature of a specific model and how to get the
data back. This is how IBS is implemented. Maybe this is the same
thing you have in mind with PERF_SAMPLE_REGS? This interface can then
be reused for a very different feature, and this looks to me like a
clear solution. I do not see alternatives. And even if we process the
samples in the kernel somehow, in the end we pack them and send them
to userspace.
> Sadly IBS seems to be an utter trainwreck in the concept department (I'm
> still struggling how to make a sensible interpretation of the data it
> gathers).
That's the point, there is no generalization of this type of data, but
still it is useful.
-Robert
>
> The thing I absolutely want to avoid is the ABI becoming a fragmented
> trainwreck like oprofile is.
>
> Also not using sample_period for the sample period is of course utterly
> unacceptable.
>
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 21.04.10 13:37:27, Stephane Eranian wrote:
> Why do you need model_spec, in addition to your special encoding?
>
> >  /*
> > + * Model specific hardware events
> > + *
> > + * With the attr.model_spec bit set we can set up hardware events
> > + * other than generic performance counters. A special PMU 64 bit
> > + * config value can be passed through the perf_event interface. The
> > + * concept of PMU model-specific arguments was already practiced in
> > + * Perfmon2. The type of event (8 bits) is determined from the config
> > + * value too, bits 32-39 are reserved for this.
> > + */
> Isn't the config field big enough to encode all the information you need?
> In the kernel, you could check bit 32-39 and based on host CPU determine
> whether it refers to IBS or is a bogus value. I am trying to figure out what
> model_spec buys you. I believe RAW does not mean the final value as
> accepted by HW but a value that must be interpreted by the model-specific
> code to eventually come up with a raw HW value. In the current code, the
> RAW value is never passed as is, it is assembled from various bits and
> pieces incl. attr.config of course.
The raw config value without the model_spec bit set would assume a
config value for a generic x86 counter. We could also reuse an unused
bit in this config value, but what if this bit is used at some point?
Or if it is available on one hw revision but not another? So I found
it much cleaner to introduce this attribute flag, which can be reused
on other architectures. Also, you then have the freedom to implement
model specific generic events without using raw_config. As most pmu
features are set up with a 64 bit config value, I would prefer to also
have the encoding of the model specific event type outside of the
config value. I want to be close to the hw register layout without
shifting bits back and forth. This may also cause trouble in the
future if we need all 64 bits.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On Wed, Apr 21, 2010 at 8:26 PM, Robert Richter <[email protected]> wrote:
> On 21.04.10 15:21:35, Stephane Eranian wrote:
>> Okay, so you're suggesting everything is exposed via PERF_SAMPLE_REGS.
>> PEBS does capture machine state which is easily mapped onto
>> PERF_SAMPLE_REGS. Well, that's until you look at PEBS-LL on Nehalem,
>> where it captures latencies and data addresses.
>>
>> Those could be funneled through PERF_SAMPLE_REGS as well, I believe.
>> But that means PERF_SAMPLE_REGS would need some configuration bitmask
>> to name the registers of interest, e.g. EAX, EDX, IBSOP_DATA,
>> IBSOP_PHYSAD, and so on.
>
> What is the idea of PERF_SAMPLE_REGS? A git grep PERF_SAMPLE_REGS only
> returns a single line in -tip. I know nothing about it.
>
You may want to collect some of the general purpose registers
(interrupted state) in each
sample. This is what you get today in a signal handler (ucontext) for
instance. That may
also be a way to export PEBS samples.
On 21.04.10 15:21:35, Stephane Eranian wrote:
> Okay, so you're suggesting everything is exposed via PERF_SAMPLE_REGS.
> PEBS does capture machine state which is easily mapped onto
> PERF_SAMPLE_REGS. Well, that's until you look at PEBS-LL on Nehalem,
> where it captures latencies and data addresses.
>
> Those could be funneled through PERF_SAMPLE_REGS as well, I believe.
> But that means PERF_SAMPLE_REGS would need some configuration bitmask
> to name the registers of interest, e.g. EAX, EDX, IBSOP_DATA,
> IBSOP_PHYSAD, and so on.
What is the idea of PERF_SAMPLE_REGS? A git grep PERF_SAMPLE_REGS only
returns a single line in -tip. I know nothing about it.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
On 19.04.10 14:19:57, Stephane Eranian wrote:
> > +	perf_sample_data_init(&data, 0);
> > +	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> > +		for (i = 1; i < size; i++)
> > +			rdmsrl(msr++, *buf++);
> > +		raw.size = sizeof(u64) * size;
> > +		raw.data = buffer;
> > +		data.raw = &raw;
> > +	}
> > +
>
> Need to add the padding: raw.size = sizeof(u64) * size + sizeof(u32);
The u32 size header in the raw sampling data layout leads to unaligned
memory access for 64 bit values. No matter whether the padding is
inserted at the beginning or at the end of the raw sample, it
introduces a 32 bit offset on either the sending or the receiving side.
Wouldn't it be better to simply make size a u64, or to add another
reserved u32 value to the header? This could introduce some overhead
on smaller architectures, but currently this is only used on x86. The
event size encoding of the ring_buffer implementation could be another
alternative.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]
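For illustration, the padding discussed above could give the raw IBS sample roughly the following shape; the struct and field names are made up for this sketch, only the alignment idea is taken from the thread:
/*
 * perf prefixes the raw payload with a u32 size field.  Four unused
 * bytes at the start of the payload bring the following u64 register
 * values back to 8 byte alignment in the ring buffer.
 */
struct ibs_raw_sample {
	u32	padding;	/* alignment only */
	u64	regs[];		/* IBSFETCHCTL/IBSOPCTL, linear address, ... */
};
/* the raw record size then covers padding plus registers:	*/
/*	raw.size = sizeof(u32) + sizeof(u64) * size;		*/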
On Wed, Apr 21, 2010 at 6:58 PM, Robert Richter <[email protected]> wrote:
> On 21.04.10 13:37:27, Stephane Eranian wrote:
>> Why do you need model_spec, in addition to your special encoding?
>>
>> > /*
>> > + * Model specific hardware events
>> > + *
>> > + * With the attr.model_spec bit set we can set up hardware events
>> > + * other than generic performance counters. A special PMU 64 bit
>> > + * config value can be passed through the perf_event interface. The
>> > + * concept of PMU model-specific arguments was already practiced in
>> > + * Perfmon2. The type of event (8 bits) is determined from the config
>> > + * value too, bits 32-39 are reserved for this.
>> > + */
>> Isn't the config field big enough to encode all the information you need?
>> In the kernel, you could check bit 32-39 and based on host CPU determine
>> whether it refers to IBS or is a bogus value. I am trying to figure out what
>> model_spec buys you. I believe RAW does not mean the final value as
>> accepted by HW but a value that must be interpreted by the model-specific
>> code to eventually come up with a raw HW value. In the current code, the
>> RAW value is never passed as is, it is assembled from various bits and
>> pieces incl. attr.config of course.
>
> The raw config value without the model_spec bit set would assume a
> config value for a generic x86 counter. We could also reuse an unused
> bit in this config value, but what if this bit is used at some point?
I have not yet seen a definition for what PERF_TYPE_RAW actually
means, unfortunately. But I would not necessarily draw the same
conclusion you did for raw without model_spec. I think the
interpretation of the raw config is up to the machine/model specific
layer; it could be a counting event or something else.
To use IBS at all, it seems the app needs to be aware that it is
available on the host, with or without model_spec.
You know that on a Barcelona-class system, you will never need more
than 42 bits. So you could reuse bits 63-42 to encode anything you
want. When you go to another class of system which may use more than
42 bits, then the app will have to know the type of host to use any
model-specific feature, and thus it is also safe to assume it will
know how to encode those features for that host. I don't see how
model_spec would solve that particular problem.
To avoid the problem with a full 64-bit raw config, you'd have to have
another u64 model_spec field and not just a bit to encode the feature
type, i.e., what you load in bits 32-63 today.
> Or if it is available on one hw revision but not another? So I found
> it much cleaner to introduce this attribute flag, which can be reused
> on other architectures. Also, you then have the freedom to implement
> model specific generic events without using raw_config. As most pmu
> features are set up with a 64 bit config value, I would prefer to also
> have the encoding of the model specific event type outside of the
> config value. I want to be close to the hw register layout without
> shifting bits back and forth. This may also cause trouble in the
> future if we need all 64 bits.
>
> -Robert
>
> --
> Advanced Micro Devices, Inc.
> Operating System Research Center
> email: [email protected]
>
>
On 13.04.10 22:23:09, Robert Richter wrote:
> This patch series introduces model specific events and impments AMD
> IBS (Instruction Based Sampling) for perf_events.
Peter,
I suggest applying patches 1, 2, 3, 5 and 6 to tip/perf/core if you
don't have objections. I will then resubmit a new version of all the
remaining patches.
-Robert
--
Advanced Micro Devices, Inc.
Operating System Research Center
email: [email protected]