2009-10-19 15:03:52

by Stephane Eranian

Subject: [PATCH] perf_events: improve Intel event scheduling

This patch improves Intel event scheduling by maximizing
the use of PMU registers regardless of the order in which
events are submitted.

The algorithm takes into account the list of counter constraints
for each event. It assigns events to counters from the most
constrained (i.e., events that work on only one counter) to the
least constrained (i.e., events that work on any counter).

Fixed counter events and the BTS special event are also handled via
this algorithm, which is designed to be fairly generic.

The patch also updates the validation of an event to use the
scheduling algorithm. This will cause early failure in
perf_event_open().

Signed-off-by: Stephane Eranian <[email protected]>
---
arch/x86/include/asm/perf_event.h | 6 +-
arch/x86/kernel/cpu/perf_event.c | 497 +++++++++++++++++++++++--------------
2 files changed, 318 insertions(+), 185 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8d9f854..7c737af 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -26,7 +26,9 @@
/*
* Includes eventsel and unit mask as well:
*/
-#define ARCH_PERFMON_EVENT_MASK 0xffff
+#define ARCH_PERFMON_EVENTSEL_EVENT_MASK 0x00ff
+#define ARCH_PERFMON_EVENTSEL_UNIT_MASK 0xff00
+#define ARCH_PERFMON_EVENT_MASK (ARCH_PERFMON_EVENTSEL_UNIT_MASK|ARCH_PERFMON_EVENTSEL_EVENT_MASK)

/*
* filter mask to validate fixed counter events.
@@ -38,6 +40,8 @@
* The any-thread option is supported starting with v3.
*/
#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
+#define ARCH_PERFMON_FIXED_EVENT_MASK (ARCH_PERFMON_EVENT_FILTER_MASK|ARCH_PERFMON_EVENT_MASK)
+

#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2e20bca..0f96c51 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <[email protected]>
* Copyright (C) 2009 Intel Corporation, <[email protected]>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
@@ -68,6 +69,15 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};

+#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
+
+struct event_constraint {
+ u64 idxmsk[BITS_TO_U64(X86_PMC_IDX_MAX)];
+ int code;
+ int mask;
+ int weight;
+};
+
struct cpu_hw_events {
struct perf_event *events[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -75,19 +85,23 @@ struct cpu_hw_events {
unsigned long interrupts;
int enabled;
struct debug_store *ds;
-};

-struct event_constraint {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int code;
+ int n_events;
+ struct event_constraint *constraints[X86_PMC_IDX_MAX];
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in order they are collected */
};

-#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
-#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
+#define EVENT_CONSTRAINT(c, n, w, m) { \
+ .code = (c), \
+ .mask = (m), \
+ .weight = (w), \
+ .idxmsk[0] = (n) }

-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->idxmsk[0]; (e)++)
+#define EVENT_CONSTRAINT_END \
+ { .code = 0, .mask = 0, .weight = 0, .idxmsk[0] = 0 }

+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->weight; (e)++)

/*
* struct x86_pmu - generic x86 pmu
@@ -114,8 +128,7 @@ struct x86_pmu {
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
- int (*get_event_idx)(struct cpu_hw_events *cpuc,
- struct hw_perf_event *hwc);
+ struct event_constraint *(*get_event_constraints)(struct perf_event *event);
};

static struct x86_pmu x86_pmu __read_mostly;
@@ -124,7 +137,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};

-static const struct event_constraint *event_constraints;
+static struct event_constraint *event_constraints;

/*
* Not sure about some of these
@@ -171,14 +184,14 @@ static u64 p6_pmu_raw_event(u64 hw_event)
return hw_event & P6_EVNTSEL_MASK;
}

-static const struct event_constraint intel_p6_event_constraints[] =
+static struct event_constraint intel_p6_event_constraints[] =
{
- EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0xc1, 0x1, 1, 0xff), /* FLOPS */
+ EVENT_CONSTRAINT(0x10, 0x1, 1, 0xff), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x1, 1, 0xff), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, 1, 0xff), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, 1, 0xff), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, 1, 0xff), /* CYCLES_DIV_BUSY */
EVENT_CONSTRAINT_END
};

@@ -196,32 +209,35 @@ static const u64 intel_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};

-static const struct event_constraint intel_core_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+static struct event_constraint intel_core_event_constraints[] =
+{
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x10, 0x1, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* FP_COMP_OPS_EXE */
+ EVENT_CONSTRAINT(0x11, 0x2, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* FP_ASSIST */
+ EVENT_CONSTRAINT(0x12, 0x2, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* MUL */
+ EVENT_CONSTRAINT(0x13, 0x2, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* DIV */
+ EVENT_CONSTRAINT(0x14, 0x1, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT(0x18, 0x1, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* IDLE_DURING_DIV */
+ EVENT_CONSTRAINT(0x19, 0x2, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* DELAYED_BYPASS */
+ EVENT_CONSTRAINT(0xa1, 0x1, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* RS_UOPS_DISPATCH_CYCLES */
+ EVENT_CONSTRAINT(0xcb, 0x1, 1, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* MEM_LOAD_RETIRED */
EVENT_CONSTRAINT_END
};

-static const struct event_constraint intel_nehalem_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
- EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
+static struct event_constraint intel_nehalem_event_constraints[] = {
+ EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* INSTRUCTIONS_RETIRED */
+ EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* UNHALTED_CORE_CYCLES */
+ EVENT_CONSTRAINT(0x40, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LD */
+ EVENT_CONSTRAINT(0x41, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_ST */
+ EVENT_CONSTRAINT(0x42, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LOCK */
+ EVENT_CONSTRAINT(0x43, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_ALL_REF */
+ EVENT_CONSTRAINT(0x4e, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_PREFETCH */
+ EVENT_CONSTRAINT(0x4c, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* LOAD_HIT_PRE */
+ EVENT_CONSTRAINT(0x51, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D */
+ EVENT_CONSTRAINT(0x52, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0x53, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */
+ EVENT_CONSTRAINT(0xc5, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* CACHE_LOCK_CYCLES */
EVENT_CONSTRAINT_END
};

@@ -1120,9 +1136,15 @@ static void amd_pmu_disable_all(void)

void hw_perf_disable(void)
{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
if (!x86_pmu_initialized())
return;
- return x86_pmu.disable_all();
+
+ if (cpuc->enabled)
+ cpuc->n_events = 0;
+
+ x86_pmu.disable_all();
}

static void p6_pmu_enable_all(void)
@@ -1391,124 +1413,6 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
x86_pmu_enable_event(hwc, idx);
}

-static int fixed_mode_idx(struct hw_perf_event *hwc)
-{
- unsigned int hw_event;
-
- hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (hwc->sample_period == 1)))
- return X86_PMC_IDX_FIXED_BTS;
-
- if (!x86_pmu.num_events_fixed)
- return -1;
-
- /*
- * fixed counters do not take all possible filters
- */
- if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
- return -1;
-
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
- return -1;
-}
-
-/*
- * generic counter allocator: get next free counter
- */
-static int
-gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
- return idx == x86_pmu.num_events ? -1 : idx;
-}
-
-/*
- * intel-specific counter allocator: check event constraints
- */
-static int
-intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- const struct event_constraint *event_constraint;
- int i, code;
-
- if (!event_constraints)
- goto skip;
-
- code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
-
- for_each_event_constraint(event_constraint, event_constraints) {
- if (code == event_constraint->code) {
- for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_and_set_bit(i, cpuc->used_mask))
- return i;
- }
- return -1;
- }
- }
-skip:
- return gen_get_event_idx(cpuc, hwc);
-}
-
-static int
-x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = fixed_mode_idx(hwc);
- if (idx == X86_PMC_IDX_FIXED_BTS) {
- /* BTS is already occupied. */
- if (test_and_set_bit(idx, cpuc->used_mask))
- return -EAGAIN;
-
- hwc->config_base = 0;
- hwc->event_base = 0;
- hwc->idx = idx;
- } else if (idx >= 0) {
- /*
- * Try to get the fixed event, if that is already taken
- * then try to get a generic event:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic event again */
- if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = x86_pmu.get_event_idx(cpuc, hwc);
- if (idx == -1)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
-
- return idx;
-}
-
/*
* Find a PMC slot for the freshly enabled / scheduled in event:
*/
@@ -1518,7 +1422,7 @@ static int x86_pmu_enable(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int idx;

- idx = x86_schedule_event(cpuc, hwc);
+ idx = hwc->idx;
if (idx < 0)
return idx;

@@ -1958,6 +1862,224 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_STOP;
}

+static struct event_constraint bts_constraint={
+ .code = 0,
+ .mask = 0,
+ .weight = 1,
+ .idxmsk[0] = 1ULL << X86_PMC_IDX_FIXED_BTS
+};
+
+static struct event_constraint *intel_special_constraints(struct perf_event *event)
+{
+ unsigned int hw_event;
+
+ hw_event = event->hw.config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely((hw_event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (event->hw.sample_period == 1)))
+ return &bts_constraint;
+
+ return NULL;
+}
+
+static struct event_constraint *intel_get_event_constraints(struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_special_constraints(event);
+ if (c)
+ return c;
+
+ if (event_constraints)
+ for_each_event_constraint(c, event_constraints) {
+ if ((event->hw.config & c->mask) == c->code)
+ return c;
+ }
+
+ return NULL;
+}
+
+static struct event_constraint *amd_get_event_constraints(struct perf_event *event)
+{
+ return NULL;
+}
+
+static int schedule_events(struct cpu_hw_events *cpuhw, int n, bool assign)
+{
+ int i, j , w, num, lim;
+ int weight, wmax;
+ struct event_constraint *c;
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int assignments[X86_PMC_IDX_MAX];
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+ /*
+ * weight = number of possible counters
+ *
+ * 1 = most constrained, only works on one counter
+ * wmax = least constrained, works on 1 fixed counter
+ * or any generic counter
+ *
+ * assign events to counters starting with most
+ * constrained events.
+ */
+ wmax = 1 + x86_pmu.num_events;
+ num = n;
+ for(w=1; num && w <= wmax; w++) {
+
+ /* for each event */
+ for(i=0; i < n; i++) {
+ c = cpuhw->constraints[i];
+ hwc = &cpuhw->event_list[i]->hw;
+
+ weight = c ? c->weight : x86_pmu.num_events;
+ if (weight != w)
+ continue;
+
+ /*
+ * try to reuse previous assignment
+ *
+ * This is possible despite the fact that
+ * events or events order may have changed.
+ *
+ * What matters is the level of constraints
+ * of an event and this is constant for now.
+ *
+ * This is possible also because we always
+ * scan from most to least constrained. Thus,
+ * if a counter can be reused, it means no
+ * more constrained events needed it. And
+ * next events will either compete for it
+ * (which cannot be solved anyway) or they
+ * have fewer constraints, and they can use
+ * another counter.
+ */
+ j = hwc->idx;
+ if (j != -1 && !test_bit(j, used_mask))
+ goto skip;
+
+ if (c) {
+ lim = X86_PMC_IDX_MAX;
+ for_each_bit(j, (unsigned long *)c->idxmsk, lim)
+ if (!test_bit(j, used_mask))
+ break;
+
+ } else {
+ lim = x86_pmu.num_events;
+ /*
+ * fixed counter events have necessarily a
+ * constraint thus we come here only for
+ * generic counters and thus we limit the
+ * scan to those
+ */
+ j = find_first_zero_bit(used_mask, lim);
+ }
+ if (j == lim)
+ return -EAGAIN;
+skip:
+ set_bit(j, used_mask);
+ assignments[i] = j;
+ num--;
+ }
+ }
+ if (num)
+ return -ENOSPC;
+
+ /* just simulate scheduling */
+ if (!assign)
+ return 0;
+
+ /*
+ * commit assignments
+ */
+ for(i=0; i < n; i++) {
+ hwc = &cpuhw->event_list[i]->hw;
+
+ hwc->idx = assignments[i];
+
+ set_bit(hwc->idx, cpuhw->used_mask);
+
+ if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->event_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ } else {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ }
+ }
+ cpuhw->n_events = n;
+ return 0;
+}
+
+static int collect_events(struct cpu_hw_events *cpuhw, struct perf_event *leader)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+
+ /* current number of events already accepted */
+ n = cpuhw->n_events;
+
+ if (!is_software_event(leader)) {
+ if (n >= max_count)
+ return -ENOSPC;
+ cpuhw->constraints[n] = x86_pmu.get_event_constraints(leader);
+ cpuhw->event_list[n] = leader;
+ n++;
+ }
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (is_software_event(event) ||
+ event->state == PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -ENOSPC;
+
+ cpuhw->constraints[n] = x86_pmu.get_event_constraints(event);
+ cpuhw->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 0 if the group was scheduled, or an error code if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_event *leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, int cpu)
+{
+ struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
+ int n, ret;
+
+ n = collect_events(cpuhw, leader);
+ if (n < 0)
+ return n;
+
+ ret = schedule_events(cpuhw, n, true);
+ if (ret)
+ return ret;
+
+ /* 0 means successful and enable each event in caller */
+ return 0;
+}
+
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
@@ -1989,7 +2111,7 @@ static struct x86_pmu p6_pmu = {
*/
.event_bits = 32,
.event_mask = (1ULL << 32) - 1,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints
};

static struct x86_pmu intel_pmu = {
@@ -2013,7 +2135,7 @@ static struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.enable_bts = intel_pmu_enable_bts,
.disable_bts = intel_pmu_disable_bts,
- .get_event_idx = intel_get_event_idx,
+ .get_event_constraints = intel_get_event_constraints
};

static struct x86_pmu amd_pmu = {
@@ -2034,7 +2156,7 @@ static struct x86_pmu amd_pmu = {
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
- .get_event_idx = gen_get_event_idx,
+ .get_event_constraints = amd_get_event_constraints
};

static int p6_pmu_init(void)
@@ -2123,8 +2245,8 @@ static int intel_pmu_init(void)
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));

- pr_cont("Core2 events, ");
event_constraints = intel_core_event_constraints;
+ pr_cont("Core2 events, ");
break;
default:
case 26:
@@ -2224,36 +2346,43 @@ static const struct pmu pmu = {
.unthrottle = x86_pmu_unthrottle,
};

-static int
-validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event fake_event = event->hw;
-
- if (event->pmu != &pmu)
- return 0;
-
- return x86_schedule_event(cpuc, &fake_event);
-}
-
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ * - check events are compatible with each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it were the only group available.
+ */
static int validate_group(struct perf_event *event)
{
- struct perf_event *sibling, *leader = event->group_leader;
+ struct perf_event *leader = event->group_leader;
struct cpu_hw_events fake_pmu;
+ int n, ret;

memset(&fake_pmu, 0, sizeof(fake_pmu));

- if (!validate_event(&fake_pmu, leader))
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = collect_events(&fake_pmu, leader);
+ if (n < 0)
return -ENOSPC;

- list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
- if (!validate_event(&fake_pmu, sibling))
- return -ENOSPC;
- }
+ fake_pmu.n_events = n;

- if (!validate_event(&fake_pmu, event))
+ n = collect_events(&fake_pmu, event);
+ if (n < 0)
return -ENOSPC;

- return 0;
+ ret = schedule_events(&fake_pmu, n, false);
+ return ret ? -ENOSPC : 0;
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
--
1.5.4.3


2009-11-18 16:32:52

by Peter Zijlstra

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

Sorry on the delay.

In general I'd very much like to see some of this generalized, because I
think Sparc64 has very similar 'simple' constraints; I'll have to defer
to Paul on how much of this could possibly be re-used for powerpc.

On Mon, 2009-10-19 at 17:03 +0200, Stephane Eranian wrote:
> arch/x86/include/asm/perf_event.h | 6 +-
> arch/x86/kernel/cpu/perf_event.c | 497 +++++++++++++++++++++++--------------
> 2 files changed, 318 insertions(+), 185 deletions(-)
>
> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
> index 8d9f854..7c737af 100644
> --- a/arch/x86/include/asm/perf_event.h
> +++ b/arch/x86/include/asm/perf_event.h
> @@ -26,7 +26,9 @@
> /*
> * Includes eventsel and unit mask as well:
> */
> -#define ARCH_PERFMON_EVENT_MASK 0xffff
> +#define ARCH_PERFMON_EVENTSEL_EVENT_MASK 0x00ff
> +#define ARCH_PERFMON_EVENTSEL_UNIT_MASK 0xff00
> +#define ARCH_PERFMON_EVENT_MASK (ARCH_PERFMON_EVENTSEL_UNIT_MASK|ARCH_PERFMON_EVENTSEL_EVENT_MASK)

There are various forms of the above throughout the code; it would be nice
not to add more.

And in general this patch has way too many >80 lines and other style
nits, but those can be fixed up easily enough I guess.

> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 2e20bca..0f96c51 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c

> @@ -68,6 +69,15 @@ struct debug_store {
> u64 pebs_event_reset[MAX_PEBS_EVENTS];
> };
>
> +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))

Do we need this, is it realistic to expect X86_PMC_IDX_MAX to be
anything else than 64?

> -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
> -#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
> +#define EVENT_CONSTRAINT(c, n, w, m) { \
> + .code = (c), \
> + .mask = (m), \
> + .weight = (w), \
> + .idxmsk[0] = (n) }

If not, we can do away with the weight argument here and use hweight64()
which should reduce to a compile time constant.
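
Something like this, say (sketch only; assumes a popcount helper the
compiler can constant-fold, e.g. a HWEIGHT64()-style macro or
__builtin_popcountll()):

/* sketch: derive the weight from the index mask instead of passing it in */
#define EVENT_CONSTRAINT(c, n, m) {             \
        .code      = (c),                       \
        .mask      = (m),                       \
        .idxmsk[0] = (n),                       \
        .weight    = HWEIGHT64((u64)(n)) }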

> @@ -124,7 +137,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
> .enabled = 1,
> };
>
> -static const struct event_constraint *event_constraints;
> +static struct event_constraint *event_constraints;

I'm thinking this ought to live in x86_pmu, or possible, if we can
generalize this enough, in pmu.

> +static struct event_constraint intel_core_event_constraints[] =
> +{

Inconsistent style with below:

> +static struct event_constraint intel_nehalem_event_constraints[] = {
> + EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* INSTRUCTIONS_RETIRED */
> + EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* UNHALTED_CORE_CYCLES */
> + EVENT_CONSTRAINT(0x40, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LD */
> + EVENT_CONSTRAINT(0x41, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_ST */
> + EVENT_CONSTRAINT(0x42, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LOCK */
> + EVENT_CONSTRAINT(0x43, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_ALL_REF */
> + EVENT_CONSTRAINT(0x4e, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_PREFETCH */
> + EVENT_CONSTRAINT(0x4c, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* LOAD_HIT_PRE */
> + EVENT_CONSTRAINT(0x51, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D */
> + EVENT_CONSTRAINT(0x52, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
> + EVENT_CONSTRAINT(0x53, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* L1D_CACHE_LOCK_FB_HIT */
> + EVENT_CONSTRAINT(0xc5, 0x3, 2, ARCH_PERFMON_EVENTSEL_EVENT_MASK), /* CACHE_LOCK_CYCLES */
> EVENT_CONSTRAINT_END
> };

Would be nice to get rid of that EVENT_MASK part, maybe write
EVENT_CONSTRAINT like:

.mask = ARCH_PERFMON_EVENTSEL_EVENT_MASK | ((n)>>32 ? ARCH_PERFMON_FIXED_MASK : 0),

Which ought to work for Intel based things, AMD will need a different
base event mask.
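
I.e., roughly (sketch only; reusing the ARCH_PERFMON_FIXED_EVENT_MASK name
from the patch, and keeping the macro Intel-only as noted):

/* sketch: infer the comparison mask from the index mask */
#define INTEL_EVENT_CONSTRAINT(c, n, w) {                               \
        .code      = (c),                                               \
        .weight    = (w),                                               \
        .idxmsk[0] = (n),                                               \
        .mask      = ARCH_PERFMON_EVENTSEL_EVENT_MASK |                 \
                     ((u64)(n) >> 32 ? ARCH_PERFMON_FIXED_EVENT_MASK : 0) }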

> @@ -1120,9 +1136,15 @@ static void amd_pmu_disable_all(void)
>
> void hw_perf_disable(void)
> {
> + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +
> if (!x86_pmu_initialized())
> return;
> - return x86_pmu.disable_all();
> +
> + if (cpuc->enabled)
> + cpuc->n_events = 0;
> +
> + x86_pmu.disable_all();
> }

Right, so I cannot directly see the above being correct. You fully erase
the PMU state when we disable it, but things like single counter
add/remove doesn't reprogram the full PMU afaict.

The powerpc code has n_added, which indicates a delta algorithm is used
there.
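
For reference, the powerpc-style delta scheme would look roughly like this
on the x86 side (sketch only; n_added is a hypothetical field, not part of
this patch):

void hw_perf_enable(void)
{
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

        if (!x86_pmu_initialized())
                return;

        /* only rerun the constraint scheduler if the event set changed */
        if (cpuc->n_added) {
                schedule_events(cpuc, cpuc->n_events, true);
                cpuc->n_added = 0;
        }

        x86_pmu.enable_all();
}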

> +static struct event_constraint *intel_get_event_constraints(struct perf_event *event)
> +{
> + struct event_constraint *c;
> +
> + c = intel_special_constraints(event);
> + if (c)
> + return c;
> +
> + if (event_constraints)
> + for_each_event_constraint(c, event_constraints) {
> + if ((event->hw.config & c->mask) == c->code)
> + return c;
> + }

This wants extra { }, since it's a multi-line statement.
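
I.e. something like (sketch of the style fix only):

        if (event_constraints) {
                for_each_event_constraint(c, event_constraints) {
                        if ((event->hw.config & c->mask) == c->code)
                                return c;
                }
        }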

> + return NULL;
> +}
> +
> +static struct event_constraint *amd_get_event_constraints(struct perf_event *event)
> +{
> + return NULL;
> +}

I guess we'll need to fill that out a bit more, but that can be another
patch.

> +static int schedule_events(struct cpu_hw_events *cpuhw, int n, bool assign)
> +{
> + int i, j , w, num, lim;
> + int weight, wmax;
> + struct event_constraint *c;
> + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
> + int assignments[X86_PMC_IDX_MAX];
> + struct hw_perf_event *hwc;
> +
> + bitmap_zero(used_mask, X86_PMC_IDX_MAX);
> +
> + /*
> + * weight = number of possible counters
> + *
> + * 1 = most constrained, only works on one counter
> + * wmax = least constrained, works on 1 fixed counter
> + * or any generic counter
> + *
> + * assign events to counters starting with most
> + * constrained events.
> + */
> + wmax = 1 + x86_pmu.num_events;
> + num = n;
> + for(w=1; num && w <= wmax; w++) {
> +
> + /* for each event */
> + for(i=0; i < n; i++) {
> + c = cpuhw->constraints[i];
> + hwc = &cpuhw->event_list[i]->hw;
> +
> + weight = c ? c->weight : x86_pmu.num_events;
> + if (weight != w)
> + continue;
> +
> + /*
> + * try to reuse previous assignment
> + *
> + * This is possible despite the fact that
> + * events or events order may have changed.
> + *
> + * What matters is the level of constraints
> + * of an event and this is constant for now.
> + *
> + * This is possible also because we always
> + * scan from most to least constrained. Thus,
> + * if a counter can be reused, it means no,
> + * more constrained events, needed it. And
> + * next events will either compete for it
> + * (which cannot be solved anyway) or they
> + * have fewer constraints, and they can use
> + * another counter.
> + */
> + j = hwc->idx;
> + if (j != -1 && !test_bit(j, used_mask))
> + goto skip;
> +
> + if (c) {
> + lim = X86_PMC_IDX_MAX;
> + for_each_bit(j, (unsigned long *)c->idxmsk, lim)
> + if (!test_bit(j, used_mask))
> + break;
> +
> + } else {
> + lim = x86_pmu.num_events;
> + /*
> + * fixed counter events have necessarily a
> + * constraint thus we come here only for
> + * generic counters and thus we limit the
> + * scan to those
> + */
> + j = find_first_zero_bit(used_mask, lim);
> + }
> + if (j == lim)
> + return -EAGAIN;
> +skip:
> + set_bit(j, used_mask);
> + assignments[i] = j;
> + num--;
> + }
> + }
> + if (num)
> + return -ENOSPC;
> +
> + /* just simulate scheduling */
> + if (!assign)
> + return 0;
> +
> + /*
> + * commit assignments
> + */
> + for(i=0; i < n; i++) {
> + hwc = &cpuhw->event_list[i]->hw;
> +
> + hwc->idx = assignments[i];
> +
> + set_bit(hwc->idx, cpuhw->used_mask);
> +
> + if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
> + hwc->config_base = 0;
> + hwc->event_base = 0;
> + } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
> + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
> + /*
> + * We set it so that event_base + idx in wrmsr/rdmsr maps to
> + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
> + */
> + hwc->event_base =
> + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
> + } else {
> + hwc->config_base = x86_pmu.eventsel;
> + hwc->event_base = x86_pmu.perfctr;
> + }
> + }
> + cpuhw->n_events = n;
> + return 0;
> +}

Straightforward O(n^2) algorithm looking for the best match; seems good
from a single read -- will go over it again on another day to find more
details.

> +static int collect_events(struct cpu_hw_events *cpuhw, struct perf_event *leader)
> +{
> + struct perf_event *event;
> + int n, max_count;
> +
> + max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
> +
> + /* current number of events already accepted */
> + n = cpuhw->n_events;
> +
> + if (!is_software_event(leader)) {

With things like the hw-breakpoint stuff also growing a pmu,
!is_software() isn't strong enough; something like:

static inline int is_x86_event(struct perf_event *event)
{
return event->pmu == &pmu;
}

Should work though.

> + if (n >= max_count)
> + return -ENOSPC;
> + cpuhw->constraints[n] = x86_pmu.get_event_constraints(leader);
> + cpuhw->event_list[n] = leader;
> + n++;
> + }
> +
> + list_for_each_entry(event, &leader->sibling_list, group_entry) {
> + if (is_software_event(event) ||
> + event->state == PERF_EVENT_STATE_OFF)
> + continue;
> +
> + if (n >= max_count)
> + return -ENOSPC;
> +
> + cpuhw->constraints[n] = x86_pmu.get_event_constraints(event);
> + cpuhw->event_list[n] = event;
> + n++;
> + }
> + return n;
> +}
> +
> +/*
> + * Called to enable a whole group of events.
> + * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
> + * Assumes the caller has disabled interrupts and has
> + * frozen the PMU with hw_perf_save_disable.
> + */
> +int hw_perf_group_sched_in(struct perf_event *leader,
> + struct perf_cpu_context *cpuctx,
> + struct perf_event_context *ctx, int cpu)
> +{
> + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
> + int n, ret;
> +
> + n = collect_events(cpuhw, leader);
> + if (n < 0)
> + return n;
> +
> + ret = schedule_events(cpuhw, n, true);
> + if (ret)
> + return ret;
> +
> + /* 0 means successful and enable each event in caller */
> + return 0;
> +}

This is where powerpc does n_added += n, and it delays the
schedule_events() bit to hw_perf_enable() conditional on n_added > 0.
When you remove events it simply keeps the current layout and disables
the one.
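
Transposed to this patch that would look roughly like the following
(sketch only; n_added is a hypothetical field here as well):

int hw_perf_group_sched_in(struct perf_event *leader,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx, int cpu)
{
        struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
        int n, ret, n0 = cpuhw->n_events;

        n = collect_events(cpuhw, leader);
        if (n < 0)
                return n;

        /* dry run only: can the enlarged set still be scheduled? */
        ret = schedule_events(cpuhw, n, false);
        if (ret)
                return ret;

        /* defer the real assignment to hw_perf_enable() */
        cpuhw->n_events = n;
        cpuhw->n_added += n - n0;

        return 0;
}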

> @@ -2123,8 +2245,8 @@ static int intel_pmu_init(void)
> memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
> sizeof(hw_cache_event_ids));
>
> - pr_cont("Core2 events, ");
> event_constraints = intel_core_event_constraints;
> + pr_cont("Core2 events, ");
> break;
> default:
> case 26:

Not that I object to the above change, but it seems out of place in this
patch.

> +/*
> + * validate a single event group
> + *
> + * validation include:
> + * - check events are compatible which each other
> + * - events do not compete for the same counter
> + * - number of events <= number of counters
> + *
> + * validation ensures the group can be loaded onto the
> + * PMU if it were the only group available.
> + */
> static int validate_group(struct perf_event *event)
> {
> - struct perf_event *sibling, *leader = event->group_leader;
> + struct perf_event *leader = event->group_leader;
> struct cpu_hw_events fake_pmu;
> + int n, ret;
>
> memset(&fake_pmu, 0, sizeof(fake_pmu));
>
> - if (!validate_event(&fake_pmu, leader))
> + /*
> + * the event is not yet connected with its
> + * siblings thererfore we must first collect
> + * existing siblings, then add the new event
> + * before we can simulate the scheduling
> + */
> + n = collect_events(&fake_pmu, leader);
> + if (n < 0)
> return -ENOSPC;
>
> - list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
> - if (!validate_event(&fake_pmu, sibling))
> - return -ENOSPC;
> - }
> + fake_pmu.n_events = n;
>
> - if (!validate_event(&fake_pmu, event))
> + n = collect_events(&fake_pmu, event);
> + if (n < 0)
> return -ENOSPC;
>
> - return 0;
> + ret = schedule_events(&fake_pmu, n, false);
> + return ret ? -ENOSPC : 0;
> }

This seems good.

2009-12-11 11:00:19

by Stephane Eranian

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Wed, Nov 18, 2009 at 5:32 PM, Peter Zijlstra <[email protected]> wrote:

> In general I'd very much like to see some of this generalized because I
> think Sparc64 has very similar 'simple' constraints, I'll have to defer
> to Paul on how much of this could possibly be re-used for powerpc.
>
Let's wait until we have all X86 constraint scheduling code. There is
way more needed. I have a first cut on the AMD64 constraints but
there is an issue in the generic/arch specific API that needs to be
flushed out first (see below).

> On Mon, 2009-10-19 at 17:03 +0200, Stephane Eranian wrote:
>> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
>> index 8d9f854..7c737af 100644
>> --- a/arch/x86/include/asm/perf_event.h
>> +++ b/arch/x86/include/asm/perf_event.h
>> @@ -26,7 +26,9 @@
>>  /*
>>   * Includes eventsel and unit mask as well:
>>   */
>> -#define ARCH_PERFMON_EVENT_MASK                                  0xffff
>> +#define ARCH_PERFMON_EVENTSEL_EVENT_MASK                 0x00ff
>> +#define ARCH_PERFMON_EVENTSEL_UNIT_MASK                          0xff00
>> +#define ARCH_PERFMON_EVENT_MASK                                  (ARCH_PERFMON_EVENTSEL_UNIT_MASK|ARCH_PERFMON_EVENTSEL_EVENT_MASK)
>
> There's various forms of the above throughout the code, it would be nice
> not to add more..
>
I will simplify this.
>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index 2e20bca..0f96c51 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>
>> @@ -68,6 +69,15 @@ struct debug_store {
>>       u64     pebs_event_reset[MAX_PEBS_EVENTS];
>>  };
>>
>> +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
>
> Do we need this, is it realistic to expect X86_PMC_IDX_MAX to be
> anything else than 64?
>
The issue had to do with i386 mode, where longs are 32 bits < 64, and in
particular with the initializer .idxmsk[0] = V. In the future we may exceed
32 registers. That means the initializer would have to change. But I guess
we have quite some way to go before this case is reached. So I will revert
all of this to unsigned long.



>> -#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
>> -#define EVENT_CONSTRAINT_END  { .code = 0, .idxmsk[0] = 0 }
>> +#define EVENT_CONSTRAINT(c, n, w, m) { \
>> +     .code = (c),    \
>> +     .mask = (m),    \
>> +     .weight = (w),  \
>> +     .idxmsk[0] = (n) }
>
> If not, we can do away with the weight argument here and use hweight64()
> which should reduce to a compile time constant.
>
I have dropped weight and I use bitmap_weight() to recompute it; with all
the inlining it should come out to be a simple popcnt instruction.
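
Something like this, for the record (sketch only; the cast goes away if
idxmsk ends up as unsigned long):

/* sketch: compute the weight where it is needed instead of storing it */
static inline int constraint_weight(struct event_constraint *c)
{
        return bitmap_weight((unsigned long *)c->idxmsk, X86_PMC_IDX_MAX);
}

so that schedule_events() can do

        weight = c ? constraint_weight(c) : x86_pmu.num_events;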


>> @@ -124,7 +137,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
>>         .enabled = 1,
>>  };
>>
>> -static const struct event_constraint *event_constraints;
>> +static struct event_constraint *event_constraints;
>
> I'm thinking this ought to live in x86_pmu, or possible, if we can
> generalize this enough, in pmu.

True.

> Inconsistent style with below:
>
>> +static struct event_constraint intel_nehalem_event_constraints[] = {
>> +     EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32)), 3, ARCH_PERFMON_FIXED_EVENT_MASK), /* INSTRUCTIONS_RETIRED */
>
> Would be nice to get rid of that EVENT_MASK part, maybe write
> EVENT_CONSTRAINT like:
>
>  .mask = ARCH_PERFMON_EVENTSEL_EVENT_MASK | ((n)>>32 ? ARCH_PERFMON_FIXED_MASK : 0),
>
> Which ought to work for Intel based things, AMD will need a different
> base event mask.
>
Will try to clean this up. AMD does not need a mask; it is a completely
different kind of constraint.

>> @@ -1120,9 +1136,15 @@ static void amd_pmu_disable_all(void)
>>
>>  void hw_perf_disable(void)
>>  {
>> +     struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>> +
>>       if (!x86_pmu_initialized())
>>               return;
>> -     return x86_pmu.disable_all();
>> +
>> +     if (cpuc->enabled)
>> +             cpuc->n_events = 0;
>> +
>> +     x86_pmu.disable_all();
>>  }
>
> Right, so I cannot directly see the above being correct. You fully erase
> the PMU state when we disable it, but things like single counter
> add/remove doesn't reprogram the full PMU afaict.
>
Here is the key point. There is no clear API that says 'this is the final
stop for this event group, i.e., the next start will be with a different
group'. In other words, there is no signal that the 'resource' can be freed.
This is a showstopper for the AMD constraints. The generic code needs to
signal that this is the final stop for the event group, so the machine
specific layer can release the registers. hw_perf_disable() is used in many
places and does not indicate the same thing as what I just described.

>> +static int collect_events(struct cpu_hw_events *cpuhw, struct perf_event *leader)
>> +{
>> +     struct perf_event *event;
>> +     int n, max_count;
>> +
>> +     max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
>> +
>> +     /* current number of events already accepted */
>> +     n = cpuhw->n_events;
>> +
>> +     if (!is_software_event(leader)) {
>
> With things like the hw-breakpoint stuff also growing a pmu !
> is_software() isn't strong enough, something like:
>
> static inline int is_x86_event(struct perf_event *event)
> {
>        return event->pmu == &pmu;
> }
>
Agreed, I am using something like this now.

>> +int hw_perf_group_sched_in(struct perf_event *leader,
>> +            struct perf_cpu_context *cpuctx,
>> +            struct perf_event_context *ctx, int cpu)
>> +{
>> +     struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
>> +     int n, ret;
>> +
>> +     n = collect_events(cpuhw, leader);
>> +     if (n < 0)
>> +             return n;
>> +
>> +     ret = schedule_events(cpuhw, n, true);
>> +     if (ret)
>> +             return ret;
>> +
>> +     /* 0 means successful and enable each event in caller */
>> +     return 0;
>> +}
>
> This is where powerpc does n_added += n, and it delays the
> schedule_events() bit to hw_perf_enable() conditional on n_added > 0.
> When you remove events it simply keeps the current layout and disables
> the one.
>
Will look into this.

>> @@ -2123,8 +2245,8 @@ static int intel_pmu_init(void)
>>               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
>>                      sizeof(hw_cache_event_ids));
>>
>> -             pr_cont("Core2 events, ");
>>               event_constraints = intel_core_event_constraints;
>> +             pr_cont("Core2 events, ");
>>               break;
>>       default:
>>       case 26:
>
> Not that I object to the above change, but it seems out of place in this
> patch.
>
Will remove that.

2009-12-11 11:59:15

by Stephane Eranian

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

>
>>> @@ -1120,9 +1136,15 @@ static void amd_pmu_disable_all(void)
>>>
>>>  void hw_perf_disable(void)
>>>  {
>>> +     struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>>> +
>>>       if (!x86_pmu_initialized())
>>>               return;
>>> -     return x86_pmu.disable_all();
>>> +
>>> +     if (cpuc->enabled)
>>> +             cpuc->n_events = 0;
>>> +
>>> +     x86_pmu.disable_all();
>>>  }
>>
>> Right, so I cannot directly see the above being correct. You fully erase
>> the PMU state when we disable it, but things like single counter
>> add/remove doesn't reprogram the full PMU afaict.
>>

>>> +int hw_perf_group_sched_in(struct perf_event *leader,
>>> +            struct perf_cpu_context *cpuctx,
>>> +            struct perf_event_context *ctx, int cpu)
>>> +{
>>> +     struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
>>> +     int n, ret;
>>> +
>>> +     n = collect_events(cpuhw, leader);
>>> +     if (n < 0)
>>> +             return n;
>>> +
>>> +     ret = schedule_events(cpuhw, n, true);
>>> +     if (ret)
>>> +             return ret;
>>> +
>>> +     /* 0 means successful and enable each event in caller */
>>> +     return 0;
>>> +}
>>
>> This is where powerpc does n_added += n, and it delays the
>> schedule_events() bit to hw_perf_enable() conditional on n_added > 0.
>> When you remove events it simply keeps the current layout and disables
>> the one.
>>
There is a major difference between PPC and X86 here. PPC has a centralized
register to control start/stop. This register uses bitmask to enable or
disable counters. Thus, in hw_perf_enable(), if n_added=0, then you just
need to use the pre-computed bitmask. Otherwise, you need to recompute the
bitmask to include the new registers. The assignment of events and
validation is done in hw_group_sched_in().

In X86, assignment and validation is done in hw_group_sched_in(). Activation
is done individually for each counter. There is no centralized register used
here, thus no bitmask to update.

Disabling a counter does not trigger a complete reschedule of events. This
happens only when hw_group_sched_in() is called.

The n_events = 0 in hw_perf_disable() is used to signal that something is
changing. It should not be here but here. The problem is that
hw_group_sched_in() needs a way to know that it is called for a completely
new series of group scheduling so it can discard any previous assignment.
This goes back to the issue I raised in my previous email. You could add a
parameter to hw_group_sched_in() that would indicate this is the first
group. That would cause n_events = 0 and the function would start
accumulating events for the new scheduling period.
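
Concretely, the idea would be something along these lines (sketch; the
extra parameter is hypothetical):

int hw_perf_group_sched_in(struct perf_event *leader,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           int cpu, int first_group)
{
        struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
        int n;

        /* a new scheduling period starts: forget previous assignments */
        if (first_group)
                cpuhw->n_events = 0;

        n = collect_events(cpuhw, leader);
        if (n < 0)
                return n;

        return schedule_events(cpuhw, n, true);
}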

2009-12-11 14:52:45

by Stephane Eranian

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Fri, Dec 11, 2009 at 12:00 PM, stephane eranian
<[email protected]> wrote:
>>> --- a/arch/x86/kernel/cpu/perf_event.c
>>> +++ b/arch/x86/kernel/cpu/perf_event.c
>>
>>> @@ -68,6 +69,15 @@ struct debug_store {
>>>       u64     pebs_event_reset[MAX_PEBS_EVENTS];
>>>  };
>>>
>>> +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64))
>>
>> Do we need this, is it realistic to expect X86_PMC_IDX_MAX to be
>> anything else than 64?
>>
> The issue had to do with i386 mode where long are 32 bits  < 64. And in
> particular with the initializer .idxmsk[0] = V. In the future we may exceed
> 32 registers. That means the initializer would have to change. But I guess
> we have quite some ways before this case is reached. So I will revert all
> of this to unsigned long.
>
Well, in fact we have to use u64 because you are already using register
indexes > 32, e.g., for the Intel fixed counters which start at position 32.
Using unsigned long would make the static initializer uglier in this case.
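
To illustrate (sketch; the array names are only for the example): the Core2
INSTRUCTIONS_RETIRED constraint covers generic counters 0-1 plus fixed
counter 0 at bit 32, which a single 32-bit long cannot hold:

/* with u64 elements the whole constraint fits in one initializer */
u64 idxmsk64[BITS_TO_U64(X86_PMC_IDX_MAX)] = {
        0x3 | (1ULL << 32),
};

/* with 32-bit longs (i386) the fixed-counter bit spills into element 1 */
unsigned long idxmsk32[BITS_TO_LONGS(X86_PMC_IDX_MAX)] = {
        0x3,
        1UL << (X86_PMC_IDX_FIXED - 32),
};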

2009-12-21 15:41:17

by Peter Zijlstra

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Fri, 2009-12-11 at 12:59 +0100, stephane eranian wrote:

> There is a major difference between PPC and X86 here. PPC has a
> centralized register to control start/stop. This register uses
> bitmask to enable or disable counters. Thus, in hw_perf_enable(), if
> n_added=0, then you just need to use the pre-computed bitmask.
> Otherwise, you need to recompute the bitmask to include the new
> registers. The assignment of events and validation is done in
> hw_group_sched_in().
>
> In X86, assignment and validation is done in hw_group_sched_in().
> Activation is done individually for each counter. There is no
> centralized register used here, thus no bitmask to update.

intel core2 has the global control reg, but for all intents and purposes
the perf_enable/disable calls emulate this global enable/disable.
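
(Simplified sketch of the two enable paths for comparison; the global path
writes one control MSR, the per-counter path arms each evtsel individually:)

static void intel_pmu_enable_all(void)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
{
        wrmsrl(hwc->config_base + idx,
               hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}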

> Disabling a counter does not trigger a complete reschedule of events.
> This happens only when hw_group_sched_in() is called.
>
> The n_events = 0 in hw_perf_disable() is used to signal that something
> is changing. It should not be here but here. The problem is that
> hw_group_sched_in() needs a way to know that it is called for a
> completely new series of group scheduling so it can discard any
> previous assignment. This goes back to the issue I raised in my
> previous email. You could add a parameter to hw_group_sched_in() that
> would indicate this is the first group. that would cause n_events =0
> and the function would start accumulating events for the new
> scheduling period.

I'm not really seeing the problem here...


perf_disable() <-- shut down the full pmu

pmu->disable() <-- hey someone got removed (easy free the reg)
pmu->enable() <-- hey someone got added (harder, check constraints)

hw_perf_group_sched_in() <-- hey a full group got added
(better than multiple ->enable)

perf_enable() <-- re-enable pmu


So ->disable() is used to track freeing, ->enable is used to add
individual counters, check constraints etc..

hw_perf_group_sched_in() is used to optimize the full group enable.

Afaict that is what power does (Paul?) and that should I think be
sufficient to track x86 as well.

Since sched_in() is balanced with sched_out(), the ->disable() calls
should provide the required information as to the occupation of the pmu.
I don't see the need for more hooks.
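
In x86 terms that would be something like this (sketch, not the actual
code; error unwinding elided):

/* ->disable(): stop the counter and release the register, nothing more */
static void x86_pmu_disable(struct perf_event *event)
{
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;

        x86_pmu.disable(hwc, hwc->idx);
        clear_bit(hwc->idx, cpuc->used_mask);
        /* keep hwc->idx so a later ->enable() can try to reuse the slot */
}

/* ->enable(): add one event and re-check the constraints incrementally */
static int x86_pmu_enable(struct perf_event *event)
{
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        int n = cpuc->n_events;

        if (n >= X86_PMC_IDX_MAX)
                return -ENOSPC;

        cpuc->constraints[n] = x86_pmu.get_event_constraints(event);
        cpuc->event_list[n] = event;
        cpuc->n_events = n + 1;

        return schedule_events(cpuc, n + 1, true);
}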

Paul, could you comment, since you did all this for power?

2009-12-21 19:00:26

by Stephane Eranian

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

Hi,

[Repost because of HTML]

On Mon, Dec 21, 2009 at 4:40 PM, Peter Zijlstra <[email protected]> wrote:
>
> On Fri, 2009-12-11 at 12:59 +0100, stephane eranian wrote:
>
> > There is a major difference between PPC and X86 here. PPC has a
> > centralized register to control start/stop. This register  uses
> > bitmask to enable or disable counters. Thus, in hw_perf_enable(), if
> > n_added=0, then you just need to use the pre-computed bitmask.
> > Otherwise, you need to recompute the bitmask to include the new
> > registers. The assignment of events and validation is done in
> > hw_group_sched_in().
> >
> > In X86, assignment and validation is done in hw_group_sched_in().
> > Activation is done individually for each counter. There is no
> > centralized register used here, thus no bitmask to update.
>
> intel core2 has the global control reg, but for all intents and purposes
> the perf_enable/disable calls emulate this global enable/disable.
>
> > Disabling a counter does not trigger a complete reschedule of events.
> > This happens only when hw_group_sched_in() is called.
> >
> > The n_events = 0 in hw_perf_disable() is used to signal that something
> > is changing. It should not be here but here. The problem is that
> > hw_group_sched_in() needs a way to know that it is called for a
> > completely new series of group scheduling so it can discard any
> > previous assignment. This goes back to the issue I raised in my
> > previous email. You could add a parameter to hw_group_sched_in() that
> > would indicate this is the first group. that would cause n_events =0
> > and the function would start accumulating events for the new
> > scheduling period.
>
> I'm not really seeing the problem here...
>
>
>  perf_disable() <-- shut down the full pmu
>
>  pmu->disable() <-- hey someone got removed (easy free the reg)
>  pmu->enable()  <-- hey someone got added (harder, check constraints)
>
>  hw_perf_group_sched_in() <-- hey a full group got added
>                              (better than multiple ->enable)
>
>  perf_enable() <-- re-enable pmu
>
>
> So ->disable() is used to track freeing, ->enable is used to add
> individual counters, check constraints etc..
>
> hw_perf_group_sched_in() is used to optimize the full group enable.
>

Does that mean that after a disable() I can assume that there won't
be an enable() without a group_sched_in()?

I suspect not. In fact, there is a counter-example in perf_ctx_adjust_freq().

> Afaict that is what power does (Paul?) and that should I think be
> sufficient to track x86 as well.
>
> Since sched_in() is balanced with sched_out(), the ->disable() calls
> should provide the required information as to the occupation of the pmu.
> I don't see the need for more hooks.
>
> Paul, could you comment, since you did all this for power?
>

2009-12-21 19:32:12

by Peter Zijlstra

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Mon, 2009-12-21 at 20:00 +0100, Stephane Eranian wrote:

> > perf_disable() <-- shut down the full pmu
> >
> > pmu->disable() <-- hey someone got removed (easy free the reg)
> > pmu->enable() <-- hey someone got added (harder, check constraints)
> >
> > hw_perf_group_sched_in() <-- hey a full group got added
> > (better than multiple ->enable)
> >
> > perf_enable() <-- re-enable pmu
> >
> >
> > So ->disable() is used to track freeing, ->enable is used to add
> > individual counters, check constraints etc..
> >
> > hw_perf_group_sched_in() is used to optimize the full group enable.
> >
>
> Does that mean that after a disable() I can assume that there won't
> be an enable() without a group_sched_in()?
>
> I suspect not. In fact, there is a counter-example in perf_ctx_adjust_freq().

Why would that be a problem?

A perf_disable() will simply disable the pmu, but not alter its
configuration, perf_enable() will program the pmu and re-enable it (with
the obvious optimization that if the current state and the new state
match we can skip the actual programming).

If a ->disable() was observed in between, it will simply not re-enable that
particular counter; that is, it will remove that counter's config, and
perf_enable() can do a smart reprogram by simply not re-enabling that
counter.

If an ->enable() or hw_perf_group_sched_in() was observed in between,
you have to recompute the full state in order to validate the
availability, if that fails, no state change, if it succeeds you have a
new pmu state and perf_enable() will program that.

In the case of perf_ctx_adjust_freq():

if (!interrupts) {
perf_disable();
event->pmu->disable(event);
atomic64_set(&hwc->period_left, 0);
event->pmu->enable(event);
perf_enable();
}

You'll very likely end up with the same state you had before, but if not,
who cares -- it's supposed to be an unlikely case.

That is, the ->disable() will clear the cpu_hw_events::used_mask bit.
The ->enable() will want to place the counter on its last know reg,
which (not so very surprisingly) will be available, hence it will
trivially succeed, depending on its smarts it might or might not find
that the 'new' counter's config is identical to the current hw state, if
not it might set a cpu_hw_event::reprogram state.

perf_enable() will, when it sees cpu_hw_event::reprogram, re-write the
pmu config and re-enable the whole thing (global ctrl reg, or iterate
individual EN bits).


2009-12-21 20:59:51

by Stephane Eranian

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Mon, Dec 21, 2009 at 8:31 PM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2009-12-21 at 20:00 +0100, Stephane Eranian wrote:
>
>> >  perf_disable() <-- shut down the full pmu
>> >
>> >  pmu->disable() <-- hey someone got removed (easy free the reg)
>> >  pmu->enable()  <-- hey someone got added (harder, check constraints)
>> >
>> >  hw_perf_group_sched_in() <-- hey a full group got added
>> >                              (better than multiple ->enable)
>> >
>> >  perf_enable() <-- re-enable pmu
>> >
>> >
>> > So ->disable() is used to track freeing, ->enable is used to add
>> > individual counters, check constraints etc..
>> >
>> > hw_perf_group_sched_in() is used to optimize the full group enable.
>> >
>>
>> Does that mean that after a disable() I can assume that there won't
>> be an enable() without a group_sched_in()?
>>
>> I suspect not. In fact, there is a counter-example in perf_ctx_adjust_freq().
>
> Why would that be a problem?
>
> A perf_disable() will simply disable the pmu, but not alter its
> configuration, perf_enable() will program the pmu and re-enable it (with
> the obvious optimization that if the current state and the new state
> match we can skip the actual programming).
>
> If a ->disable() was observed between it will simply not re-enable that
> particular counter, that is it will remove that counters' config, and
> perf_enable() can do a smart reprogram by simply no re-enabling that
> counter.
>
> If an ->enable() or hw_perf_group_sched_in() was observed in between,
> you have to recompute the full state in order to validate the
> availability, if that fails, no state change, if it succeeds you have a
> new pmu state and perf_enable() will program that.
>

Ok, so what you are suggesting is that the assignment is actually done
incrementally in ->enable(). hw_group_sched_in() would simply validate
that a single group is sane (i.e., can be scheduled if it was alone).

> In the case of perf_ctx_adjust_freq():
>
>        if (!interrupts) {
>                perf_disable();
>                event->pmu->disable(event);
>                atomic64_set(&hwc->period_left, 0);
>                event->pmu->enable(event);
>                perf_enable();
>        }
>
> You'll very likely end up with the same state you had before, but if not
> who cares -- its supposed to be an unlikely case.
>
> That is, the ->disable() will clear the cpu_hw_events::used_mask bit.
> The ->enable() will want to place the counter on its last know reg,
> which (not so very surprisingly) will be available, hence it will

Not in the case I am looking at on AMD. The counter you need to grab
depends on what is going on on the other cores on the socket. So there is
no guarantee that you will get the same one. Something similar exists on
Nehalem but it has to do with HT.

> trivially succeed, depending on its smarts it might or might not find
> that the 'new' counter's config is identical to the current hw state, if
> not it might set a cpu_hw_event::reprogram state.
>
> perf_enable() will, when it sees cpu_hw_event::reprogram, re-write the
> pmu config and re-enable the whole thing (global ctrl reg, or iterate
> individual EN bits).

2009-12-21 21:19:38

by Peter Zijlstra

Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Mon, 2009-12-21 at 21:59 +0100, Stephane Eranian wrote:
> On Mon, Dec 21, 2009 at 8:31 PM, Peter Zijlstra <[email protected]> wrote:
> > On Mon, 2009-12-21 at 20:00 +0100, Stephane Eranian wrote:
> >
> >> > perf_disable() <-- shut down the full pmu
> >> >
> >> > pmu->disable() <-- hey someone got removed (easy free the reg)
> >> > pmu->enable() <-- hey someone got added (harder, check constraints)
> >> >
> >> > hw_perf_group_sched_in() <-- hey a full group got added
> >> > (better than multiple ->enable)
> >> >
> >> > perf_enable() <-- re-enable pmu
> >> >
> >> >
> >> > So ->disable() is used to track freeing, ->enable is used to add
> >> > individual counters, check constraints etc..
> >> >
> >> > hw_perf_group_sched_in() is used to optimize the full group enable.
> >> >
> >>
> >> Does that mean that after a disable() I can assume that there won't
> >> be an enable() without a group_sched_in()?
> >>
> >> I suspect not. In fact, there is a counter-example in perf_ctx_adjust_freq().
> >
> > Why would that be a problem?
> >
> > A perf_disable() will simply disable the pmu, but not alter its
> > configuration, perf_enable() will program the pmu and re-enable it (with
> > the obvious optimization that if the current state and the new state
> > match we can skip the actual programming).
> >
> > If a ->disable() was observed between it will simply not re-enable that
> > particular counter, that is it will remove that counters' config, and
> > perf_enable() can do a smart reprogram by simply no re-enabling that
> > counter.
> >
> > If an ->enable() or hw_perf_group_sched_in() was observed in between,
> > you have to recompute the full state in order to validate the
> > availability, if that fails, no state change, if it succeeds you have a
> > new pmu state and perf_enable() will program that.
> >
>
> Ok, so what you are suggesting is that the assignment is actually done
> incrementally in ->enable(). hw_group_sched_in() would simply validate
> that a single group is sane (i.e., can be scheduled if it was alone).

hw_perf_group_sched_in() can be used to do a whole group at once (see
how a !0 return value will short-circuit the whole incremental group
buildup), but yes ->enable() is incremental.

> > In the case of perf_ctx_adjust_freq():
> >
> > if (!interrupts) {
> > perf_disable();
> > event->pmu->disable(event);
> > atomic64_set(&hwc->period_left, 0);
> > event->pmu->enable(event);
> > perf_enable();
> > }
> >
> > You'll very likely end up with the same state you had before, but if not
> > who cares -- it's supposed to be an unlikely case.
> >
> > That is, the ->disable() will clear the cpu_hw_events::used_mask bit.
> > The ->enable() will want to place the counter on its last known reg,
> > which (not so very surprisingly) will be available, hence it will
>
> Not in the case I am looking at on AMD. The counter you need to grab
> depends on what is going on on the other cores on the socket. So
> there is no guarantee that you will get the same. Something similar
> exists on Nehalem but it has to do with HT.

Well the above code is assumed correct, so we need to make it true,
which should not be too hard to do.

If there are cross cpu constraints (AMD has per node constraints IIRC)
then we need a structure that describes these (a per node structure for
AMD, a per core structure for HT). If we take a lock on this structure
the first time we touch it in a perf_disable() region, and release it on
perf_enable(), then we ensure those constraints remain invariant.
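
Just to make that concrete, here is a rough sketch of such a structure. All
names below are invented for the example (this is not existing kernel code),
assuming an AMD-style per-node constraint set guarded by a lock that is taken
on first touch inside a perf_disable() region and dropped at perf_enable():

struct node_constraints {                       /* hypothetical, one per node */
        spinlock_t              lock;
        struct perf_event       *owner[X86_PMC_IDX_MAX]; /* owner of each shared counter */
};

struct cpu_hw_node {                            /* hypothetical per-cpu side */
        struct node_constraints *nc;            /* shared by all cores on the node */
        int                     locked;         /* taken inside perf_disable()? */
};

/* first touch inside a perf_disable() region */
static void node_constraints_get(struct cpu_hw_node *nh)
{
        if (!nh->locked) {
                spin_lock(&nh->nc->lock);
                nh->locked = 1;
        }
}

/* released from perf_enable(); the constraints were invariant in between */
static void node_constraints_put(struct cpu_hw_node *nh)
{
        if (nh->locked) {
                nh->locked = 0;
                spin_unlock(&nh->nc->lock);
        }
}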

With something like that in place the above code will still be valid,
since the act of removing the counter will freeze all needed state to
re-instate it.

Does that make sense?

2009-12-22 01:10:33

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Fri, Dec 11, 2009 at 12:59:16PM +0100, stephane eranian wrote:

> There is a major difference between PPC and X86 here. PPC has a centralized
> register to control start/stop. This register uses a bitmask to enable or
> disable counters. Thus, in hw_perf_enable(), if n_added=0, then you just
> need to use the pre-computed bitmask. Otherwise, you need to recompute the
> bitmask to include the new registers. The assignment of events and
> validation is done in hw_group_sched_in().

That's not entirely accurate. Yes there is a global start/stop bit,
but there isn't a bitmask to enable or disable counters. There is a
selector bitfield for each counter (except the limited-function
counters) and you can set the selector to the 'count nothing' value if
you don't want a particular counter to count.

Validation is done in hw_group_sched_in() but not the assignment of
events to counters. That's done in hw_perf_enable(), via the
model-specific ppmu->compute_mmcr() call.

> In X86, assignment and validation is done in hw_group_sched_in(). Activation
> is done individually for each counter. There is no centralized register used
> here, thus no bitmask to update.
>
> Disabling a counter does not trigger a complete reschedule of events. This
> happens only when hw_group_sched_in() is called.
>
> The n_events = 0 in hw_perf_disable() is used to signal that something is
> changing.
> It should not be here but here.

The meaning of "It should not be here but here" is quite unclear to me.

> The problem is that hw_group_sched_in() needs a way to know that it is
> called for a completely new series of group scheduling so it can discard
> any previous assignment. This goes back to the issue I raised in my
> previous email. You could add a parameter to hw_group_sched_in() that
> would indicate this is the first group. That would cause n_events = 0 and
> the function would start accumulating events for the new scheduling period.

I don't think hw_group_sched_in is ever called for a completely new
series of group scheduling. If you have per-cpu counters active, they
don't get scheduled out and in again with each task switch. So you
will tend to get a hw_pmu_disable call, then a series of disable calls
for the per-task events for the old task, then a series of
hw_group_sched_in calls for the per-task events for the new task, then
a hw_pmu_enable call.

On powerpc we maintain an array with pointers to all the currently
active events. That makes it easy to know at hw_pmu_enable() time
what events need to be put on the PMU. Also it means that at
hw_group_sched_in time you can look at the whole set of events,
including the ones just added, to see if it's feasible to put them all
on. At that point we just check feasibility, which is quite quick and
easy using the bit-vector encoding of constraints. The bit-vector
encoding lets us represent multiple constraints of various forms in
one pair of 64-bit values per event. We can express constraints such
as "you can have at most N events in a class X" or "you can't have
events in all of classes A, B, C and D" or "control register bitfield
X must be set to Y", and then check that a set of events satisfies all
the constraints with some simple integer arithmetic. I don't know
exactly what constraints you have on x86 but I would be surprised if
you couldn't handle them the same way.
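
For what it is worth, here is a toy version of that kind of check; the field
layout and class limits are made up for the example (this is not the powerpc
encoding). Each event contributes a value with a 1 in the field of every
constraint class it belongs to; the values are summed and each field of the
sum is compared against its class limit:

#define CLASS_A_SHIFT   0               /* say: at most 2 events of class A */
#define CLASS_B_SHIFT   4               /* say: at most 1 event of class B */
#define CLASS_FIELD     0xfULL          /* 4 bits per class field */

struct toy_constraint {
        u64     value;          /* 1 in each class field this event occupies */
};

static int toy_set_feasible(const struct toy_constraint *ev, int n)
{
        u64 sum = 0;
        int i;

        for (i = 0; i < n; i++)
                sum += ev[i].value;

        if (((sum >> CLASS_A_SHIFT) & CLASS_FIELD) > 2)
                return 0;
        if (((sum >> CLASS_B_SHIFT) & CLASS_FIELD) > 1)
                return 0;
        return 1;               /* all class limits respected */
}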

Paul.

2009-12-22 01:10:31

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Mon, Dec 21, 2009 at 04:40:40PM +0100, Peter Zijlstra wrote:

> I'm not really seeing the problem here...
>
>
> perf_disable() <-- shut down the full pmu
>
> pmu->disable() <-- hey someone got removed (easy free the reg)
> pmu->enable() <-- hey someone got added (harder, check constraints)
>
> hw_perf_group_sched_in() <-- hey a full group got added
> (better than multiple ->enable)
>
> perf_enable() <-- re-enable pmu
>
>
> So ->disable() is used to track freeing, ->enable is used to add
> individual counters, check constraints etc..
>
> hw_perf_group_sched_in() is used to optimize the full group enable.
>
> Afaict that is what power does (Paul?) and that should I think be
> sufficient to track x86 as well.

That sounds right to me.

> Since sched_in() is balanced with sched_out(), the ->disable() calls
> should provide the required information as to the occupation of the pmu.
> I don't see the need for more hooks.
>
> Paul, could you comment, since you did all this for power?

On powerpc we maintain a list of currently enabled events in the arch
code. Does x86 do that as well?

If you have the list (or array) of events easily accessible, it's
relatively easy to check whether the whole set is feasible at any
point, without worrying about which events were recently added. The
perf_event structure has a spot where the arch code can store which
PMU register is used for that event, so you can easily optimize the
case where the event doesn't move.
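
On x86 that spot is hw_perf_event::idx. A sketch of such a fast path (the
helper and the surrounding policy are invented here; only the fields come
from the existing structures) could look like:

static int try_reuse_last_counter(struct cpu_hw_events *cpuc,
                                  struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;

        /* previous counter still free: keep the same register */
        if (hwc->idx >= 0 && !test_bit(hwc->idx, cpuc->used_mask)) {
                __set_bit(hwc->idx, cpuc->used_mask);
                cpuc->events[hwc->idx] = event;
                return hwc->idx;
        }
        return -1;              /* fall back to full scheduling */
}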

Like you, I'm not seeing where the difficulty lies. Perhaps Stephane
could give us a detailed example if he still thinks there's a
difficulty.

Paul.

2009-12-22 01:10:35

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Mon, Dec 21, 2009 at 09:59:45PM +0100, Stephane Eranian wrote:

> Ok, so what you are suggesting is that the assignment is actually done
> incrementally in ->enable(). hw_group_sched_in() would simply validate
> that a single group is sane (i.e., can be scheduled if it was alone).

No, hw_group_sched_in needs to validate that this group can go on
along with everything else that has already been enabled. But as I
have said, if you have the complete list of enabled events to hand,
that's not hard.

On the other hand, hw_perf_event_init does need to validate that a
single group is sane by itself.

Paul.

2009-12-29 14:47:43

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

Paul,


So if I understand what both of you are saying, it seems that
event scheduling has to take place in the pmu->enable() callback
which is per-event.

In the case of X86, you can choose to do best-effort scheduling, i.e., only
assign the new event if there is a compatible free counter. That would be
incremental.

But the better solution would be to re-examine the whole situation and
potentially move existing enabled events around to free a counter if the new
event is more constrained. That would require stopping the PMU, rewriting the
config and data registers, and re-enabling the PMU. This latter solution is
the only way to avoid ordering side effects, i.e., having the assignment of
events to counters depend on the order in which events are created (or
enabled).

The register can be considered freed by pmu->disable() if scheduling takes
place in pmu->enable().

From what Paul was saying about hw_perf_group_sched_in(), it seems like this
function can be used to check if a new group is compatible with the existing
enabled events. Compatible means that there is a possible assignment of
events to counters.

As for the n_added logic, it seems like perf_disable() resets n_added to zero.
n_added is incremented in pmu->enable(), i.e., when one event is added, or in
hw_perf_group_sched_in(), i.e., when a whole group is added. Scheduling is
based on n_events. The point of n_added is to determine whether something
needs to be done, i.e., event scheduling, if an event or group was added
between perf_disable() and perf_enable(). In pmu->disable(), the list of
enabled events is compacted and n_events is decremented.

Did I get this right?

All the enable and disable calls can be called from NMI interrupt context
and thus must be very careful with locks.



--
Stephane Eranian | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00

2010-01-07 04:13:53

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

Stephane,

> So if I understand what both of you are saying, it seems that
> event scheduling has to take place in the pmu->enable() callback
> which is per-event.

How much you have to do in the pmu->enable() function depends on
whether the PMU is already stopped (via a call to hw_perf_disable()) or
not. If it is already stopped you don't have to do the full event
scheduling computation, you only have to do enough to work out if the
current set of events plus the event being enabled is feasible, and
record the event for later. In other words, you can defer the actual
scheduling until hw_perf_enable() time. Most calls to the
pmu->enable() function are with the PMU already stopped, so it's a
worthwhile optimization.
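
In sketch form (n_added is the counter discussed in this thread, and
schedule_is_feasible() is a made-up placeholder for the constraint check;
this is not the actual code):

/* called with the PMU already stopped by perf_disable() */
static int sketch_pmu_enable(struct cpu_hw_events *cpuc,
                             struct perf_event *event)
{
        if (cpuc->n_events >= X86_PMC_IDX_MAX)
                return -ENOSPC;

        cpuc->event_list[cpuc->n_events] = event;

        /* feasibility of old set + new event, no counter assignment yet */
        if (!schedule_is_feasible(cpuc->event_list, cpuc->n_events + 1))
                return -ENOSPC;

        cpuc->n_events++;
        cpuc->n_added++;        /* real assignment deferred to hw_perf_enable() */
        return 0;
}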

> In the case of X86, you can choose to do best-effort scheduling, i.e., only
> assign the new event if there is a compatible free counter. That would be
> incremental.
>
> But the better solution would be to re-examine the whole situation and
> potentially move existing enabled events around to free a counter if the new
> event is more constrained. That would require stopping the PMU, rewriting the
> config and data registers, and re-enabling the PMU. This latter solution is
> the only way to avoid ordering side effects, i.e., having the assignment of
> events to counters depend on the order in which events are created (or
> enabled).

On powerpc, the pmu->enable() function always stops the PMU if it
wasn't already stopped. That simplifies the code a lot because it
means that I can do all the actual event scheduling (i.e. deciding on
which event goes on which counter and working out PMU control register
values) in hw_perf_enable().

> The register can be considered freed by pmu->disable() if scheduling takes
> place in pmu->enable().
>
> From what Paul was saying about hw_perf_group_sched_in(), it seems like this
> function can be used to check if a new group is compatible with the existing
> enabled events. Compatible means that there is a possible assignment of
> events to counters.

The hw_perf_group_sched_in() function is an optimization so I can add
all the counters in the group to the set under consideration and then
do just one feasibility check, rather than adding each counter in
the group in turn and doing the feasibility check for each one.
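
Roughly, in the x86 terms used elsewhere in this thread, and with the same
made-up schedule_is_feasible() placeholder as above (error handling kept
minimal):

static int sketch_group_sched_in(struct cpu_hw_events *cpuc,
                                 struct perf_event *leader)
{
        struct perf_event *sub;
        int n0 = cpuc->n_events;

        if (cpuc->n_events >= X86_PMC_IDX_MAX)
                return -EAGAIN;

        /* collect the leader and all of its siblings first ... */
        cpuc->event_list[cpuc->n_events++] = leader;
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                if (cpuc->n_events >= X86_PMC_IDX_MAX)
                        goto undo;
                cpuc->event_list[cpuc->n_events++] = sub;
        }

        /* ... then do a single feasibility check for the whole group */
        if (!schedule_is_feasible(cpuc->event_list, cpuc->n_events))
                goto undo;
        return 0;

undo:
        cpuc->n_events = n0;    /* roll back, no state change */
        return -EAGAIN;
}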

> As for the n_added logic, it seems like perf_disable() resets n_added to zero.
> n_added is incremented in pmu->enable(), i.e., when one event is added, or in
> hw_perf_group_sched_in(), i.e., when a whole group is added. Scheduling is
> based on n_events. The point of n_added is to determine whether something
> needs to be done, i.e., event scheduling, if an event or group was added
> between perf_disable() and perf_enable(). In pmu->disable(), the list of
> enabled events is compacted and n_events is decremented.
>
> Did I get this right?

The main point of n_added was so that hw_perf_enable() could know
whether the current set of events is a subset of the last set. If it
is a subset, the scheduling decisions are much simpler.
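
In other words, roughly (the helper names are invented):

static void sketch_hw_perf_enable(struct cpu_hw_events *cpuc)
{
        if (!cpuc->n_added) {
                /*
                 * Nothing was added since perf_disable(): the previous
                 * assignment of events to counters is still valid.
                 */
                reenable_existing(cpuc);
        } else {
                /* events or groups were added: redo the full assignment */
                reschedule_all_events(cpuc);
                cpuc->n_added = 0;
        }
        /* then set the global enable bit (or the individual EN bits) */
}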

> All the enable and disable calls can be called from NMI interrupt context
> and thus must be very careful with locks.

I didn't think the pmu->enable() and pmu->disable() functions could be
called from NMI context. Also, I would expect that if hw_perf_enable
and hw_perf_disable are called from NMI context, that the calls would
be balanced. That simplifies things a lot.

Paul.

2010-01-07 08:55:30

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Thu, 2010-01-07 at 15:13 +1100, Paul Mackerras wrote:
>
> On powerpc, the pmu->enable() function always stops the PMU if it
> wasn't already stopped.

I just looked at the generic code and it looks like kernel/perf_event.c
always calls perf_disable() before calling ->enable().


2010-01-07 09:01:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Thu, 2010-01-07 at 15:13 +1100, Paul Mackerras wrote:
>
> > All the enable and disable calls can be called from NMI interrupt context
> > and thus must be very careful with locks.
>
> I didn't think the pmu->enable() and pmu->disable() functions could be
> called from NMI context.

I don't think they're called from NMI context either, most certainly not
from the generic code.

The x86 calls the raw disable from nmi to throttle the counter, but all
that (should) do is disable that counter, which is limited to a single
msr write. After that it schedules a full disable by sending a self-ipi.
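
Something along these lines; this is a sketch, not the literal kernel code,
assuming the existing ARCH_PERFMON_EVENTSEL0_ENABLE bit and the rdmsrl()/
wrmsrl() accessors:

/* called from the NMI handler: one MSR write, nothing else */
static void sketch_nmi_throttle(struct cpu_hw_events *cpuc, int idx)
{
        struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
        u64 val;

        rdmsrl(hwc->config_base + idx, val);
        val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;  /* clear just this counter's EN bit */
        wrmsrl(hwc->config_base + idx, val);
        /*
         * The heavyweight perf_disable()/reschedule happens later, from the
         * self-IPI, in normal interrupt context.
         */
}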


2010-01-07 09:54:37

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

Hi,

Ok, so I made some progress yesterday on all of this.

The key elements are:
- pmu->enable() is always called from generic with PMU disabled
- pmu->disable() is called with PMU possibly enabled
- hw_perf_group_sched_in() is always called with PMU disabled

I got the n_added logic working now on X86.

I noticed the difference in pmu->enable() between Power and X86.
On PPC, you disable the whole PMU. On X86, that's not the case.

Now, I do the scheduling in hw_perf_enable(). Just like on PPC, I also
move events around if their register assignment has changed. It is not
quite working yet. I must have something wrong with the read and rewrite
code.

I will experiment with pmu->enable(). Given the key elements above, I think
Paul is right, all scheduling can be deferred until hw_perf_enable().

But there is a catch. I noticed that hw_perf_enable() is void. In other
words, it means that if scheduling fails, you won't notice. This is not a
problem on PPC but will be on AMD64. That's because the scheduling depends on
what goes on on the other cores on the socket. In other words, things can
change between pmu->enable()/hw_perf_group_sched_in() and hw_perf_enable(),
unless we lock something down in between.


--
Stephane Eranian | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00

2010-01-07 10:01:56

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf_events: improve Intel event scheduling

On Thu, 2010-01-07 at 10:54 +0100, Stephane Eranian wrote:
>
> Ok, so I made some progress yesterday on all of this.
>
> The key elements are:
> - pmu->enable() is always called from generic with PMU disabled
> - pmu->disable() is called with PMU possibly enabled
> - hw_perf_group_sched_in() is always called with PMU disabled
>
> I got the n_added logic working now on X86.
>
> I noticed the difference in pmu->enable() between Power and X86.
> On PPC, you disable the whole PMU. On X86, that's not the case.
>
> Now, I do the scheduling in hw_perf_enable(). Just like on PPC, I also
> move events around if their register assignment has changed. It is not
> quite working yet. I must have something wrong with the read and rewrite
> code.
>
> I will experiment with pmu->enable(). Given the key elements above, I think
> Paul is right, all scheduling can be deferred until hw_perf_enable().
>
> But there is a catch. I noticed that hw_perf_enable() is void. In other
> words, it means that if scheduling fails, you won't notice. This is not a
> problem on PPC but will be on AMD64. That's because the scheduling depends on
> what goes on on the other cores on the socket. In other words, things can
> change between pmu->enable()/hw_perf_group_sched_in() and hw_perf_enable(),
> unless we lock something down in between.

You have to lock stuff, you can't fail hw_perf_enable() because at that
point we've lost all track of what failed.