2010-11-21 12:01:31

by Lin Ming

Subject: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

For the background of Nehalem uncore pmu, see Intel SDM Volume 3B
"30.6.2 Performance Monitoring Facility in the Uncore"

1. data structure

struct node_hw_events {
	struct perf_event *events[X86_PMC_IDX_MAX];
	int n_events;
	struct spinlock lock;
};

struct node_hw_events is the per-node structure.
"lock" protects adding/deleting events to the uncore pmu.

TODO: this needs to be converted to per-socket, since "uncore" refers to
subsystems in the physical processor package that are shared by multiple
processor cores.

2. Uncore pmu NMI handling

All 4 cores are programmed to receive the uncore counter overflow
interrupt. The NMI handler (running on 1 of the 4 cores) handles all
counters enabled by all 4 cores.

Signed-off-by: Lin Ming <[email protected]>
---
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/include/asm/perf_event.h | 5 +
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/perf_event.c | 6 +-
arch/x86/kernel/cpu/perf_event_intel_uncore.c | 475 +++++++++++++++++++++++++
arch/x86/kernel/cpu/perf_event_intel_uncore.h | 87 +++++
include/linux/perf_event.h | 1 +
7 files changed, 571 insertions(+), 5 deletions(-)
create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.c
create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.h

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6b89f5e..a1cc40b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,7 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_ENABLE_UNCORE_PMI (1UL << 13)

#define MSR_IA32_MC0_CTL 0x00000400
#define MSR_IA32_MC0_STATUS 0x00000401
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b..9572166 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -75,6 +75,10 @@ union cpuid10_edx {
unsigned int full;
};

+struct pmu_nmi_state {
+ unsigned int marked;
+ int handled;
+};

/*
* Fixed-purpose performance events:
@@ -127,6 +131,7 @@ union cpuid10_edx {
#ifdef CONFIG_PERF_EVENTS
extern void init_hw_perf_events(void);
extern void perf_events_lapic_init(void);
+extern void init_uncore_pmu(void);

#define PERF_EVENT_INDEX_OFFSET 0

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe4..db4bf99 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o

obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event_intel_uncore.o

obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cb16b9c..81517bc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1196,11 +1196,6 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}

-struct pmu_nmi_state {
- unsigned int marked;
- int handled;
-};
-
static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);

static int __kprobes
@@ -1348,6 +1343,7 @@ void __init init_hw_perf_events(void)

switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
+ init_uncore_pmu();
err = intel_pmu_init();
break;
case X86_VENDOR_AMD:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 0000000..908643d
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,475 @@
+#include "perf_event_intel_uncore.h"
+
+static struct node_hw_events *uncore_events[MAX_NUMNODES];
+static bool uncore_pmu_initialized;
+static atomic_t active_uncore_events;
+
+static void uncore_pmu_enable_fixed_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, MSR_UNCORE_FIXED_EN | MSR_UNCORE_FIXED_PMI);
+}
+
+static void uncore_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx == UNCORE_FIXED_EVENT_IDX)
+ uncore_pmu_enable_fixed_event(event);
+ else
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config | UNCORE_EVENTSEL_ENABLE);
+}
+
+static void uncore_pmu_disable_fixed_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, 0);
+}
+
+static void uncore_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx == UNCORE_FIXED_EVENT_IDX)
+ uncore_pmu_disable_fixed_event(event);
+ else
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+}
+
+static void uncore_pmu_enable_all(void)
+{
+ u64 ctrl;
+
+ /*
+ * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
+ * but we don't know which core will receive the NMI when overflow happens
+ */
+ ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | (0xFULL << 48);
+ ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
+
+ /*
+ * Freeze the uncore pmu on overflow of any uncore counter.
+ * This makes uncore NMI handling easier.
+ */
+ ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ;
+
+ wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
+}
+
+static void uncore_pmu_disable_all(void)
+{
+ wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void uncore_perf_event_destroy(struct perf_event *event)
+{
+ atomic_dec(&active_uncore_events);
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!uncore_pmu_initialized)
+ return -ENOENT;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_UNCORE:
+ /*
+ * The uncore PMU measures at all privilege levels all the time.
+ * So it doesn't make sense to specify any exclude bits.
+ */
+ if (event->attr.exclude_user || event->attr.exclude_kernel
+ || event->attr.exclude_hv || event->attr.exclude_idle)
+ return -ENOENT;
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ atomic_inc(&active_uncore_events);
+
+ event->destroy = uncore_perf_event_destroy;
+
+ hwc->idx = -1;
+ hwc->config = (event->attr.config & UNCORE_RAW_EVENT_MASK) | UNCORE_EVENTSEL_PMI;
+ if ((hwc->config & UNCORE_EVENTSEL_EVENT) == UNCORE_FIXED_EVENT) {
+ hwc->config_base = MSR_UNCORE_FIXED_CTR_CTRL;
+ hwc->event_base = MSR_UNCORE_FIXED_CTR0;
+ } else {
+ hwc->config_base = MSR_UNCORE_PERFEVTSEL0;
+ hwc->event_base = MSR_UNCORE_PMC0;
+ }
+
+ return 0;
+}
+
+static int
+uncore_perf_event_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ u64 max_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
+ int ret = 0, idx = hwc->idx;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (left > max_period)
+ left = max_period;
+
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extract future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
+
+ if (idx == UNCORE_FIXED_EVENT_IDX)
+ idx = 0;
+ wrmsrl(hwc->event_base + idx, (u64)(-left) & max_period);
+
+ perf_event_update_userpage(event);
+
+ return ret;
+}
+
+static void uncore_pmu_start(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_RELOAD)
+ uncore_perf_event_set_period(event);
+
+ uncore_pmu_enable_event(event);
+
+ perf_event_update_userpage(event);
+}
+
+static void uncore_pmu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ uncore_pmu_disable_event(event);
+
+ if (flags & PERF_EF_UPDATE) {
+ if (idx == UNCORE_FIXED_EVENT_IDX)
+ hwc->idx = 0;
+ x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+ hwc->idx = idx;
+ }
+}
+
+static int uncore_pmu_add(struct perf_event *event, int flags)
+{
+ int node = numa_node_id();
+ int ret = 1;
+ int i = 0, fixed = 0;
+
+ spin_lock(&uncore_events[node]->lock);
+
+ if ((event->attr.config & UNCORE_EVENTSEL_EVENT) == UNCORE_FIXED_EVENT) {
+ i = UNCORE_FIXED_EVENT_IDX;
+ fixed = 1;
+ }
+ for (; i < X86_PMC_IDX_MAX; i++) {
+ if (!fixed && i == UNCORE_NUM_GENERAL_COUNTERS)
+ break;
+ if (!uncore_events[node]->events[i]) {
+ uncore_events[node]->events[i] = event;
+ uncore_events[node]->n_events++;
+
+ event->hw.idx = i;
+ if (flags & PERF_EF_START)
+ uncore_pmu_start(event, PERF_EF_RELOAD);
+ ret = 0;
+ break;
+ }
+
+ if (i == UNCORE_FIXED_EVENT_IDX)
+ break;
+ }
+
+ if (uncore_events[node]->n_events == 1)
+ uncore_pmu_enable_all();
+
+ spin_unlock(&uncore_events[node]->lock);
+
+ return ret;
+}
+
+static void uncore_pmu_del(struct perf_event *event, int flags)
+{
+ int node = numa_node_id();
+ struct hw_perf_event *hwc = &event->hw;
+ int i;
+
+ spin_lock(&uncore_events[node]->lock);
+
+ for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+ if (uncore_events[node]->events[i] == event) {
+ uncore_events[node]->events[hwc->idx] = NULL;
+ uncore_events[node]->n_events--;
+
+ uncore_pmu_stop(event, PERF_EF_UPDATE);
+ break;
+ }
+ }
+
+ if (uncore_events[node]->n_events == 0)
+ uncore_pmu_disable_all();
+
+ spin_unlock(&uncore_events[node]->lock);
+}
+
+static void uncore_pmu_read(struct perf_event *event)
+{
+ x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+}
+
+static struct pmu uncore_pmu = {
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_add,
+ .del = uncore_pmu_del,
+ .start = uncore_pmu_start,
+ .stop = uncore_pmu_stop,
+ .read = uncore_pmu_read,
+};
+
+
+static inline u64 uncore_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_UNCORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void uncore_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static int uncore_pmu_save_and_restart(struct perf_event *event)
+{
+ x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+ return uncore_perf_event_set_period(event);
+}
+
+static int uncore_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct node_hw_events *uncore_node;
+ int node;
+ int bit;
+ u64 status;
+ int handled = 0;
+
+ /*
+ * Don't need to disable uncore PMU globally,
+ * because it's set to freeze on any overflow.
+ * See uncore_pmu_enable_all
+ */
+
+ perf_sample_data_init(&data, 0);
+
+ node = numa_node_id();
+ uncore_node = uncore_events[node];
+
+ status = uncore_pmu_get_status();
+ if (!status) {
+ uncore_pmu_enable_all();
+ return 1;
+ }
+
+again:
+ uncore_pmu_ack_status(status);
+
+ status &= ~(MSR_UNCORE_PERF_GLOBAL_STATUS_OVF_PMI |
+ MSR_UNCORE_PERF_GLOBAL_STATUS_CHG);
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = uncore_node->events[bit];
+
+ handled++;
+
+ if (!uncore_pmu_save_and_restart(event))
+ continue;
+
+ data.period = event->hw.last_period;
+
+ if (perf_event_overflow(event, 1, &data, regs))
+ uncore_pmu_stop(event, 0);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = uncore_pmu_get_status();
+ if (status)
+ goto again;
+
+ uncore_pmu_enable_all();
+ return handled;
+}
+
+/* Copy from perf_event_nmi_handler */
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_uncore_nmi);
+
+static int __kprobes
+perf_event_uncore_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ unsigned int this_nmi;
+ int handled;
+
+ if (!atomic_read(&active_uncore_events))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+ case DIE_NMIUNKNOWN:
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if (this_nmi != __get_cpu_var(pmu_uncore_nmi).marked)
+ /* let the kernel handle the unknown nmi */
+ return NOTIFY_DONE;
+ /*
+ * This one is a PMU back-to-back nmi. Two events
+ * trigger 'simultaneously' raising two back-to-back
+ * NMIs. If the first NMI handles both, the latter
+ * will be empty and daze the CPU. So, we drop it to
+ * avoid false-positive 'unknown nmi' messages.
+ */
+ return NOTIFY_STOP;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ handled = uncore_pmu_handle_irq(args->regs);
+ if (!handled)
+ return NOTIFY_DONE;
+
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if ((handled > 1) ||
+ /* the next nmi could be a back-to-back nmi */
+ ((__get_cpu_var(pmu_uncore_nmi).marked == this_nmi) &&
+ (__get_cpu_var(pmu_uncore_nmi).handled > 1))) {
+ /*
+ * We could have two subsequent back-to-back nmis: The
+ * first handles more than one counter, the 2nd
+ * handles only one counter and the 3rd handles no
+ * counter.
+ *
+ * This is the 2nd nmi because the previous was
+ * handling more than one counter. We will mark the
+ * next (3rd) and then drop it if unhandled.
+ */
+ __get_cpu_var(pmu_uncore_nmi).marked = this_nmi + 1;
+ __get_cpu_var(pmu_uncore_nmi).handled = handled;
+ }
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_event_uncore_nmi_notifier = {
+ .notifier_call = perf_event_uncore_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+void __init init_uncore_pmu(void)
+{
+ union cpuid01_eax eax;
+ unsigned int unused;
+ unsigned int model;
+ int i, node;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cpuid(1, &eax.full, &unused, &unused, &unused);
+
+ /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
+ model = eax.split.model | (eax.split.ext_model << 4);
+ if (eax.split.family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
+ return;
+
+ pr_cont("Nehalem uncore pmu, \n");
+
+ for_each_node(node) {
+ uncore_events[node] = kmalloc_node(sizeof(struct node_hw_events),
+ GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!uncore_events[node]))
+ goto fail;
+
+ spin_lock_init(&uncore_events[node]->lock);
+ }
+
+ perf_pmu_register(&uncore_pmu);
+ register_die_notifier(&perf_event_uncore_nmi_notifier);
+ uncore_pmu_initialized = true;
+ return;
+
+fail:
+ for (i = 0; i < node; i++)
+ kfree(uncore_events[i]);
+}
+
+static void uncore_setup_per_cpu(void *info)
+{
+ u64 val;
+
+ /*
+ * PMI delivery due to an uncore counter overflow is enabled by
+ * setting IA32_DEBUG_CTL.Offcore_PMI_EN to 1.
+ */
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __init uncore_per_cpu_init(void)
+{
+ int cpu;
+
+ if (!uncore_pmu_initialized)
+ return 0;
+
+ /*
+ * Set up each logical cpu so it can receive uncore pmu interrupts.
+ */
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, uncore_setup_per_cpu, NULL, 1);
+
+ return 0;
+}
+__initcall(uncore_per_cpu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 0000000..e616c7b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,87 @@
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/compat.h>
+
+#define MSR_UNCORE_PERF_GLOBAL_CTRL 0x391
+#define MSR_UNCORE_PERF_GLOBAL_STATUS 0x392
+#define MSR_UNCORE_PERF_GLOBAL_OVF_CTRL 0x393
+#define MSR_UNCORE_FIXED_CTR0 0x394
+#define MSR_UNCORE_FIXED_CTR_CTRL 0x395
+#define MSR_UNCORE_ADDR_OPCODE_MATCH 0x396
+
+#define MSR_UNCORE_PMC0 0x3b0
+
+#define MSR_UNCORE_PERFEVTSEL0 0x3c0
+
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0 (1ULL << 32)
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_CORE0 (1ULL << 48)
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ (1ULL << 63)
+
+#define MSR_UNCORE_PERF_GLOBAL_STATUS_OVF_PMI (1ULL << 61)
+#define MSR_UNCORE_PERF_GLOBAL_STATUS_CHG (1ULL << 63)
+
+#define MSR_UNCORE_FIXED_EN (1ULL << 0)
+#define MSR_UNCORE_FIXED_PMI (1ULL << 2)
+
+#define UNCORE_EVENTSEL_EVENT 0x000000FFULL
+#define UNCORE_EVENTSEL_UMASK 0x0000FF00ULL
+#define UNCORE_EVENTSEL_OCC_CTR_RST (1ULL << 17)
+#define UNCORE_EVENTSEL_EDGE (1ULL << 18)
+#define UNCORE_EVENTSEL_PMI (1ULL << 20)
+#define UNCORE_EVENTSEL_ENABLE (1ULL << 22)
+#define UNCORE_EVENTSEL_INV (1ULL << 23)
+#define UNCORE_EVENTSEL_CMASK 0xFF000000ULL
+
+#define UNCORE_RAW_EVENT_MASK \
+ (UNCORE_EVENTSEL_EVENT | \
+ UNCORE_EVENTSEL_UMASK | \
+ UNCORE_EVENTSEL_EDGE | \
+ UNCORE_EVENTSEL_INV | \
+ UNCORE_EVENTSEL_CMASK)
+
+#define UNCORE_CNTVAL_BITS 48
+
+/* 8 general purpose counters + 1 fixed-function counter */
+#define UNCORE_NUM_GENERAL_COUNTERS 8
+#define UNCORE_NUM_FIXED_COUNTERS 1
+#define UNCORE_NUM_COUNTERS (UNCORE_NUM_GENERAL_COUNTERS + UNCORE_NUM_FIXED_COUNTERS)
+
+/* TBD: fix event config value passed by userspace */
+#define UNCORE_FIXED_EVENT 0xFF
+#define UNCORE_FIXED_EVENT_IDX 32
+
+union cpuid01_eax {
+ struct {
+ unsigned int stepping:4;
+ unsigned int model:4;
+ unsigned int family:4;
+ unsigned int type:2;
+ unsigned int reserve:2;
+ unsigned int ext_model:4;
+ unsigned int ext_family:4;
+ } split;
+ unsigned int full;
+};
+
+struct node_hw_events {
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
+ int n_events;
+ struct spinlock lock;
+};
+
+extern u64 x86_perf_event_update(struct perf_event *event, int cntval_bits);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 40150f3..7acc40c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_BREAKPOINT = 5,
+ PERF_TYPE_UNCORE = 6,

PERF_TYPE_MAX, /* non-ABI */
};
--
1.7.2.3
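For reference, a hedged userspace sketch of how an event of the new type
might be opened once this RFC is applied. PERF_TYPE_UNCORE (= 6) and the
event|umask layout come from the patch above; the event code 0x2c, the
umask 0x07 and the helper name are purely illustrative:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

#define PERF_TYPE_UNCORE	6	/* added by this patch */

/* Open a counting uncore event, system-wide on one cpu of the target package */
static int open_uncore_counter(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_UNCORE;
	/* raw config: event | (umask << 8), matching UNCORE_EVENTSEL_* */
	attr.config = 0x2c | (0x07 << 8);

	/* pid = -1: count on the given cpu regardless of which task runs */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}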






2010-11-21 12:46:23

by Andi Kleen

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu


>
> 2. Uncore pmu NMI handling
>
> All the 4 cores are programmed to receive uncore counter overflow
> interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> counters enabled by all 4 cores.

Really for uncore monitoring there is no need to use an NMI handler.
You can't profile a core anyways, so you can just delay the reporting
a little bit. It may simplify the code to not use one here
and just use an ordinary handler.

In general, since there is already much trouble with overloaded
NMI events, avoiding new NMIs is a good idea.



> +
> +static struct node_hw_events *uncore_events[MAX_NUMNODES];

Don't declare static arrays with MAX_NUMNODES, that number can be
very large and cause unnecessary bloat. Better use per CPU data or similar
(e.g. with alloc_percpu)

> + /*
> + * The hw event starts counting from this event offset,
> + * mark it to be able to extra future deltas:
> + */
> + local64_set(&hwc->prev_count, (u64)-left);

Your use of local* seems dubious. That is only valid if it's really
all on the same CPU. Is that really true?

> +static int uncore_pmu_add(struct perf_event *event, int flags)
> +{
> + int node = numa_node_id();

this should still be the package id

> + /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
> + model = eax.split.model | (eax.split.ext_model << 4);
> + if (eax.split.family != 6 || (model != 0x1A && model != 0x1E && model !=
> 0x1F))
> + return;

You can just get that from boot_cpu_data, no need to call cpuid
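Something like this untested sketch would do it (the helper name is made
up); boot_cpu_data.x86_model already has the extended model folded in:

#include <linux/types.h>
#include <linux/init.h>
#include <asm/processor.h>

static bool __init is_nehalem_uncore_cpu(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return false;

	/* CPUID signatures 06_1AH, 06_1EH, 06_1FH */
	if (boot_cpu_data.x86 != 6)
		return false;

	return boot_cpu_data.x86_model == 0x1A ||
	       boot_cpu_data.x86_model == 0x1E ||
	       boot_cpu_data.x86_model == 0x1F;
}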

> +#include <linux/perf_event.h>
> +#include <linux/capability.h>
> +#include <linux/notifier.h>
> +#include <linux/hardirq.h>
> +#include <linux/kprobes.h>
> +#include <linux/module.h>
> +#include <linux/kdebug.h>
> +#include <linux/sched.h>
> +#include <linux/uaccess.h>
> +#include <linux/slab.h>
> +#include <linux/highmem.h>
> +#include <linux/cpu.h>
> +#include <linux/bitops.h>

Do you really need all these includes?


-Andi

2010-11-21 14:04:29

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> >
> > 2. Uncore pmu NMI handling
> >
> > All the 4 cores are programmed to receive uncore counter overflow
> > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> > counters enabled by all 4 cores.
>
> Really for uncore monitoring there is no need to use an NMI handler.
> You can't profile a core anyways, so you can just delay the reporting
> a little bit. It may simplify the code to not use one here
> and just use an ordinary handler.

OK, I can use an ordinary interrupt handler here.

>
> In general since there is already much trouble with overloaded
> NMI events avoiding new NMIs is a good idea.
>
>
>
> > +
> > +static struct node_hw_events *uncore_events[MAX_NUMNODES];
>
> Don't declare static arrays with MAX_NUMNODES, that number can be
> very large and cause unnecessary bloat. Better use per CPU data or similar
> (e.g. with alloc_percpu)

What I really need is per-physical-cpu data here; is alloc_percpu enough?

>
> > + /*
> > + * The hw event starts counting from this event offset,
> > + * mark it to be able to extra future deltas:
> > + */
> > + local64_set(&hwc->prev_count, (u64)-left);
>
> Your use of local* seems dubious. That is only valid if it's really
> all on the same CPU. Is that really true?

Good catch! That is not true.

The interrupt handler runs on one core and the
data (hwc->prev_count) may be on another core.

Any idea how to set this cross-core data?

>
> > +static int uncore_pmu_add(struct perf_event *event, int flags)
> > +{
> > + int node = numa_node_id();
>
> this should be still package id

Understand, this is in my TODO.

>
> > + /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
> > + model = eax.split.model | (eax.split.ext_model << 4);
> > + if (eax.split.family != 6 || (model != 0x1A && model != 0x1E && model !=
> > 0x1F))
> > + return;
>
> You can just get that from boot_cpu_data, no need to call cpuid

Nice, will use it.

>
> > +#include <linux/perf_event.h>
> > +#include <linux/capability.h>
> > +#include <linux/notifier.h>
> > +#include <linux/hardirq.h>
> > +#include <linux/kprobes.h>
> > +#include <linux/module.h>
> > +#include <linux/kdebug.h>
> > +#include <linux/sched.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/slab.h>
> > +#include <linux/highmem.h>
> > +#include <linux/cpu.h>
> > +#include <linux/bitops.h>
>
> Do you really need all these includes?

Only

#include <linux/perf_event.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/slab.h>

are needed.

Thanks for the comments.
Lin Ming

2010-11-21 17:00:55

by Andi Kleen

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu


BTW, another thing I noticed: if you ever add opcode/address
matching you'll need to add the new parameters, at least for the
address, to the input perf_event structure. The opcode could
in theory be encoded in the upper 32 bits like offcore does,
but the address needs to be extra. It's only a small
incremental step, but it may make this more useful.

>> Really for uncore monitoring there is no need to use an NMI handler.
>> You can't profile a core anyways, so you can just delay the reporting
>> a little bit. It may simplify the code to not use one here
>> and just use an ordinary handler.
>
> OK, I can use on ordinary interrupt handler here.

You'll need to allocate a vector, it shouldn't be too difficult.

>>
>> In general since there is already much trouble with overloaded
>> NMI events avoiding new NMIs is a good idea.
>>
>>
>>
>> > +
>> > +static struct node_hw_events *uncore_events[MAX_NUMNODES];
>>
>> Don't declare static arrays with MAX_NUMNODES, that number can be
>> very large and cause unnecessary bloat. Better use per CPU data or
>> similar
>> (e.g. with alloc_percpu)
>
> I really need is a per physical cpu data here, is alloc_percpu enough?

If you use a per cpu array then each CPU can carry a pointer
to its per socket data structure.

This could use a similar scheme to the per-core data I submitted
recently.
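Something like this untested sketch (names made up, assuming the patch's
perf_event_intel_uncore.h for struct node_hw_events): each cpu carries a
pointer to the shared per-package structure, allocated by the first cpu
of the package that gets set up:

#include "perf_event_intel_uncore.h"	/* struct node_hw_events */
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

static DEFINE_PER_CPU(struct node_hw_events *, uncore_cpu_events);

static int uncore_cpu_prepare(int cpu)
{
	struct node_hw_events *events;
	int i;

	/* Reuse the structure of another cpu in the same package, if any */
	for_each_online_cpu(i) {
		if (topology_physical_package_id(i) !=
		    topology_physical_package_id(cpu))
			continue;
		events = per_cpu(uncore_cpu_events, i);
		if (events) {
			per_cpu(uncore_cpu_events, cpu) = events;
			return 0;
		}
	}

	/* First cpu of this package: allocate on the local node */
	events = kzalloc_node(sizeof(*events), GFP_KERNEL, cpu_to_node(cpu));
	if (!events)
		return -ENOMEM;

	spin_lock_init(&events->lock);
	per_cpu(uncore_cpu_events, cpu) = events;
	return 0;
}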

>
> Any idea to set this cross-core data?

s/local/atomic/

But if it's just stores/loads without read-modify-write you
can just use normal stores.


>
>>
>> > +static int uncore_pmu_add(struct perf_event *event, int flags)
>> > +{
>> > + int node = numa_node_id();
>>
>> this should be still package id
>
> Understand, this is in my TODO.

With the per cpu pointer scheme you likely don't even need it;
just check the topology at setup time (similar to my patch,
just using the package).


-Andi

2010-11-21 17:44:26

by Peter Zijlstra

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> > >
> > > 2. Uncore pmu NMI handling
> > >
> > > All the 4 cores are programmed to receive uncore counter overflow
> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> > > counters enabled by all 4 cores.
> >
> > Really for uncore monitoring there is no need to use an NMI handler.
> > You can't profile a core anyways, so you can just delay the reporting
> > a little bit. It may simplify the code to not use one here
> > and just use an ordinary handler.
>
> OK, I can use on ordinary interrupt handler here.

Does the hardware actually allow using a different interrupt source?

> >
> > In general since there is already much trouble with overloaded
> > NMI events avoiding new NMIs is a good idea.
> >
> >
> >
> > > +
> > > +static struct node_hw_events *uncore_events[MAX_NUMNODES];
> >
> > Don't declare static arrays with MAX_NUMNODES, that number can be
> > very large and cause unnecessary bloat. Better use per CPU data or similar
> > (e.g. with alloc_percpu)
>
> I really need is a per physical cpu data here, is alloc_percpu enough?

Nah, simply manually allocate bits using kmalloc_node(), that's
something I still need to fix in Andi's patches as well.

> > > + /*
> > > + * The hw event starts counting from this event offset,
> > > + * mark it to be able to extra future deltas:
> > > + */
> > > + local64_set(&hwc->prev_count, (u64)-left);
> >
> > Your use of local* seems dubious. That is only valid if it's really
> > all on the same CPU. Is that really true?
>
> Good catch! That is not true.
>
> The interrupt handler is running on one core and the
> data(hwc->prev_count) maybe on another core.
>
> Any idea to set this cross-core data?

IIRC you can steer the uncore interrupts (it has a mask somewhere);
simply steer everything to the first cpu in the nodemask?
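For illustration, an untested sketch of that steering, reusing the bit
definitions from the patch (the helper name is made up): set only the
PMI-select bit of the core that owns the handler instead of 0xF, so
every overflow interrupt lands on one known cpu:

#include "perf_event_intel_uncore.h"
#include <linux/topology.h>
#include <asm/msr.h>

/* Like uncore_pmu_enable_all(), but route the PMI to a single core */
static void uncore_pmu_enable_all_on(int handling_cpu)
{
	u64 ctrl;

	ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) |
	       MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0 |
	       MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ;

	/* bits 48-51 select which core(s) of the package receive the PMI */
	ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_CORE0 <<
		topology_core_id(handling_cpu);

	wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
}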


2010-11-23 10:00:51

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Sun, Nov 21, 2010 at 6:44 PM, Peter Zijlstra <[email protected]> wrote:
> On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
>> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
>> > >
>> > > 2. Uncore pmu NMI handling
>> > >
>> > > All the 4 cores are programmed to receive uncore counter overflow
>> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
>> > > counters enabled by all 4 cores.
>> >
>> > Really for uncore monitoring there is no need to use an NMI handler.
>> > You can't profile a core anyways, so you can just delay the reporting
>> > a little bit. It may simplify the code to not use one here
>> > and just use an ordinary handler.
>>
>> OK, I can use on ordinary interrupt handler here.
>
> Does the hardware actually allow using a different interrupt source?
>
It does not. It's using whatever you've programmed into the APIC
LVT vector, AFAIK. Uncore interrupt mode is enabled via
IA32_DEBUGCTL. Regardless of sampling or not, you need the interrupt
to virtualize the counters to 64 bits.

2010-11-23 10:17:14

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

Lin,

On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
> +static void uncore_pmu_enable_all(void)
> +{
> +       u64 ctrl;
> +
> +       /*
> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
> +        * but we don't know which core will receive the NMI when overflow happens
> +        */

That does not sound right. If you set bits 48-51 to 1, then all 4 cores
will receive EVERY interrupt, i.e., it's a broadcast. That seems to
contradict your comment: 1 of the 4. Unless you meant they all get the
interrupt and one will handle it, while the others find nothing to
process. But I don't see the atomic op that would make this true in
uncore_handle_irq().

I also think that if you want all processors to receive the interrupts,
then the mask should be 0xff when HT is on. The manual is rather obscure
on this, but it does make sense.


> +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | (0xFULL << 48);
> +       ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
> +
> +       /*
> +        * Freeze the uncore pmu on overflow of any uncore counter.
> +        * This makes unocre NMI handling easier.
> +        */
> +       ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ;
> +
> +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
> +}
> +

2010-11-24 01:32:34

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Tue, 2010-11-23 at 18:17 +0800, Stephane Eranian wrote:
> Lin,
>
> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
> > +static void uncore_pmu_enable_all(void)
> > +{
> > + u64 ctrl;
> > +
> > + /*
> > + * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
> > + * but we don't know which core will receive the NMI when overflow happens
> > + */
>
> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
> will receive EVERY
> interrupt, i.e., it's a broadcast. That seems to contradict your
> comment: 1 of the 4. Unless
> you meant, they all get the interrupt and one will handle it, the
> other will find nothing to
> process. But I don't see the atomic op that would make this true in
> uncore_handle_irq().

I thought it was a broadcast too in the v1 patches; let me double check
it.

>
> I also think that if you want all processors to receive the
> interrupts, then the mask should
> be 0xff when HT is on. The manual is rather obscure on this, but it
> does make sense.

The kernel panics if 0xff is set, but it may be a bug in my code.

Anyway, is it documented in some errata that the mask should be 0xff
when HT is on?

Thanks,
Lin Ming

>
>
> > + ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | (0xFULL << 48);
> > + ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
> > +
> > + /*
> > + * Freeze the uncore pmu on overflow of any uncore counter.
> > + * This makes unocre NMI handling easier.
> > + */
> > + ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ;
> > +
> > + wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
> > +}
> > +

2010-11-24 09:54:00

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Mon, 2010-11-22 at 01:44 +0800, Peter Zijlstra wrote:
> On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
> > On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> > > >
> > > > 2. Uncore pmu NMI handling
> > > >
> > > > All the 4 cores are programmed to receive uncore counter overflow
> > > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> > > > counters enabled by all 4 cores.
> > >
> > > Really for uncore monitoring there is no need to use an NMI handler.
> > > You can't profile a core anyways, so you can just delay the reporting
> > > a little bit. It may simplify the code to not use one here
> > > and just use an ordinary handler.
> >
> > OK, I can use on ordinary interrupt handler here.
>
> Does the hardware actually allow using a different interrupt source?
>
> > >
> > > In general since there is already much trouble with overloaded
> > > NMI events avoiding new NMIs is a good idea.
> > >
> > >
> > >
> > > > +
> > > > +static struct node_hw_events *uncore_events[MAX_NUMNODES];
> > >
> > > Don't declare static arrays with MAX_NUMNODES, that number can be
> > > very large and cause unnecessary bloat. Better use per CPU data or similar
> > > (e.g. with alloc_percpu)
> >
> > I really need is a per physical cpu data here, is alloc_percpu enough?
>
> Nah, simply manually allocate bits using kmalloc_node(), that's
> something I still need to fix in Andi's patches as well.

I'm writing this like the AMD NB events allocation.

Thanks,
Lin Ming

>
> > > > + /*
> > > > + * The hw event starts counting from this event offset,
> > > > + * mark it to be able to extra future deltas:
> > > > + */
> > > > + local64_set(&hwc->prev_count, (u64)-left);
> > >
> > > Your use of local* seems dubious. That is only valid if it's really
> > > all on the same CPU. Is that really true?
> >
> > Good catch! That is not true.
> >
> > The interrupt handler is running on one core and the
> > data(hwc->prev_count) maybe on another core.
> >
> > Any idea to set this cross-core data?
>
> IIRC you can steer the uncore interrupts (it has a mask somewhere)
> simply steer everything to the first cpu in the nodemask?
>
>
>

2010-11-25 00:23:17

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Tue, 2010-11-23 at 18:00 +0800, Stephane Eranian wrote:
> On Sun, Nov 21, 2010 at 6:44 PM, Peter Zijlstra <[email protected]> wrote:
> > On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
> >> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> >> > >
> >> > > 2. Uncore pmu NMI handling
> >> > >
> >> > > All the 4 cores are programmed to receive uncore counter overflow
> >> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> >> > > counters enabled by all 4 cores.
> >> >
> >> > Really for uncore monitoring there is no need to use an NMI handler.
> >> > You can't profile a core anyways, so you can just delay the reporting
> >> > a little bit. It may simplify the code to not use one here
> >> > and just use an ordinary handler.
> >>
> >> OK, I can use on ordinary interrupt handler here.
> >
> > Does the hardware actually allow using a different interrupt source?
> >
> It does not. It's using whatever you've programmed into the APIC
> LVT vector, AFAIK. Uncore interrupt mode is enabled via
> IA32_DEBUGCTL. Regarless of sampling or not, you need the interrupt
> to virtualize the counters to 64 bits.

If only counting (perf stat) makes sense for uncore events, do we still
need an interrupt handler?

A 48-bit counter is not that easy to overflow in practice.

2010-11-25 06:08:58

by Peter Zijlstra

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Thu, 2010-11-25 at 08:24 +0800, Lin Ming wrote:
> On Tue, 2010-11-23 at 18:00 +0800, Stephane Eranian wrote:
> > On Sun, Nov 21, 2010 at 6:44 PM, Peter Zijlstra <[email protected]> wrote:
> > > On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
> > >> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> > >> > >
> > >> > > 2. Uncore pmu NMI handling
> > >> > >
> > >> > > All the 4 cores are programmed to receive uncore counter overflow
> > >> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> > >> > > counters enabled by all 4 cores.
> > >> >
> > >> > Really for uncore monitoring there is no need to use an NMI handler.
> > >> > You can't profile a core anyways, so you can just delay the reporting
> > >> > a little bit. It may simplify the code to not use one here
> > >> > and just use an ordinary handler.
> > >>
> > >> OK, I can use on ordinary interrupt handler here.
> > >
> > > Does the hardware actually allow using a different interrupt source?
> > >
> > It does not. It's using whatever you've programmed into the APIC
> > LVT vector, AFAIK. Uncore interrupt mode is enabled via
> > IA32_DEBUGCTL. Regarless of sampling or not, you need the interrupt
> > to virtualize the counters to 64 bits.
>
> If only counting(perf stat) makes sense for uncore events, do we still
> need an interrupt handler?

Yep, I see no reason to dis-allow sampling. Sure it's hard to make sense
of it, but since there are people who offline all but one cpu of a
package, I bet there are people who will run just one task on a package
as well.

Just because it doesn't make sense in general doesn't mean there isn't
anybody who'd want to do it and actually knows wth he's doing.

> 48 bits counter is not that easy to overflow in practice.

Still..

2010-11-25 06:26:30

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Thu, 2010-11-25 at 14:09 +0800, Peter Zijlstra wrote:
> On Thu, 2010-11-25 at 08:24 +0800, Lin Ming wrote:
> > On Tue, 2010-11-23 at 18:00 +0800, Stephane Eranian wrote:
> > > On Sun, Nov 21, 2010 at 6:44 PM, Peter Zijlstra <[email protected]> wrote:
> > > > On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
> > > >> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
> > > >> > >
> > > >> > > 2. Uncore pmu NMI handling
> > > >> > >
> > > >> > > All the 4 cores are programmed to receive uncore counter overflow
> > > >> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
> > > >> > > counters enabled by all 4 cores.
> > > >> >
> > > >> > Really for uncore monitoring there is no need to use an NMI handler.
> > > >> > You can't profile a core anyways, so you can just delay the reporting
> > > >> > a little bit. It may simplify the code to not use one here
> > > >> > and just use an ordinary handler.
> > > >>
> > > >> OK, I can use on ordinary interrupt handler here.
> > > >
> > > > Does the hardware actually allow using a different interrupt source?
> > > >
> > > It does not. It's using whatever you've programmed into the APIC
> > > LVT vector, AFAIK. Uncore interrupt mode is enabled via
> > > IA32_DEBUGCTL. Regarless of sampling or not, you need the interrupt
> > > to virtualize the counters to 64 bits.
> >
> > If only counting(perf stat) makes sense for uncore events, do we still
> > need an interrupt handler?
>
> Yep, I see no reason to dis-allow sampling. Sure its hard to make sense
> of it, but since there are people who offline all but one cpu of a
> package, I bet there are people who will run just one task on a package
> as well.
>
> Just because it doesn't make sense in general doesn't mean there isn't
> anybody who'd want to do it and actually knows wth he's doing.
>
> > 48 bits counter is not that easy to overflow in practice.
>
> Still..

OK, will do more tests, then send out a new version.

Thanks.

2010-11-25 08:48:50

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Thu, Nov 25, 2010 at 7:09 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2010-11-25 at 08:24 +0800, Lin Ming wrote:
>> On Tue, 2010-11-23 at 18:00 +0800, Stephane Eranian wrote:
>> > On Sun, Nov 21, 2010 at 6:44 PM, Peter Zijlstra <[email protected]> wrote:
>> > > On Sun, 2010-11-21 at 22:04 +0800, Lin Ming wrote:
>> > >> On Sun, 2010-11-21 at 20:46 +0800, Andi Kleen wrote:
>> > >> > >
>> > >> > > 2. Uncore pmu NMI handling
>> > >> > >
>> > >> > > All the 4 cores are programmed to receive uncore counter overflow
>> > >> > > interrupt. The NMI handler(running on 1 of the 4 cores) handle all
>> > >> > > counters enabled by all 4 cores.
>> > >> >
>> > >> > Really for uncore monitoring there is no need to use an NMI handler.
>> > >> > You can't profile a core anyways, so you can just delay the reporting
>> > >> > a little bit. It may simplify the code to not use one here
>> > >> > and just use an ordinary handler.
>> > >>
>> > >> OK, I can use on ordinary interrupt handler here.
>> > >
>> > > Does the hardware actually allow using a different interrupt source?
>> > >
>> > It does not. It's using whatever you've programmed into the APIC
>> > LVT vector, AFAIK. Uncore interrupt mode is enabled via
>> > IA32_DEBUGCTL. Regarless of sampling or not, you need the interrupt
>> > to virtualize the counters to 64 bits.
>>
>> If only counting(perf stat) makes sense for uncore events, do we still
>> need an interrupt handler?
>
> Yep, I see no reason to dis-allow sampling. Sure its hard to make sense
> of it, but since there are people who offline all but one cpu of a
> package, I bet there are people who will run just one task on a package
> as well.
>
> Just because it doesn't make sense in general doesn't mean there isn't
> anybody who'd want to do it and actually knows wth he's doing.
>
>> 48 bits counter is not that easy to overflow in practice.
>
> Still..
>
Agreed.

2010-11-25 18:21:03

by Andi Kleen

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

> Yep, I see no reason to dis-allow sampling. Sure its hard to make sense
> of it, but since there are people who offline all but one cpu of a
> package,

Assuming they don't have any active PCI devices either.

> I bet there are people who will run just one task on a package
> as well.

In that case the sampling has a 1/NUM-CPU-THREADS-IN-PACKAGE chance
to report the right task (or actually somewhat less because the measurement
skew for uncore is much higher than for normal events)

Really, for per-core measurements, using the OFFCORE events is much better.

-Andi
--
[email protected] -- Speaking for myself only.

2010-11-25 21:10:09

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Thu, Nov 25, 2010 at 7:20 PM, Andi Kleen <[email protected]> wrote:
>> Yep, I see no reason to dis-allow sampling. Sure its hard to make sense
>> of it, but since there are people who offline all but one cpu of a
>> package,
>
> Assuming they don't have any active PCI devices either.
>
Good point.

>> I bet there are people who will run just one task on a package
>> as well.
>
> In that case the sampling has a 1/NUM-CPU-THREADS-IN-PACKAGE chance
> to report the right task (or actually somewhat less because the measurement
> skew for uncore is much higher than for normal events)
>
> Really for per core measurements using the OFFCORE events is much better.
>
yes, OFFCORE_RESPONSE is much more useful.

2010-11-26 05:15:19

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
> Lin,
>
> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>> +static void uncore_pmu_enable_all(void)
>> +{
>> +       u64 ctrl;
>> +
>> +       /*
>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>> +        * but we don't know which core will receive the NMI when overflow happens
>> +        */
>
> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
> will receive EVERY
> interrupt, i.e., it's a broadcast. That seems to contradict your
> comment: 1 of the 4. Unless
> you meant, they all get the interrupt and one will handle it, the
> other will find nothing to
> process. But I don't see the atomic op that would make this true in
> uncore_handle_irq().

Stephane,

The interrupt model is strange; it behaves differently with HT on/off.

If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.

If HT is on, only 1 of the 4 cores will receive the interrupt (both
threads in that core receive the interrupt), and it can't be determined
which core will receive the interrupt.

Did you ever observe this?

>
> I also think that if you want all processors to receive the
> interrupts, then the mask should
> be 0xff when HT is on. The manual is rather obscure on this, but it
> does make sense.

I tried to set the mask to 0xff when HT is on, but the kernel panics
because the reserved bits are set.

Thanks,
Lin Ming

>
>
>> +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | (0xFULL << 48);
>> +       ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
>> +
>> +       /*
>> +        * Freeze the uncore pmu on overflow of any uncore counter.
>> +        * This makes uncore NMI handling easier.
>> +        */
>> +       ctrl |= MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ;
>> +
>> +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
>> +}
>> +
> --

2010-11-26 08:18:09

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 6:15 AM, Lin Ming <[email protected]> wrote:
> On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
>> Lin,
>>
>> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>>> +static void uncore_pmu_enable_all(void)
>>> +{
>>> +       u64 ctrl;
>>> +
>>> +       /*
>>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>>> +        * but we don't know which core will receive the NMI when overflow happens
>>> +        */
>>
>> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
>> will receive EVERY
>> interrupt, i.e., it's a broadcast. That seems to contradict your
>> comment: 1 of the 4. Unless
>> you meant, they all get the interrupt and one will handle it, the
>> other will find nothing to
>> process. But I don't see the atomic op that would make this true in
>> uncore_handle_irq().
>
> Stephane,
>
> The interrupt model is strange, it behaves differently when HT on/off.
>
> If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.
>
That's if you set the mask to 0xf, right?

In the perf_event model, given that any one of the 4 cores can be used
to program uncore events, you have no choice but to broadcast to all
4 cores. Each has to demultiplex and figure out which of its counters
have overflowed.

> If HT is on, only 1 of the 4 cores will receive the interrupt(both
> Threads in that core receive the interrupt),
> and it can't be determined which core will receive the interrupt.
>
> Did you ever observe this?
>
No because I never set more than one bit in the mask.

> I tried to set the mask 0xff when HT is on, but kernel panics, because
> the reserve bits are set.

Let me check on this. It would seem to imply that in HT mode, both threads
necessarily receive the interrupts.

Was that on Nehalem or Westmere?

2010-11-26 08:29:44

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 4:18 PM, Stephane Eranian <[email protected]> wrote:
> On Fri, Nov 26, 2010 at 6:15 AM, Lin Ming <[email protected]> wrote:
>> On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
>>> Lin,
>>>
>>> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>>>> +static void uncore_pmu_enable_all(void)
>>>> +{
>>>> +       u64 ctrl;
>>>> +
>>>> +       /*
>>>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>>>> +        * but we don't know which core will receive the NMI when overflow happens
>>>> +        */
>>>
>>> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
>>> will receive EVERY
>>> interrupt, i.e., it's a broadcast. That seems to contradict your
>>> comment: 1 of the 4. Unless
>>> you meant, they all get the interrupt and one will handle it, the
>>> other will find nothing to
>>> process. But I don't see the atomic op that would make this true in
>>> uncore_handle_irq().
>>
>> Stephane,
>>
>> The interrupt model is strange, it behaves differently when HT on/off.
>>
>> If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.
>>
> That's if yo set the mask to 0xf, right?

Right.

>
> In the perf_event model, given that any one of the 4 cores can be used
> to program uncore events, you have no choice but to broadcast to all
> 4 cores. Each has to demultiplex and figure out which of its counters
> have overflowed.

This is what my upcoming v3 patches are doing.

>
>> If HT is on, only 1 of the 4 cores will receive the interrupt(both
>> Threads in that core receive the interrupt),
>> and it can't be determined which core will receive the interrupt.
>>
>> Did you ever observe this?
>>
> No because I never set more than one bit in the mask.
>
>> I tried to set the mask 0xff when HT is on, but kernel panics, because
>> the reserve bits are set.
>
> Let me check on this. It would seem to imply that in HT mode, both threads
> necessarily receive the interrupts.
>
> Was that on Nehalem or Westmere?

Nehalem.

Thanks,
Lin Ming

2010-11-26 08:33:46

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

Lin,

I looked at the perfmon code, and it seems the mask is of actual
cores, not threads:
rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);

That seems to imply both threads will get the interrupt.

If the overflowed event was programmed from one of the two threads, that
means one will process the overflow and the other will get a spurious
interrupt.

On the cores where no uncore event was programmed, both threads will get
a spurious interrupt.

That brings us back to the 'spurious interrupt' issue and the 'NMI
Dazed' message that Don tried to eliminate. Now we have a new situation
where we will get an interrupt with no work to do, so perf_event will
pass the interrupt on to the next subsystem and eventually we will get
the 'dazed' message. I am just guessing here....


On Fri, Nov 26, 2010 at 9:18 AM, Stephane Eranian <[email protected]> wrote:
> On Fri, Nov 26, 2010 at 6:15 AM, Lin Ming <[email protected]> wrote:
>> On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
>>> Lin,
>>>
>>> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>>>> +static void uncore_pmu_enable_all(void)
>>>> +{
>>>> +       u64 ctrl;
>>>> +
>>>> +       /*
>>>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>>>> +        * but we don't know which core will receive the NMI when overflow happens
>>>> +        */
>>>
>>> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
>>> will receive EVERY
>>> interrupt, i.e., it's a broadcast. That seems to contradict your
>>> comment: 1 of the 4. Unless
>>> you meant, they all get the interrupt and one will handle it, the
>>> other will find nothing to
>>> process. But I don't see the atomic op that would make this true in
>>> uncore_handle_irq().
>>
>> Stephane,
>>
>> The interrupt model is strange, it behaves differently when HT on/off.
>>
>> If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.
>>
> That's if yo set the mask to 0xf, right?
>
> In the perf_event model, given that any one of the 4 cores can be used
> to program uncore events, you have no choice but to broadcast to all
> 4 cores. Each has to demultiplex and figure out which of its counters
> have overflowed.
>
>> If HT is on, only 1 of the 4 cores will receive the interrupt(both
>> Threads in that core receive the interrupt),
>> and it can't be determined which core will receive the interrupt.
>>
>> Did you ever observe this?
>>
> No because I never set more than one bit in the mask.
>
>> I tried to set the mask 0xff when HT is on, but kernel panics, because
>> the reserve bits are set.
>
> Let me check on this. It would seem to imply that in HT mode, both threads
> necessarily receive the interrupts.
>
> Was that on Nehalem or Westmere?
>

2010-11-26 09:00:13

by Lin Ming

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 4:33 PM, Stephane Eranian <[email protected]> wrote:
> Lin,
>
> Looked at the perfmon code, and it seems the mask is actual
> cores, not threads:
>                rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
>                val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
>                wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
>
> That seems to imply both threads will get the interrupt.
>
> In the the overflowed event was programmed from on of the two threads, that
> means one will process the overflow, the other will get spurious.
>
> On the cores where no uncore was programmed, then both threads will have
> a spurious interrupt.

But in my test, if HT is on, only the 2 threads in one of the four cores
will receive the interrupt. Even worse, we don't know which core will
receive the interrupt when overflow happens.

I'll do more tests to verify this.

>
> That brings up back to the 'spurious interrupt' issue and the 'NMI
> Dazed' message
> that Don tried to eliminate. Now we have a new situation where we will
> get interrupt
> with no work to do, so the perf_event will pass the interrupt onto the
> next subsystem
> and eventually we will get the 'dazed' message. I am just guessing here....

Add Don.

Thanks,
Lin Ming

>
>
> On Fri, Nov 26, 2010 at 9:18 AM, Stephane Eranian <[email protected]> wrote:
>> On Fri, Nov 26, 2010 at 6:15 AM, Lin Ming <[email protected]> wrote:
>>> On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
>>>> Lin,
>>>>
>>>> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>>>>> +static void uncore_pmu_enable_all(void)
>>>>> +{
>>>>> +       u64 ctrl;
>>>>> +
>>>>> +       /*
>>>>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>>>>> +        * but we don't know which core will receive the NMI when overflow happens
>>>>> +        */
>>>>
>>>> That does not sound right. If you set bit 48-51 to 1, then all 4 cores
>>>> will receive EVERY
>>>> interrupt, i.e., it's a broadcast. That seems to contradict your
>>>> comment: 1 of the 4. Unless
>>>> you meant, they all get the interrupt and one will handle it, the
>>>> other will find nothing to
>>>> process. But I don't see the atomic op that would make this true in
>>>> uncore_handle_irq().
>>>
>>> Stephane,
>>>
>>> The interrupt model is strange, it behaves differently when HT on/off.
>>>
>>> If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.
>>>
>> That's if yo set the mask to 0xf, right?
>>
>> In the perf_event model, given that any one of the 4 cores can be used
>> to program uncore events, you have no choice but to broadcast to all
>> 4 cores. Each has to demultiplex and figure out which of its counters
>> have overflowed.
>>
>>> If HT is on, only 1 of the 4 cores will receive the interrupt(both
>>> Threads in that core receive the interrupt),
>>> and it can't be determined which core will receive the interrupt.
>>>
>>> Did you ever observe this?
>>>
>> No because I never set more than one bit in the mask.
>>
>>> I tried to set the mask 0xff when HT is on, but kernel panics, because
>>> the reserve bits are set.
>>
>> Let me check on this. It would seem to imply that in HT mode, both threads
>> necessarily receive the interrupts.
>>
>> Was that on Nehalem or Westmere?
>>
>

2010-11-26 10:06:48

by Stephane Eranian

Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 10:00 AM, Lin Ming <[email protected]> wrote:
> On Fri, Nov 26, 2010 at 4:33 PM, Stephane Eranian <[email protected]> wrote:
>> Lin,
>>
>> Looked at the perfmon code, and it seems the mask is actual
>> cores, not threads:
>>                rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
>>                val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
>>                wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
>>
>> That seems to imply both threads will get the interrupt.
>>
>> In the the overflowed event was programmed from on of the two threads, that
>> means one will process the overflow, the other will get spurious.
>>
>> On the cores where no uncore was programmed, then both threads will have
>> a spurious interrupt.
>
> But in my test, if HT is on, only the 2 theads in one of the four cores
> will receive the interrupt. Even worse, we don't know which core will
> receive the interrupt
> when overflow happens.
>
MSR_NHM_UNC_GLOBAL_CTRL is per socket, not per core.

> I'll do more tests to verify this.

In your tests, are you programming the same uncore event
across all CPUs? If so, then you may have a race condition
setting the MSR, because it is a read-modify-write.

What if you program only one uncore event from one CPU?

>
>>
>> That brings up back to the 'spurious interrupt' issue and the 'NMI
>> Dazed' message
>> that Don tried to eliminate. Now we have a new situation where we will
>> get interrupt
>> with no work to do, so the perf_event will pass the interrupt onto the
>> next subsystem
>> and eventually we will get the 'dazed' message. I am just guessing here....
>
> Add Don.
>
> Thanks,
> Lin Ming
>
>>
>>
>> On Fri, Nov 26, 2010 at 9:18 AM, Stephane Eranian <[email protected]> wrote:
>>> On Fri, Nov 26, 2010 at 6:15 AM, Lin Ming <[email protected]> wrote:
>>>> On Tue, Nov 23, 2010 at 6:17 PM, Stephane Eranian <[email protected]> wrote:
>>>>> Lin,
>>>>>
>>>>> On Sun, Nov 21, 2010 at 1:01 PM, Lin Ming <[email protected]> wrote:
>>>>>> +static void uncore_pmu_enable_all(void)
>>>>>> +{
>>>>>> +       u64 ctrl;
>>>>>> +
>>>>>> +       /*
>>>>>> +        * (0xFULL << 48): 1 of the 4 cores can receive NMI each time
>>>>>> +        * but we don't know which core will receive the NMI when overflow happens
>>>>>> +        */
>>>>>
>>>>> That does not sound right. If you set bits 48-51 to 1, then all 4 cores
>>>>> will receive EVERY interrupt, i.e., it's a broadcast. That seems to
>>>>> contradict your comment: 1 of the 4. Unless you meant they all get the
>>>>> interrupt and one will handle it while the others find nothing to
>>>>> process. But I don't see the atomic op that would make this true in
>>>>> uncore_handle_irq().
>>>>
>>>> Stephane,
>>>>
>>>> The interrupt model is strange; it behaves differently with HT on vs. off.
>>>>
>>>> If HT is off, all 4 cores will receive every interrupt, i.e., it's a broadcast.
>>>>
>>> That's if you set the mask to 0xf, right?
>>>
>>> In the perf_event model, given that any one of the 4 cores can be used
>>> to program uncore events, you have no choice but to broadcast to all
>>> 4 cores. Each has to demultiplex and figure out which of its counters
>>> have overflowed.
>>>
>>>> If HT is on, only 1 of the 4 cores will receive the interrupt (both
>>>> threads in that core receive it), and it can't be determined which core
>>>> will receive the interrupt.
>>>>
>>>> Did you ever observe this?
>>>>
>>> No because I never set more than one bit in the mask.
>>>
>>>> I tried to set the mask to 0xff when HT is on, but the kernel panics,
>>>> because the reserved bits are set.
>>>
>>> Let me check on this. It would seem to imply that in HT mode, both threads
>>> necessarily receive the interrupts.
>>>
>>> Was that on Nehalem or Westmere?
>>>
>>
>

2010-11-26 11:24:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:

> In the perf_event model, given that any one of the 4 cores can be used
> to program uncore events, you have no choice but to broadcast to all
> 4 cores. Each has to demultiplex and figure out which of its counters
> have overflowed.

Not really, you can redirect all these events to the first online cpu of
the node.

You can re-write event->cpu in pmu::event_init(), and register cpu
hotplug notifiers to migrate the state around.
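
(A minimal sketch of the redirection described above, with hypothetical
function names; a complete version would also register the cpu hotplug
notifier mentioned here so events migrate when the designated cpu goes
offline.)

    #include <linux/cpumask.h>
    #include <linux/errno.h>
    #include <linux/perf_event.h>
    #include <linux/topology.h>

    /* hypothetical: the cpu that handles all uncore events of a node */
    static int uncore_event_cpu(int cpu)
    {
            return cpumask_first(cpumask_of_node(cpu_to_node(cpu)));
    }

    static int uncore_pmu_event_init(struct perf_event *event)
    {
            /* uncore events only make sense when bound to a cpu */
            if (event->cpu < 0)
                    return -EINVAL;

            /* steer the event to the node's designated cpu */
            event->cpu = uncore_event_cpu(event->cpu);

            return 0;
    }
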

2010-11-26 11:25:56

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 12:24 PM, Peter Zijlstra <[email protected]> wrote:
> On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:
>
>> In the perf_event model, given that any one of the 4 cores can be used
>> to program uncore events, you have no choice but to broadcast to all
>> 4 cores. Each has to demultiplex and figure out which of its counters
>> have overflowed.
>
> Not really, you can redirect all these events to the first online cpu of
> the node.
>
> You can re-write event->cpu in pmu::event_init(), and register cpu
> hotplug notifiers to migrate the state around.
>
I am sure you could. But then the user thinks the event is controlled
from CPUx when it's actually from CPUz. I am sure it can work but
that's confusing, especially interrupt-wise.

2010-11-26 11:36:51

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, 2010-11-26 at 12:25 +0100, Stephane Eranian wrote:
> On Fri, Nov 26, 2010 at 12:24 PM, Peter Zijlstra <[email protected]> wrote:
> > On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:
> >
> >> In the perf_event model, given that any one of the 4 cores can be used
> >> to program uncore events, you have no choice but to broadcast to all
> >> 4 cores. Each has to demultiplex and figure out which of its counters
> >> have overflowed.
> >
> > Not really, you can redirect all these events to the first online cpu of
> > the node.
> >
> > You can re-write event->cpu in pmu::event_init(), and register cpu
> > hotplug notifiers to migrate the state around.
> >
> I am sure you could. But then the user thinks the event is controlled
> from CPUx when it's actually from CPUz. I am sure it can work but
> that's confusing, especially interrupt-wise.

Well, it's either that or keeping a node-wide state like we do for AMD
and serializing everything from there.

And I'm not sure what's most expensive, steering the interrupt to one
core only, or broadcasting every interrupt, I'd favour the first
approach.

The whole thing is a node-wide resource, so the user needs to think in
nodes anyway, we already do a cpu->node mapping for identifying the
thing.
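
(A sketch of that cpu->node mapping as it might look in the uncore code; the
type and array names are hypothetical.)

    #include <linux/numa.h>
    #include <linux/topology.h>

    struct uncore_node_state;    /* hypothetical per-node bookkeeping */

    static struct uncore_node_state *uncore_state[MAX_NUMNODES];

    /* all cpus of a node share one uncore state */
    static struct uncore_node_state *uncore_state_of_cpu(int cpu)
    {
            return uncore_state[cpu_to_node(cpu)];
    }
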

2010-11-26 11:41:37

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 12:36 PM, Peter Zijlstra <[email protected]> wrote:
> On Fri, 2010-11-26 at 12:25 +0100, Stephane Eranian wrote:
>> On Fri, Nov 26, 2010 at 12:24 PM, Peter Zijlstra <[email protected]> wrote:
>> > On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:
>> >
>> >> In the perf_event model, given that any one of the 4 cores can be used
>> >> to program uncore events, you have no choice but to broadcast to all
>> >> 4 cores. Each has to demultiplex and figure out which of its counters
>> >> have overflowed.
>> >
>> > Not really, you can redirect all these events to the first online cpu of
>> > the node.
>> >
>> > You can re-write event->cpu in pmu::event_init(), and register cpu
>> > hotplug notifiers to migrate the state around.
>> >
>> I am sure you could. But then the user thinks the event is controlled
>> from CPUx when it's actually from CPUz. I am sure it can work but
>> that's confusing, especially interrupt-wise.
>
> Well, it's either that or keeping a node-wide state like we do for AMD
> and serializing everything from there.
>
> And I'm not sure what's most expensive, steering the interrupt to one
> core only, or broadcasting every interrupt, I'd favour the first
> approach.

I think the one-core-only approach will limit the spurious interrupt problem.
In perfmon, that's how I had it set up. The first CPU where the uncore is
accessed owns the uncore PMU for the socket, thus all interrupts are
routed there. What you are proposing is the same. Now you can choose to
hardcode which core handles this by default, or (better) use the first
core that accesses the uncore.
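
(A rough sketch of the "first core to access the uncore owns it" idea; the
names are hypothetical, and a per-socket version would need one owner
variable per socket rather than the single global shown here.)

    #include <asm/atomic.h>

    static atomic_t uncore_owner_cpu = ATOMIC_INIT(-1);

    /*
     * The first cpu that programs an uncore event claims ownership; later
     * cpus on the same socket see the existing owner and route their
     * events (and the PMI) to it.
     */
    static int uncore_claim_owner(int cpu)
    {
            int old = atomic_cmpxchg(&uncore_owner_cpu, -1, cpu);

            return (old == -1) ? cpu : old;
    }
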

>
> The whole thing is a node-wide resource, so the user needs to think in
> nodes anyway, we already do a cpu->node mapping for identifying the
> thing.
>
Agreed.

2010-11-26 16:25:43

by Lin Ming

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, Nov 26, 2010 at 7:41 PM, Stephane Eranian <[email protected]> wrote:
> On Fri, Nov 26, 2010 at 12:36 PM, Peter Zijlstra <[email protected]> wrote:
>> On Fri, 2010-11-26 at 12:25 +0100, Stephane Eranian wrote:
>>> On Fri, Nov 26, 2010 at 12:24 PM, Peter Zijlstra <[email protected]> wrote:
>>> > On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:
>>> >
>>> >> In the perf_event model, given that any one of the 4 cores can be used
>>> >> to program uncore events, you have no choice but to broadcast to all
>>> >> 4 cores. Each has to demultiplex and figure out which of its counters
>>> >> have overflowed.
>>> >
>>> > Not really, you can redirect all these events to the first online cpu of
>>> > the node.
>>> >
>>> > You can re-write event->cpu in pmu::event_init(), and register cpu
>>> > hotplug notifiers to migrate the state around.
>>> >
>>> I am sure you could. But then the user thinks the event is controlled
>>> from CPUx when it's actually from CPUz. I am sure it can work but
>>> that's confusing, especially interrupt-wise.
>>
>> Well, it's either that or keeping a node-wide state like we do for AMD
>> and serializing everything from there.
>>
>> And I'm not sure what's most expensive, steering the interrupt to one
>> core only, or broadcasting every interrupt, I'd favour the first
>> approach.
>
> I think the one-core-only approach will limit the spurious interrupt problem.
> In perfmon, that's how I had it set up. The first CPU where the uncore is
> accessed owns the uncore PMU for the socket, thus all interrupts are
> routed there. What you are proposing is the same. Now you can choose to
> hardcode which core handles this by default, or (better) use the first
> core that accesses the uncore.
>
>>
>> The whole thing is a node-wide resource, so the user needs to think in
>> nodes anyway, we already do a cpu->node mapping for identifying the
>> thing.
>>
> Agreed.
>

Hi, all

Thanks for all the comments.
I'm traveling Nov 27 to Nov 30.

I'll address the comments when I'm back.

Thanks,
Lin Ming

2010-12-01 03:19:27

by Lin Ming

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, 2010-11-26 at 18:06 +0800, Stephane Eranian wrote:
> On Fri, Nov 26, 2010 at 10:00 AM, Lin Ming <[email protected]> wrote:
> > On Fri, Nov 26, 2010 at 4:33 PM, Stephane Eranian <[email protected]> wrote:
> >> Lin,
> >>
> >> Looked at the perfmon code, and it seems the mask is actual
> >> cores, not threads:
> >> rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> >> val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
> >> wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> >>
> >> That seems to imply both threads will get the interrupt.
> >>
> >> If the overflowed event was programmed from one of the two threads, that
> >> means one will process the overflow and the other will get a spurious
> >> interrupt.
> >>
> >> On the cores where no uncore event was programmed, both threads will get
> >> a spurious interrupt.
> >
> > But in my test, if HT is on, only the 2 threads in one of the four cores
> > will receive the interrupt. Even worse, we don't know which core will
> > receive the interrupt when overflow happens.
> >
> The MSR_NHM_UNC_GLOBAL_CTRL is per socket not per core.

Understood.

>
> > I'll do more tests to verify this.
>
> In your tests, are you programming the same uncore event
> across all CPUs? If so then you may have a race condition
> setting the MSR, because it is a read-modify-write.
>
> What if you program only one uncore event from one CPU?

This is what I tested, programming only one uncore event from one CPU.
When HT is off, all four cores in the socket receive the interrupt.
When HT is on, only the 2 threads in one of the four cores receive the
interrupt.

2010-12-01 03:26:18

by Lin Ming

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Fri, 2010-11-26 at 19:36 +0800, Peter Zijlstra wrote:
> On Fri, 2010-11-26 at 12:25 +0100, Stephane Eranian wrote:
> > On Fri, Nov 26, 2010 at 12:24 PM, Peter Zijlstra <[email protected]> wrote:
> > > On Fri, 2010-11-26 at 09:18 +0100, Stephane Eranian wrote:
> > >
> > >> In the perf_event model, given that any one of the 4 cores can be used
> > >> to program uncore events, you have no choice but to broadcast to all
> > >> 4 cores. Each has to demultiplex and figure out which of its counters
> > >> have overflowed.
> > >
> > > Not really, you can redirect all these events to the first online cpu of
> > > the node.
> > >
> > > You can re-write event->cpu in pmu::event_init(), and register cpu
> > > hotplug notifiers to migrate the state around.
> > >
> > I am sure you could. But then the user thinks the event is controlled
> > from CPUx when it's actually from CPUz. I am sure it can work but
> > that's confusing, especially interrupt-wise.
>
> Well, it's either that or keeping a node-wide state like we do for AMD
> and serializing everything from there.
>
> And I'm not sure what's most expensive, steering the interrupt to one
> core only, or broadcasting every interrupt, I'd favour the first
> approach.
>
> The whole thing is a node-wide resource, so the user needs to think in
> nodes anyway, we already do a cpu->node mapping for identifying the
> thing.

How about a new sub-command for node-wide event statistics?

perf node -n <node> -e <event>?

2010-12-01 11:37:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Wed, 2010-12-01 at 11:28 +0800, Lin Ming wrote:

> How about a new sub-command for node-wide event statistics?
>
> perf node -n <node> -e <event>?

Maybe as a very slim wrapper around perf stat, but personally I wouldn't
care.

2010-12-01 13:04:49

by Stephane Eranian

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Wed, Dec 1, 2010 at 4:21 AM, Lin Ming <[email protected]> wrote:
>
> On Fri, 2010-11-26 at 18:06 +0800, Stephane Eranian wrote:
> > On Fri, Nov 26, 2010 at 10:00 AM, Lin Ming <[email protected]> wrote:
> > > On Fri, Nov 26, 2010 at 4:33 PM, Stephane Eranian <[email protected]> wrote:
> > >> Lin,
> > >>
> > >> Looked at the perfmon code, and it seems the mask is actual
> > >> cores, not threads:
> > >>                rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> > >>                val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
> > >>                wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> > >>
> > >> That seems to imply both threads will get the interrupt.
> > >>
> > >> If the overflowed event was programmed from one of the two threads, that
> > >> means one will process the overflow and the other will get a spurious
> > >> interrupt.
> > >>
> > >> On the cores where no uncore event was programmed, both threads will get
> > >> a spurious interrupt.
> > >
> > > But in my test, if HT is on, only the 2 threads in one of the four cores
> > > will receive the interrupt. Even worse, we don't know which core will
> > > receive the interrupt when overflow happens.
> > >
> > The MSR_NHM_UNC_GLOBAL_CTRL is per socket not per core.
>
> Understood.
>
> >
> > > I'll do more tests to verify this.
> >
> > In your tests, are you programming the same uncore event
> > across all CPUs? If so then you may have a race condition
> > setting the MSR, because it is a read-modify-write.
> >
> > What if you program only one uncore event from one CPU?
>
> This is what I tested, programming only one uncore event from one CPU.

> When HT is off, all four cores in the socket receive the interrupt.

If the value of the MSR is 0xf << 48?

> When HT is on, only the 2 threads in one of the four cores receive the
> interrupt.
Something is not right here. Next week, I may be able to run some tests
on a Nehalem using perfmon to compare. Could you also send me your
latest uncore patch against tip-x86?
Thanks.

2010-12-01 14:08:22

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

> How about a new sub-command for node-wide event statistics?
>
> perf node -n <node> -e <event>?

Seems like the best option to me (and not allowing the uncore events
for the other commands)

But I don't like "node" because in a non-NUMA kernel you won't have nodes,
but this is still useful. Or with NUMA emulation you may have nodes
that don't match the sockets.

Maybe perf package or perf socket ?

-Andi
--
[email protected] -- Speaking for myself only.
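
(To make the node vs. socket distinction concrete, a small sketch using the
existing topology helpers; the wrapper name is hypothetical.)

    #include <linux/topology.h>

    /*
     * The uncore is shared by all cores in a physical package. With NUMA
     * emulation, or on a non-NUMA kernel, cpu_to_node() need not match the
     * package, so the physical package id is the more reliable key.
     */
    static int uncore_pkg_id(int cpu)
    {
            return topology_physical_package_id(cpu);
    }
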

2010-12-01 14:18:44

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Wed, 2010-12-01 at 15:08 +0100, Andi Kleen wrote:
> Seems like the best option to me (and not allowing the uncore events
> for the other commands)
>
Andi, I don't care what you think, that's simply not going to happen.

2010-12-02 05:24:03

by Lin Ming

[permalink] [raw]
Subject: Re: [RFC PATCH 2/3 v2] perf: Implement Nehalem uncore pmu

On Wed, 2010-12-01 at 21:04 +0800, Stephane Eranian wrote:
> On Wed, Dec 1, 2010 at 4:21 AM, Lin Ming <[email protected]> wrote:
> >
> > On Fri, 2010-11-26 at 18:06 +0800, Stephane Eranian wrote:
> > > On Fri, Nov 26, 2010 at 10:00 AM, Lin Ming <[email protected]> wrote:
> > > > On Fri, Nov 26, 2010 at 4:33 PM, Stephane Eranian <[email protected]> wrote:
> > > >> Lin,
> > > >>
> > > >> Looked at the perfmon code, and it seems the mask is actual
> > > >> cores, not threads:
> > > >> rdmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> > > >> val |= 1ULL << (48 + cpu_data(smp_processor_id()).cpu_core_id);
> > > >> wrmsrl(MSR_NHM_UNC_GLOBAL_CTRL, val);
> > > >>
> > > >> That seems to imply both threads will get the interrupt.
> > > >>
> > > >> If the overflowed event was programmed from one of the two threads, that
> > > >> means one will process the overflow and the other will get a spurious
> > > >> interrupt.
> > > >>
> > > >> On the cores where no uncore event was programmed, both threads will get
> > > >> a spurious interrupt.
> > > >
> > > > But in my test, if HT is on, only the 2 threads in one of the four cores
> > > > will receive the interrupt. Even worse, we don't know which core will
> > > > receive the interrupt when overflow happens.
> > > >
> > > The MSR_NHM_UNC_GLOBAL_CTRL is per socket not per core.
> >
> > Understood.
> >
> > >
> > > > I'll do more tests to verify this.
> > >
> > > In your tests, are you programming the same uncore event
> > > across all CPUs? If so then you may have a race condition
> > > setting the MSR, because it is a read-modify-write.
> > >
> > > What if you program only one uncore event from one CPU?
> >
> > This is what I tested, programming only one uncore event from one CPU.
>
> > When HT is off, all four cores in the socket receive the interrupt.
>
> If the value of the MSR is 0xf << 48?

Yes, the EN_PMI_CORE* bits are set to 0xf.

>
> > When HT is on, only the 2 threads in one of the four cores receive the
> > interrupt.
> Something is not right here. Next week, I may be able to run some tests
> on a Nehalem using perfmon to compare. Could you also send me your
> latest uncore patch against tip-x86?
> Thanks.

I just sent it out.
http://lkml.org/lkml/2010/12/2/4

Thanks,
Lin Ming