2008-11-26 08:49:38

by Stephane Eranian

Subject: [patch 21/24] perfmon: Intel architectural PMU support (x86)

This patch adds Intel architectural PMU support (version 1, 2, and 3).

Signed-off-by: Stephane Eranian <[email protected]>
--

Index: o3/arch/x86/perfmon/Makefile
===================================================================
--- o3.orig/arch/x86/perfmon/Makefile 2008-11-25 18:09:47.000000000 +0100
+++ o3/arch/x86/perfmon/Makefile 2008-11-25 18:21:33.000000000 +0100
@@ -3,3 +3,4 @@
# Contributed by Stephane Eranian <[email protected]>
#
obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
Index: o3/arch/x86/perfmon/perfmon_intel_arch.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o3/arch/x86/perfmon/perfmon_intel_arch.c 2008-11-25 18:22:00.000000000 +0100
@@ -0,0 +1,628 @@
+/*
+ * This file contains the Intel architectural perfmon v1, v2, v3
+ * description tables.
+ *
+ * Architectural perfmon was introduced with Intel Core Solo/Duo
+ * processors.
+ *
+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/perfmon_kern.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+static int pfm_intel_arch_version;
+
+DEFINE_PER_CPU(u64, saved_global_ctrl);
+
+/*
+ * - upper 32 bits are reserved
+ * - INT: APIC enable bit is reserved (forced to 1)
+ * - bit 21 is reserved
+ *
+ * RSVD: reserved bits are 1
+ */
+#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \
+ | (1ULL<<20) \
+ | (1ULL<<21))
+
+/*
+ * force Local APIC interrupt on overflow
+ * disable with NO_EMUL64
+ */
+#define PFM_IA_PMC_VAL (1ULL<<20)
+#define PFM_IA_NO64 (1ULL<<20)
+
+/*
+ * the architecture specifies that:
+ * IA32_PMCx MSR : starts at 0x0c1 & occupies a contiguous block of MSRs
+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupies a contiguous block of MSRs
+ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupies a contiguous block of MSRs
+ */
+#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0
+#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0
+#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0
+
+/*
+ * layout of EAX for CPUID.0xa leaf function
+ */
+struct pmu_eax {
+ unsigned int version:8; /* architectural perfmon version */
+ unsigned int num_cnt:8; /* number of generic counters */
+ unsigned int cnt_width:8; /* width of generic counters */
+ unsigned int ebx_length:8; /* number of architected events */
+};
+
+/*
+ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
+ */
+struct pmu_edx {
+ unsigned int num_cnt:5; /* number of fixed counters */
+ unsigned int cnt_width:8; /* width of fixed counters */
+ unsigned int reserved:19;
+};
+
+static void pfm_intel_arch_acquire_pmu_percpu(void);
+static void pfm_intel_arch_release_pmu_percpu(void);
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
+static void __kprobes pfm_intel_arch_quiesce(void);
+
+/*
+ * physical addresses of MSR controlling the perfevtsel and counter registers
+ */
+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
+ .stop_save = pfm_intel_arch_stop_save,
+ .has_ovfls = pfm_intel_arch_has_ovfls,
+ .quiesce = pfm_intel_arch_quiesce,
+ .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
+ .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
+};
+
+#define PFM_IA_C(n) { \
+ .type = PFM_REG_I64, \
+ .desc = "PERFEVTSEL"#n, \
+ .dfl_val = PFM_IA_PMC_VAL, \
+ .rsvd_msk = PFM_IA_PMC_RSVD, \
+ .no_emul64_msk = PFM_IA_NO64, \
+ .hw_addr = MSR_GEN_SEL_BASE+(n) \
+ }
+
+#define PFM_IA_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PMC"#n, \
+ .hw_addr = MSR_P6_PERFCTR0+n, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+#define PFM_IA_FD(n) \
+ { .type = PFM_REG_C, \
+ .desc = "FIXED_CTR"#n, \
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
+ .dep_pmcs[0] = 1ULL << 16 \
+ }
+
+
+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
+/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3),
+/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7),
+/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11),
+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
+
+/* pmc16 */ { .type = PFM_REG_I,
+ .desc = "FIXED_CTRL",
+ .dfl_val = 0x8888888888888888ULL, /* force PMI */
+ .rsvd_msk = 0, /* set dynamically */
+ .no_emul64_msk = 0,
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
+ },
+};
+#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc)
+
+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
+/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3),
+/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7),
+/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
+
+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
+/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15)
+};
+#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc)
+
+#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */
+#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */
+#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */
+
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
+
+static void pfm_intel_arch_check_errata(void)
+{
+ /*
+ * Core Duo errata AE49 (no fix). Both counters share a single
+ * enable bit in PERFEVTSEL0
+ */
+ if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14)
+ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
+}
+
+static inline void set_enable_mask(unsigned int i)
+{
+ pfm_arch_bv_set_bit(i, enable_mask);
+
+ /* max_enable = highest + 1 */
+ if ((i+1) > max_enable)
+ max_enable = i + 1;
+}
+
+static void pfm_intel_arch_setup_generic(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd;
+ unsigned int i;
+
+ /*
+ * first we handle the generic counters:
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded generic counters as unimplemented
+ */
+
+ /*
+ * cap the number of HW counters at the number hardcoded in the tables
+ */
+ if (count >= PFM_IA_MAX_CNT) {
+ printk(KERN_INFO "perfmon: Limiting number of generic counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_CNT, count);
+ count = PFM_IA_MAX_CNT;
+ }
+
+ /*
+ * adjust rsvd_msk for generic counters based on actual width
+ * initialize enable_mask (1 per pmd)
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++) {
+ pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
+ set_enable_mask(i);
+ }
+
+ /*
+ * handle version 3 new anythread bit (21)
+ */
+ if (version == 3) {
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
+ }
+
+
+ /*
+ * mark unused generic counters as not available
+ */
+ for (i = count ; i < PFM_IA_MAX_CNT; i++) {
+ pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
+ pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
+ }
+}
+
+static void pfm_intel_arch_setup_fixed(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd, dfl;
+ unsigned int i;
+
+ /*
+ * handle the fixed counters (if any):
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded fixed counters as unimplemented
+ */
+ if (count >= PFM_IA_MAX_FCNT) {
+ printk(KERN_INFO "perfmon: Limiting number of fixed counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_FCNT, count);
+ count = PFM_IA_MAX_FCNT;
+ }
+ /*
+ * adjust rsvd_msk for fixed counters based on actual width
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
+
+ /*
+ * handle version new anythread bit (bit 2)
+ */
+ if (version == 3)
+ rsvd = 1ULL << 3;
+ else
+ rsvd = 3ULL << 2;
+
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = 0;
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2);
+
+ /*
+ * mark unused fixed counters as unimplemented
+ *
+ * update the rsvd_msk, dfl_val in FIXED_CTRL:
+ * - rsvd_msk: set all 4 bits
+ * - dfl_val : clear all 4 bits
+ */
+ dfl = pfm_intel_arch_pmc_desc[16].dfl_val;
+ rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk;
+
+ for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
+ rsvd |= 0xfULL << (i<<2);
+ dfl &= ~(0xfULL << (i<<2));
+ }
+
+ /*
+ * FIXED_CTR_CTRL unavailable when no fixed counters are defined
+ */
+ if (!count) {
+ pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA;
+ } else {
+ /* update rsvd_mask and dfl_val */
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd;
+ pfm_intel_arch_pmc_desc[16].dfl_val = dfl;
+ set_enable_mask(16);
+ }
+}
+
+static int pfm_intel_arch_probe_pmu(void)
+{
+ union {
+ unsigned int val;
+ struct pmu_eax eax;
+ struct pmu_edx edx;
+ } eax, edx;
+ unsigned int ebx, ecx;
+ unsigned int width = 0;
+
+ edx.val = 0;
+
+ if (!cpu_has_arch_perfmon) {
+ PFM_INFO("no support for Intel architectural PMU");
+ return -1;
+ }
+
+ if (!cpu_has_apic) {
+ PFM_INFO("no Local APIC, try rebooting with lapic option");
+ return -1;
+ }
+
+ /* cpuid() call protected by cpu_has_arch_perfmon */
+ cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
+
+ /*
+ * some 6/15 models have buggy BIOS
+ */
+ if (eax.eax.version == 0
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
+ eax.eax.version = 2;
+ eax.eax.num_cnt = 2;
+ eax.eax.cnt_width = 40;
+ }
+
+ /*
+ * some v2 BIOSes are incomplete
+ */
+ if (eax.eax.version == 2 && !edx.edx.num_cnt) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ edx.edx.cnt_width = 40;
+ }
+
+ /*
+ * no fixed counters on earlier versions
+ */
+ if (eax.eax.version < 2) {
+ edx.val = 0;
+ } else {
+ /*
+ * use the min value of both widths until we support
+ * variable width counters
+ */
+ width = eax.eax.cnt_width < edx.edx.cnt_width ?
+ eax.eax.cnt_width : edx.edx.cnt_width;
+ }
+
+ /*
+ * Intel Atom processors have buggy firmware which does not report
+ * the correct number of fixed counters
+ */
+ if (eax.eax.version == 3 && edx.edx.num_cnt < 3
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
+ PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ }
+
+ PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
+ PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
+ eax.eax.num_cnt,
+ eax.eax.cnt_width,
+ edx.edx.num_cnt,
+ edx.edx.cnt_width);
+
+ pfm_intel_arch_setup_generic(eax.eax.version,
+ width,
+ eax.eax.num_cnt);
+
+ pfm_intel_arch_setup_fixed(eax.eax.version,
+ width,
+ edx.edx.num_cnt);
+
+ pfm_intel_arch_check_errata();
+
+ pfm_intel_arch_version = eax.eax.version;
+
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
+ * @ctx: context to work on
+ *
+ * detect if counters have overflowed.
+ * return:
+ * 0 : no overflow
+ * 1 : at least one overflow
+ */
+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
+{
+ u64 *cnt_mask;
+ u64 wmask, val;
+ u16 i, num;
+
+ cnt_mask = ctx->regs.cnt_pmds;
+ num = ctx->regs.num_counters;
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ /*
+ * we can leverage the fact that we know the mapping
+ * to hardcode the MSR address and avoid accessing
+ * more cachelines
+ *
+ * We need to check cnt_mask because not all registers
+ * may be available.
+ */
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+ rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val);
+ if (!(val & wmask))
+ return 1;
+ num--;
+ }
+ }
+ return 0;
+}
+
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u64 used_mask[PFM_PMC_BV];
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already have a pending overflow condition, we simply
+ * return to take care of this first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ *
+ * all pmds are counters
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_intel_arch_quiesce(void)
+{
+ u16 i;
+
+ /*
+ * PMC16 is the fixed control register so it has a
+ * distinct MSR address
+ *
+ * We do not use the hw_addr field in the table to avoid touching
+ * too many cachelines
+ */
+ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
+ if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) {
+ if (i == 16)
+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+ else
+ wrmsrl(MSR_P6_EVNTSEL0+i, 0);
+ }
+ }
+}
+/**
+ * pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU
+ *
+ * Since v2, there exists a global control MSR to start/stop and
+ * also collect overflow status information. In particular,
+ * GLOBAL_CTRL controls start/stop and has one bit per counter.
+ * To maintain backward compatibility with v1, the power-on value
+ * of GLOBAL_CTRL should be such that generic counters are enabled
+ * but fixed counters are disabled (true on Penryn and Atom currently).
+ *
+ * Here, we simply make sure that all available counters are enabled.
+ * After that, start/stop is controlled on a per-counter basis.
+ */
+static void pfm_intel_arch_acquire_pmu_percpu(void)
+{
+ struct pfm_regmap_desc *d;
+ u64 mask = 0;
+ unsigned int i;
+
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ /*
+ * build bitmask of registers that are available to
+ * us. In some cases, there may be fewer registers than
+ * what the PMU supports due to sharing with other kernel
+ * subsystems, such as NMI
+ */
+ d = pfm_pmu_conf->pmd_desc;
+ for (i=0; i < 16; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << i;
+ }
+ for (i=16; i < PFM_IA_MAX_PMDS; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << (32+i-16);
+ }
+ /*
+ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
+ */
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+
+ PFM_DBG("global=0x%llx set to 0x%llx",
+ __get_cpu_var(saved_global_ctrl),
+ mask);
+ /*
+ * enable all registers
+ *
+ * No need to quiesce the PMU. If there is an overflow, it will be
+ * treated as spurious by the handler
+ */
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
+}
+
+/**
+ * pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU
+ *
+ * Since v2, there exists a global control MSR to start/stop and
+ * also collect overflow status information. In particular,
+ * GLOBAL_CTRL controls start/stop and has one bit per counter.
+ * To maintain backward compatibility with v1, the power-on value
+ * of GLOBAL_CTRL should be such that generic counters are enabled
+ * but fixed counters are disabled (true on Penryn and Atom currently).
+ *
+ * Here, we are done using the PMU, so we restore the power-on value.
+ */
+static void pfm_intel_arch_release_pmu_percpu(void)
+{
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ PFM_DBG("global_ctrl restored to 0x%llx\n",
+ __get_cpu_var(saved_global_ctrl));
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+}
+
+/*
+ * Counters may have model-specific width. Yet the documentation says
+ * that only the lower 32 bits can be written to due to the specification
+ * of wrmsr. Bits [32..(w-1)] are sign extensions of bit 31. Bits [w..63]
+ * must not be set (see rsvd_msk for PMDs). As such, the effective width of
+ * a counter is 31 bits only, regardless of what CPUID.0xa returns.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
+ */
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
+ .pmu_name = "Intel architectural",
+ .pmd_desc = pfm_intel_arch_pmd_desc,
+ .counter_width = 31,
+ .num_pmc_entries = PFM_IA_MAX_PMCS,
+ .num_pmd_entries = PFM_IA_MAX_PMDS,
+ .pmc_desc = pfm_intel_arch_pmc_desc,
+ .version = "1.0",
+ .pmu_info = &pfm_intel_arch_pmu_info
+};
+
+static int __init pfm_intel_arch_pmu_init_module(void)
+{
+ if (pfm_intel_arch_probe_pmu())
+ return -ENOSYS;
+
+ return pfm_pmu_register(&pfm_intel_arch_pmu_conf);
+}
+
+device_initcall(pfm_intel_arch_pmu_init_module);
Index: o3/arch/x86/perfmon/Kconfig
===================================================================
--- o3.orig/arch/x86/perfmon/Kconfig 2008-11-25 18:09:47.000000000 +0100
+++ o3/arch/x86/perfmon/Kconfig 2008-11-25 18:21:33.000000000 +0100
@@ -15,4 +15,11 @@
help
Enables perfmon debugging support

+config X86_PERFMON_INTEL_ARCH
+ bool "Support for Intel architectural perfmon v1/v2/v3"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Intel architectural performance counters.
+ This feature was introduced with Intel Core Solo/Core Duo processors.
endmenu

--


2008-11-26 14:56:22

by Thomas Gleixner

Subject: Re: [patch 21/24] perfmon: Intel architectural PMU support (x86)

On Wed, 26 Nov 2008, [email protected] wrote:

> +static u64 enable_mask[PFM_MAX_PMCS];

Why do we need enable_mask twice for AMD and Intel ?

> +static u16 max_enable;
> +static int pfm_intel_arch_version;
> +
> +DEFINE_PER_CPU(u64, saved_global_ctrl);

static

> +/*
> + * layout of EAX for CPUID.0xa leaf function
> + */
> +struct pmu_eax {
> + unsigned int version:8; /* architectural perfmon version */
> + unsigned int num_cnt:8; /* number of generic counters */
> + unsigned int cnt_width:8; /* width of generic counters */
> + unsigned int ebx_length:8; /* number of architected events */
> +};

in arch/x86/include/asm/intel_arch_perfmon.h we have already:

union cpuid10_eax {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split;
	unsigned int full;
};

Can we either use this or remove it ?
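
For illustration, a rough sketch (the function name is made up, not from the
patch) of how the probe could reuse union cpuid10_eax instead of the local
struct pmu_eax:

static int pfm_intel_arch_probe_sketch(void)
{
	union cpuid10_eax eax;
	unsigned int ebx, ecx, edx;

	/* CPUID leaf 0xa describes the architectural perfmon capabilities */
	cpuid(0xa, &eax.full, &ebx, &ecx, &edx);

	if (!eax.split.version_id)
		return -1;	/* no architectural perfmon */

	return eax.split.num_counters;
}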

> +/*
> + * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
> + */
> +struct pmu_edx {
> + unsigned int num_cnt:5; /* number of fixed counters */
> + unsigned int cnt_width:8; /* width of fixed counters */
> + unsigned int reserved:19;
> +};

> +static void pfm_intel_arch_acquire_pmu_percpu(void);
> +static void pfm_intel_arch_release_pmu_percpu(void);
> +static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
> + struct pfm_event_set *set);
> +static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
> +static void __kprobes pfm_intel_arch_quiesce(void);
> +
> +/*
> + * physical addresses of MSR controlling the perfevtsel and counter registers
> + */
> +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
> + .stop_save = pfm_intel_arch_stop_save,
> + .has_ovfls = pfm_intel_arch_has_ovfls,
> + .quiesce = pfm_intel_arch_quiesce,
> + .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
> + .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
> +};

A) static
B) Please move it to the bottom to avoid all the forward declarations.

> +static void pfm_intel_arch_check_errata(void)

__init ?

> +static void pfm_intel_arch_setup_generic(unsigned int version,

Ditto.

> +static void pfm_intel_arch_setup_fixed(unsigned int version,

Ditto.

> +static int pfm_intel_arch_probe_pmu(void)

Ditto.

> + /*
> + * handle version new anythread bit (bit 2)
> + */

-ENOPARSE

> + if (version == 3)
> + rsvd = 1ULL << 3;

This sets bit 3

> + else
> + rsvd = 3ULL << 2;

And this sets bit 2 and 3.
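
For reference, the 4-bit field in IA32_FIXED_CTR_CTRL for fixed counter i
is laid out as follows (illustrative macros, not from the patch):

/* one nibble of IA32_FIXED_CTR_CTRL per fixed counter i */
#define FIXED_EN_OS(i)		(1ULL << (((i) << 2) + 0))	/* count at CPL 0 */
#define FIXED_EN_USR(i)		(1ULL << (((i) << 2) + 1))	/* count at CPL > 0 */
#define FIXED_ANYTHREAD(i)	(1ULL << (((i) << 2) + 2))	/* AnyThread, v3 only */
#define FIXED_PMI(i)		(1ULL << (((i) << 2) + 3))	/* interrupt on overflow */

Since dfl_val forces PMI on, v3 marks only bit 3 of each nibble reserved
(AnyThread stays writable), while earlier versions mark bits 2 and 3 reserved.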

> +static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
> + struct pfm_event_set *set)
> +{
> + u64 used_mask[PFM_PMC_BV];
> + u64 val, wmask, ovfl_mask;
> + u32 i, count;
> +
> + wmask = 1ULL << pfm_pmu_conf->counter_width;
> +
> + pfm_arch_bv_and(used_mask,
> + set->used_pmcs,
> + enable_mask,
> + max_enable);
> +
> + count = pfm_arch_bv_weight(used_mask, max_enable);

So we have:

set->used_pmcs and enable_mask and max_enable.

Why can set->used_pmcs contain bits which are not in the enable_mask
in the first place ? Why does the arch code not tell the generic code
which pmcs are available so we can avoid all this mask, weight
whatever magic ?

We store the same information in slightly different incarnations in
various places and then we need to mingle them all together to get to
the real data. That makes no sense at all.

> + /*
> + * stop monitoring
> + * Unfortunately, this is very expensive!
> + * wrmsrl() is serializing.
> + */
> + for (i = 0; count; i++) {
> + if (pfm_arch_bv_test_bit(i, used_mask)) {
> + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
> + count--;
> + }
> + }
> +
> + /*
> + * if we already have a pending overflow condition, we simply
> + * return to take care of this first.
> + */
> + if (set->npend_ovfls)
> + return 1;

Why are the counters enabled at all when an overflow is pending, which
stopped the counters anyway ?

Thanks,

tglx

2008-11-26 15:45:39

by Stephane Eranian

Subject: Re: [patch 21/24] perfmon: Intel architectural PMU support (x86)

Thomas,


On Wed, Nov 26, 2008 at 3:55 PM, Thomas Gleixner <[email protected]> wrote:
> On Wed, 26 Nov 2008, [email protected] wrote:
>
>> +static u64 enable_mask[PFM_MAX_PMCS];
>
> Why do we need enable_mask twice for AMD and Intel ?
>
Ok, some explanations are needed here.

Both perfmon_intel_arch.c and perfmon_amd64.c are supposed to be kernel modules.
They are hardcoded right now to make the patch simpler. Having PMU descriptions
be modules is a key feature because it allows adding new processor support
without necessarily patching the core kernel or rebooting. This has been
working nicely on Itanium. With the introduction of Intel architectural
Perfmon (IA32 SDM chapter 18), this becomes possible on Intel x86 as well.
On AMD64 processors, this is not really an issue as they all use the same
architected PMU, except for family 10h which has nice extensions.

In any case, the idea is to encapsulate as much code related to a PMU model
as possible into each module. That is why you are seeing some redundancy.

There is a difference between enable_mask and used_pmcs. The used_pmcs
bitmask shows all the config registers in use, whereas enable_mask shows
all the config registers which have start/stop capabilities. For the basic
AMD64 PMU (4 counters), used_pmcs and enable_mask are equivalent, but that
is not the case on Barcelona once we support IBS and sampling. So for now,
I could clean this up and drop enable_mask and use plain used_pmcs.


>> +static u16 max_enable;
>> +static int pfm_intel_arch_version;
>> +
>> +DEFINE_PER_CPU(u64, saved_global_ctrl);
>
> static
>
>> +/*
>> + * layout of EAX for CPUID.0xa leaf function
>> + */
>> +struct pmu_eax {
>> + unsigned int version:8; /* architectural perfmon version */
>> + unsigned int num_cnt:8; /* number of generic counters */
>> + unsigned int cnt_width:8; /* width of generic counters */
>> + unsigned int ebx_length:8; /* number of architected events */
>> +};
>
> in arch/x86/include/asm/intel_arch_perfmon.h we have already:
>
> union cpuid10_eax {
> 	struct {
> 		unsigned int version_id:8;
> 		unsigned int num_counters:8;
> 		unsigned int bit_width:8;
> 		unsigned int mask_length:8;
> 	} split;
> 	unsigned int full;
> };
>
> Can we either use this or remove it ?
>
>> +/*
>> + * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
>> + */
>> +struct pmu_edx {
>> + unsigned int num_cnt:5; /* number of fixed counters */
>> + unsigned int cnt_width:8; /* width of fixed counters */
>> + unsigned int reserved:19;
>> +};
>
>> +static void pfm_intel_arch_acquire_pmu_percpu(void);
>> +static void pfm_intel_arch_release_pmu_percpu(void);
>> +static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
>> + struct pfm_event_set *set);
>> +static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
>> +static void __kprobes pfm_intel_arch_quiesce(void);
>> +
>> +/*
>> + * physical addresses of MSR controlling the perfevtsel and counter registers
>> + */
>> +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
>> + .stop_save = pfm_intel_arch_stop_save,
>> + .has_ovfls = pfm_intel_arch_has_ovfls,
>> + .quiesce = pfm_intel_arch_quiesce,
>> + .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
>> + .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
>> +};
>
> A) static
> B) Please move it to the bottom to avoid all the forward declarations.
>
>> +static void pfm_intel_arch_check_errata(void)
>
> __init ?
>
>> +static void pfm_intel_arch_setup_generic(unsigned int version,
>
> Ditto.
>
>> +static void pfm_intel_arch_setup_fixed(unsigned int version,
>
> Ditto.
>
>> +static int pfm_intel_arch_probe_pmu(void)
>
> Ditto.
>
>> + /*
>> + * handle version new anythread bit (bit 2)
>> + */
>
> -ENOPARSE
>
>> + if (version == 3)
>> + rsvd = 1ULL << 3;
>
> This sets bit 3
>
>> + else
>> + rsvd = 3ULL << 2;
>
> And this sets bit 2 and 3.
>
>> +static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
>> + struct pfm_event_set *set)
>> +{
>> + u64 used_mask[PFM_PMC_BV];
>> + u64 val, wmask, ovfl_mask;
>> + u32 i, count;
>> +
>> + wmask = 1ULL << pfm_pmu_conf->counter_width;
>> +
>> + pfm_arch_bv_and(used_mask,
>> + set->used_pmcs,
>> + enable_mask,
>> + max_enable);
>> +
>> + count = pfm_arch_bv_weight(used_mask, max_enable);
>
> So we have:
>
> set->used_pmcs and enable_mask and max_enable.
>
> Why can set->used_pmcs contain bits which are not in the enable_mask
> in the first place ? Why does the arch code not tell the generic code
> which pmcs are available so we can avoid all this mask, weight
> whatever magic ?
>

Because used_pmcs is part of the generic code and enable_mask is an x86
construct. As I said above, for now, I could drop enable_mask.
The arch code already exports the list of available pmcs and pmds in
impl_pmcs and impl_pmds.

There is a difference between impl_pmcs and used_pmcs. The former lists
everything that is available, the latter shows what we are currently using.
We may be using fewer registers than what is available, and we use this
information to avoid saving/restoring MSRs on context switch, for instance.
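
In code terms, a minimal sketch of why this matters (the helper is
hypothetical, but it uses the same bitmap accessors as the patch): only the
PMDs marked in set->used_pmds are touched, so a context using 2 of 16
counters pays for 2 counter reads on a switch instead of 16.

static void save_used_pmds_sketch(struct pfm_context *ctx,
				  struct pfm_event_set *set)
{
	u32 i, count = set->nused_pmds;

	for (i = 0; count; i++) {
		if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
			set->pmds[i] = pfm_arch_read_pmd(ctx, i);
			count--;
		}
	}
}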

> We store the same information in slightly different incarnations in
> various places and then we need to mingle them all together to get to
> the real data. That makes no sense at all.
>
>> + /*
>> + * stop monitoring
>> + * Unfortunately, this is very expensive!
>> + * wrmsrl() is serializing.
>> + */
>> + for (i = 0; count; i++) {
>> + if (pfm_arch_bv_test_bit(i, used_mask)) {
>> + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
>> + count--;
>> + }
>> + }
>> +
>> + /*
>> + * if we already have a pending overflow condition, we simply
>> + * return to take care of this first.
>> + */
>> + if (set->npend_ovfls)
>> + return 1;
>
> Why are the counters enabled at all when an overflow is pending, which
> stopped the counters anyway ?
>
Because on Intel and AMD64, counters are not automatically frozen on interrupt.
On Intel X86, they can be configured to do so, but it is an all or
nothing setting.
I am not using this option because we would then have a problem with the NMI
watchdog given that it is also using a counter.

2008-11-26 16:11:39

by Thomas Gleixner

Subject: Re: [patch 21/24] perfmon: Intel architectural PMU support (x86)

On Wed, 26 Nov 2008, stephane eranian wrote:
> In any case, the idea is to encapsulate as much code related to a PMU model
> as possible into each module. That is why you are seeing some redundancy.

Makes sense.

> There is a difference between enable_mask and used_pmcs. The used_pmcs
> bitmask shows all the config registers in use, whereas enable_mask shows
> all the config registers which have start/stop capabilities. For the basic
> AMD64 PMU (4 counters), used_pmcs and enable_mask are equivalent, but that
> is not the case on Barcelona once we support IBS and sampling. So for now,
> I could clean this up and drop enable_mask and use plain used_pmcs.

Understood. If we need that in the near future then it's ok to keep
it, it just did not make any sense from the current code.

But I think you should do this once when you set up the context and
keep that as a separate mask. Right now you evaluate enable_mask and
used_pmcs over and over again.
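
Something along these lines, as a minimal sketch (the set->arch_stop_*
fields are hypothetical; the bitmap helpers are the ones the patch already
uses), computed once when the event set is loaded:

static void pfm_precompute_stop_mask(struct pfm_event_set *set)
{
	/*
	 * cache "PMCs we actually have to stop" instead of
	 * recomputing it in every stop_save() call
	 */
	pfm_arch_bv_and(set->arch_stop_mask, set->used_pmcs,
			enable_mask, max_enable);
	set->arch_stop_count = pfm_arch_bv_weight(set->arch_stop_mask,
						  max_enable);
}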

> >> + count = pfm_arch_bv_weight(used_mask, max_enable);
> >
> > So we have:
> >
> > set->used_pmcs and enable_mask and max_enable.
> >
> > Why can set->used_pmcs contain bits which are not in the enable_mask
> > in the first place ? Why does the arch code not tell the generic code
> > which pmcs are available so we can avoid all this mask, weight
> > whatever magic ?
> >
>
> Because used_pmcs is part of the generic code and enable_mask is an x86
> construct. As I said above, for now, I could drop enable_mask.
> The arch code already exports the list of available pmcs and pmds in
> impl_pmcs and impl_pmds.

See above.

> > Why are the counters enabled at all when an overflow is pending, which
> > stopped the counters anyway ?
> >
> Because on Intel and AMD64, counters are not automatically frozen on interrupt.
> On Intel X86, they can be configured to do so, but it is an all or
> nothing setting.
> I am not using this option because we would then have a problem with the NMI
> watchdog given that it is also using a counter.

Well, my question was: why do we have to stop the counters when an
overflow is pending already ?

The overflow pending is set inside of stop_save() and cleared
somewhere else.

stop_save() is called from pfm_arch_stop() and
pfm_arch_ctxswout_thread(). The first thing it does is to disable the
counters.

Now at some point the counters are obviously reenabled for this
context, but why are they reenabled _before_ the pending overflow has
been resolved ? For N counters that's N * 2 wrmsrl() overhead.

Thanks,

tglx

2008-11-26 16:34:53

by Andi Kleen

Subject: Re: [patch 21/24] perfmon: Intel architectural PMU support (x86)

> Both perfmon_intel_arch.c and perfmon_amd64.c are supposed to be kernel modules.
> They are hardcoded right now to make the patch simpler. Having PMU descriptions
> be modules is a key feature because it allows adding new processor support
> without necessarily patching the core kernel or rebooting. This has been

To install the new processor in the first place you have
to reboot anyways.

Also, at least for new families (which tend to be the only ones with
radically new performance counters), it's typically necessary to change
some things in the core kernel anyways. So this doesn't seem like a
really useful feature.

> working nicely on Itanium. With the introduction of Intel architectural
> Perfmon (IA32 SDM chapter 18), this becomes possible on Intel x86 as well.

It becomes possible, but without having to use any modules. It should
just work.

It probably even worked without it -- at least if you limit yourself to
family 6 -- because the register layout all stayed the same too.

That said having modular PMUs is probably a good thing for
distribution kernels, but there is really no need for any
code compromises just to avoid a core kernel patch now and then.

-Andi

2008-12-02 03:09:20

by Stephane Eranian

Subject: Re: [patch 21/24] perfmon: Intel architectural PMU support (x86)

On Wed, Nov 26, 2008 at 3:55 PM, Thomas Gleixner <[email protected]> wrote:
> On Wed, 26 Nov 2008, [email protected] wrote:
>
>> +static u64 enable_mask[PFM_MAX_PMCS];
>
> Why do we need enable_mask twice for AMD and Intel ?
>
>> +static u16 max_enable;
>> +static int pfm_intel_arch_version;
>> +
>> +DEFINE_PER_CPU(u64, saved_global_ctrl);
>
> static
>
Why do you want this static instead of per-cpu?

>> +/*
>> + * layout of EAX for CPUID.0xa leaf function
>> + */
>> +struct pmu_eax {
>> + unsigned int version:8; /* architectural perfmon version */
>> + unsigned int num_cnt:8; /* number of generic counters */
>> + unsigned int cnt_width:8; /* width of generic counters */
>> + unsigned int ebx_length:8; /* number of architected events */
>> +};
>
> in arch/x86/include/asm/intel_arch_perfmon.h we have already:
>
> union cpuid10_eax {
> 	struct {
> 		unsigned int version_id:8;
> 		unsigned int num_counters:8;
> 		unsigned int bit_width:8;
> 		unsigned int mask_length:8;
> 	} split;
> 	unsigned int full;
> };
>
> Can we either use this or remove it ?
>

Well, I need more than eax. We could rewrite this union to include
eax and edx. So I propose we call it union cpuid10 and define it as:

union cpuid10 {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split_eax;
	struct {
		unsigned int num_counters:5;
		unsigned int bit_width:8;
		unsigned int reserved:19;
	} split_edx;
	unsigned int full;
};
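
A quick usage sketch (illustration only) with two instances of the union,
mirroring how the probe reads CPUID.0xa today:

union cpuid10 eax, edx;
unsigned int ebx, ecx;

cpuid(0xa, &eax.full, &ebx, &ecx, &edx.full);

/* eax carries the generic counter info, edx the fixed counter info */
PFM_INFO("v%u: %u generic counters of %u bits, %u fixed counters of %u bits",
	 eax.split_eax.version_id,
	 eax.split_eax.num_counters,
	 eax.split_eax.bit_width,
	 edx.split_edx.num_counters,
	 edx.split_edx.bit_width);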