This patch adds the X86 generic perfmon2 code. It is in charge of
implementing certain key functionalities required by the generic
code such as read/write of the PMU registers, low-level interrupt
handling.
Signed-off-by: Stephane Eranian <[email protected]>
--
Index: o/arch/x86/perfmon/Kconfig
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o/arch/x86/perfmon/Kconfig 2008-06-16 18:21:52.000000000 +0200
@@ -0,0 +1,18 @@
+menu "Hardware Performance Monitoring support"
+config PERFMON
+ bool "Perfmon2 performance monitoring interface"
+ select X86_LOCAL_APIC
+ default n
+ help
+ Enables the perfmon2 interface to access the hardware
+ performance counters. See <http://perfmon2.sf.net/> for
+ more details.
+
+config PERFMON_DEBUG
+ bool "Perfmon debugging"
+ default n
+ depends on PERFMON
+ help
+ Enables perfmon debugging support
+
+endmenu
Index: o/arch/x86/perfmon/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o/arch/x86/perfmon/Makefile 2008-06-16 18:21:52.000000000 +0200
@@ -0,0 +1,5 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <[email protected]>
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
Index: o/arch/x86/perfmon/perfmon.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o/arch/x86/perfmon/perfmon.c 2008-06-16 18:21:52.000000000 +0200
@@ -0,0 +1,634 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/interrupt.h>
+#include <linux/perfmon_kern.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU(unsigned long, real_iip);
+DEFINE_PER_CPU(int, pfm_using_nmi);
+DEFINE_PER_CPU(unsigned long, saved_lvtpc);
+
+/**
+ * pfm_arch_ctxswin_thread - thread context switch in
+ * @task: task switched in
+ * @ctx: context for the task (the active event set is taken from
+ *       ctx->active_set and cannot be NULL)
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked.
+ *
+ * Caller has already restored all PMD and PMC registers, if
+ * necessary (i.e., lazy restore scheme).
+ *
+ * On x86, the common code only needs to re-enable RDPMC if necessary.
+ *
+ * Model-specific features, e.g., PEBS, IBS, are taken care of in the
+ * corresponding PMU description module
+ */
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ /*
+ * restore saved real iip
+ */
+ if (ctx->active_set->npend_ovfls)
+ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
+
+ /*
+ * enable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ set_in_cr4(X86_CR4_PCE);
+}
+
+/**
+ * pfm_arch_ctxswout_thread - context switch out thread
+ * @task: task switched out
+ * @ctx : context switched out
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked. Monitoring may be active.
+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
+ *
+ * Return:
+ * non-zero : did not save PMDs (as part of stopping the PMU)
+ * 0 : saved PMDs (no need to save them in caller)
+ */
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * disable lazy restore of PMCS on ctxswin because
+ * we modify some of them.
+ */
+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+
+ if (ctx->active_set->npend_ovfls)
+ ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
+
+ /*
+ * disable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ clear_in_cr4(X86_CR4_PCE);
+
+ return pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_stop - deactivate monitoring
+ * @task: task to stop
+ * @ctx: context to stop
+ *
+ * Called from pfm_stop()
+ * Interrupts are masked. Context is locked. Set is the active set.
+ *
+ * For per-thread:
+ * task is not necessarily current. If not current task, then
+ * task is guaranteed stopped and off any cpu. Access to PMU
+ * is not guaranteed.
+ *
+ * For system-wide:
+ * task is current
+ *
+ * must disable active monitoring. ctx cannot be NULL
+ */
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * no need to go through stop_save()
+ * if we are already stopped
+ */
+ if (!ctx->flags.started)
+ return;
+
+ if (task != current)
+ return;
+
+ pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+
+/**
+ * pfm_arch_start - activate monitoring
+ * @task: task to start
+ * @ctx: context to start
+ *
+ * Interrupts are masked. Context is locked.
+ *
+ * For per-thread:
+ * Task is not necessarily current. If not current task, then task
+ * is guaranteed stopped and off any cpu. No access to PMU if task
+ * is not current.
+ *
+ * For system-wide:
+ * task is always current
+ */
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 *mask;
+ u16 i, num;
+
+ set = ctx->active_set;
+
+ /*
+ * cannot restore PMC if no access to PMU. Will be done
+ * when the thread is switched back in
+ */
+ if (task != current)
+ return;
+
+ /*
+ * we actually install all implemented pmcs registers because
+ * until started, we do not write any PMC registers.
+ * Note that registers used by other subsystems (e.g. NMI) are
+ * removed from pmcs.
+ *
+ * XXX: we may be able to optimize this for non-P4 PMU as pmcs are
+ * independent from each other. That would need to be in model
+ * specific start routine.
+ */
+ num = pfm_pmu_conf->regs.num_pmcs;
+ mask = pfm_pmu_conf->regs.pmcs;
+ for (i = 0; num; i++) {
+ if (test_bit(i, cast_ulp(mask))) {
+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * pfm_arch_restore_pmds - reload PMD registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw()
+ *
+ * Context is locked. Interrupts are masked. Set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ */
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ num = set->nused_pmds;
+
+ /*
+ * we can restore only the PMD we use because:
+ *
+ * - can only read with pfm_read_pmds() the registers
+ * declared used via pfm_write_pmds(), smpl_pmds, reset_pmds
+ *
+ * - if cr4.pce=1, only counters are exposed to user. RDPMC
+ * does not work with other types of PMU registers. Thus, no
+ * address is ever exposed by counters
+ *
+ * - there is never a dependency between one pmd register and
+ * another
+ */
+ for (i = 0; num; i++) {
+ if (likely(test_bit(i, cast_ulp(set->used_pmds)))) {
+ pfm_write_pmd(ctx, i, set->pmds[i].value);
+ num--;
+ }
+ }
+}
+
+/**
+ * pfm_arch_restore_pmcs - reload PMC registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked. set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ *
+ * function must restore all PMC registers from set
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ /*
+ * we need to restore PMCs only when:
+ * - context is not masked
+ * - monitoring activated
+ *
+ * Masking monitoring after an overflow does not change the
+ * value of flags.started
+ */
+ if (!ctx->flags.started)
+ return;
+
+ /*
+ * restore the pmcs flagged in set->used_pmcs
+ *
+ * NOTE(review): the function header asks for all PMC registers
+ * to be restored, and this comment used to say that restoring
+ * only the used pmcs is not possible because certain PMU models
+ * (e.g. Pentium 4) have dependencies and one application must
+ * not run with a stale PMC coming from another one. The loop
+ * below only writes the registers set in used_pmcs -- confirm
+ * which of the two is intended.
+ *
+ * Per the original note, used-only restore suits independent pmcs.
+ */
+ num = set->nused_pmcs;
+ for (i = 0; num; i++) {
+ if (test_bit(i, cast_ulp(set->used_pmcs))) {
+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible race with other, higher priority interrupts, such as timer
+ * or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+ unsigned long iip;
+ int using_nmi;
+
+ using_nmi = __get_cpu_var(pfm_using_nmi);
+
+ ack_APIC_irq();
+
+ irq_enter();
+
+ /*
+ * when using NMI, pfm_handle_nmi() gets called
+ * first. It stops monitoring and record the
+ * iip into real_iip, then it repost the interrupt
+ * using the lower priority vector LOCAL_PERFMON_VECTOR
+ *
+ * On some processors, e.g., P4, it may be that some
+ * state is already recorded from pfm_handle_nmi()
+ * and it only needs to be copied back into the normal
+ * fields so it can be used transparently by higher level
+ * code.
+ */
+ if (using_nmi)
+ iip = __get_cpu_var(real_iip);
+ else
+ iip = instruction_pointer(regs);
+
+ pfm_interrupt_handler(iip, regs);
+
+ /*
+ * On Intel processors:
+ * - it is necessary to clear the MASK field for the LVTPC
+ * vector. Otherwise interrupts remain masked. See
+ * section 8.5.1
+ * AMD X86-64:
+ * - the documentation does not stipulate the behavior but
+ * it seems to work without the write, so we skip
+ */
+ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+ irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * called from notify_die() notifier from a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, including the perfmon context lock
+ *
+ * Must detect if NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
+ */
+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
+ unsigned long val,
+ void *data)
+{
+ struct die_args *args = data;
+ struct pfm_context *ctx;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ /*
+ * only NMI related calls
+ */
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_DONE;
+
+ /*
+ * perfmon not using NMI
+ */
+ if (!__get_cpu_var(pfm_using_nmi))
+ return NOTIFY_DONE;
+
+ /*
+ * No context
+ */
+ ctx = __get_cpu_var(pmu_ctx);
+ if (!ctx) {
+ PFM_DBG_ovfl("no ctx");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * Detect if we have overflows, i.e., NMI interrupt
+ * caused by PMU
+ */
+ pmu_info = pfm_pmu_info();
+ if (!pmu_info->has_ovfls(ctx)) {
+ PFM_DBG_ovfl("no ovfl");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * we stop the PMU to avoid further overflow before this
+ * one is treated by lower priority interrupt handler
+ */
+ pmu_info->quiesce();
+
+ /*
+ * record actual instruction pointer
+ */
+ __get_cpu_var(real_iip) = instruction_pointer(args->regs);
+
+ /*
+ * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
+ */
+ pfm_arch_resend_irq(ctx);
+
+ /*
+ * we need to rewrite the APIC vector on Intel
+ */
+ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * the notification was for us
+ */
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block pfm_nmi_nb = {
+ .notifier_call = pfm_handle_nmi
+};
+
+/**
+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
+ *
+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
+ */
+void pfm_arch_resend_irq(struct pfm_context *ctx)
+{
+ unsigned long val, dest;
+ /*
+ * we cannot use hw_resend_irq() because it goes to
+ * the I/O APIC. We need to go to the Local APIC.
+ *
+ * The "int vec" is not the right solution either
+ * because it triggers a software intr. We need
+ * to regenerate the interrupt and have it pended
+ * until we unmask interrupts.
+ *
+ * Instead we send ourself an IPI on the perfmon
+ * vector.
+ */
+ val = APIC_DEST_SELF|APIC_INT_ASSERT|
+ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
+
+ dest = apic_read(APIC_ID);
+ apic_write(APIC_ICR2, dest);
+ apic_write(APIC_ICR, val);
+}
+
+/**
+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
+ * @data: contains pmu flags
+ */
+static void pfm_arch_pmu_acquire_percpu(void *data)
+{
+ unsigned int tmp, vec;
+ unsigned long flags = (unsigned long)data;
+ unsigned long lvtpc;
+
+ /*
+ * we only reprogram the LVTPC vector if we have detected
+ * no sharing, otherwise it means the APIC is already programmed
+ * and we use whatever vector (likely NMI) is there
+ */
+ if (!(flags & PFM_X86_FL_SHARING)) {
+ vec = LOCAL_PERFMON_VECTOR;
+
+ tmp = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, vec);
+ apic_write(APIC_LVTERR, tmp);
+ }
+ lvtpc = (unsigned long)apic_read(APIC_LVTPC);
+
+ /* NMI mode is detected by an exact match of the whole LVTPC entry */
+ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
+
+ PFM_DBG("LVTPC=0x%lx using_nmi=%d", lvtpc, __get_cpu_var(pfm_using_nmi));
+}
+
+/**
+ * pfm_arch_pmu_acquire - acquire PMU resource from system
+ * @unavail_pmcs: bitmask to use to set unavailable pmcs
+ * @unavail_pmds: bitmask to use to set unavailable pmds
+ *
+ * interrupts are not masked
+ *
+ * Grab PMU registers from lower level MSR allocator, then program the
+ * APIC with either LOCAL_PERFMON_VECTOR or NMI.
+ *
+ * Return: 0 on success, -EBUSY if registers are taken and PMU cannot share
+ */
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_regmap_desc *d;
+ u16 i, nlost;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ pmu_info->flags &= ~PFM_X86_FL_SHARING;
+
+ nlost = 0;
+
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ /*
+ * reserve register with lower-level allocator; on failure
+ * record it as unavailable and count it as lost
+ */
+ if (!reserve_evntsel_nmi(d->hw_addr)) {
+ PFM_DBG("pmc%d(%s) already used", i, d->desc);
+ __set_bit(i, cast_ulp(unavail_pmcs));
+ nlost++;
+ }
+ }
+ PFM_DBG("nlost=%d info_flags=0x%x", nlost, pmu_info->flags);
+ /*
+ * some PMU models (e.g., P6) do not support sharing
+ * so check if we found less than the expected number of PMC registers
+ */
+ if (nlost) {
+ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
+ PFM_INFO("PMU already used by another subsystem, "
+ "PMU does not support sharing, "
+ "try disabling Oprofile or "
+ "reboot with nmi_watchdog=0");
+ goto undo;
+ }
+ pmu_info->flags |= PFM_X86_FL_SHARING;
+ }
+
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (!reserve_perfctr_nmi(d->hw_addr)) {
+ PFM_DBG("pmd%d(%s) already used", i, d->desc);
+ __set_bit(i, cast_ulp(unavail_pmds));
+ }
+ }
+ /*
+ * program APIC on each CPU
+ */
+ on_each_cpu(pfm_arch_pmu_acquire_percpu,
+ (void *)(unsigned long)pmu_info->flags, 0, 1);
+
+ return 0;
+undo:
+ /*
+ * must undo reservation of pmcs in case of error
+ */
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+ if (!test_bit(i, cast_ulp(unavail_pmcs)))
+ release_evntsel_nmi(d->hw_addr);
+ }
+ return -EBUSY;
+}
+
+/**
+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
+ * @data: unused
+ */
+static void pfm_arch_pmu_release_percpu(void *data)
+{
+ __get_cpu_var(pfm_using_nmi) = 0;
+}
+
+/**
+ * pfm_arch_pmu_release - release PMU resource to system
+ *
+ * called from pfm_pmu_release()
+ * interrupts are not masked
+ *
+ * On x86, we return the PMU registers to the MSR allocator
+ */
+void pfm_arch_pmu_release(void)
+{
+ struct pfm_regmap_desc *d;
+ u16 i, n;
+
+ d = pfm_pmu_conf->pmc_desc;
+ n = pfm_pmu_conf->regs.num_pmcs;
+ for (i = 0; n; i++, d++) {
+ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmcs)))
+ continue;
+ release_evntsel_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmc%u released", i);
+ }
+ d = pfm_pmu_conf->pmd_desc;
+ n = pfm_pmu_conf->regs.num_pmds;
+ for (i = 0; n; i++, d++) {
+ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmds)))
+ continue;
+ release_perfctr_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmd%u released", i);
+ }
+
+ /* pfm_using_nmi is per-CPU and is set independently on each CPU, so
+  * the local copy says nothing about the others: clear it everywhere */
+ on_each_cpu(pfm_arch_pmu_release_percpu, NULL, 0, 1);
+}
+
+/**
+ * pfm_arch_init - one time global arch-specific initialization
+ *
+ * called from pfm_init()
+ */
+int __init pfm_arch_init(void)
+{
+ /*
+ * we need to register our NMI handler when the kernel boots
+ * to avoid a deadlock condition with the NMI watchdog or Oprofile
+ * if we were to try and register/unregister on-demand.
+ * A registration failure is reported to the caller.
+ */
+ return register_die_notifier(&pfm_nmi_nb);
+}
Index: o/include/asm-x86/perfmon.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o/include/asm-x86/perfmon.h 2008-06-16 18:21:52.000000000 +0200
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ *
+ * This file contains i386/x86_64 specific definitions for the perfmon
+ * interface.
+ *
+ * This file MUST never be included directly. Use linux/perfmon.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON__H_
+#define _ASM_X86_PERFMON__H_
+
+/*
+ * arch-specific user visible interface definitions
+ */
+
+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
+
+#endif /* _ASM_X86_PERFMON__H_ */
Index: o/include/asm-x86/perfmon_kern.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ o/include/asm-x86/perfmon_kern.h 2008-06-16 18:21:52.000000000 +0200
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <[email protected]>
+ *
+ * This file contains X86 Processor Family specific definitions
+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_KERN_H_
+#define _ASM_X86_PERFMON_KERN_H_
+
+#ifdef CONFIG_PERFMON
+#include <linux/unistd.h>
+#ifdef CONFIG_4KSTACKS
+#define PFM_ARCH_PMD_STK_ARG 2
+#define PFM_ARCH_PMC_STK_ARG 2
+#else
+#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */
+#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */
+#endif
+
+struct pfm_arch_pmu_info {
+ u32 flags; /* PMU feature flags */
+ /*
+ * mandatory model-specific callbacks
+ */
+ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
+ int (*has_ovfls)(struct pfm_context *ctx);
+ void (*quiesce)(void);
+
+ /*
+ * optional model-specific callbacks
+ */
+// void (*acquire_pmu_percpu)(void);
+// void (*release_pmu_percpu)(void);
+ int (*load_context)(struct pfm_context *ctx);
+ void (*unload_context)(struct pfm_context *ctx);
+};
+
+/*
+ * PMU feature flags
+ */
+#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */
+#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */
+
+struct pfm_x86_ctx_flags {
+ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */
+ unsigned int reserved:31; /* for future use */
+};
+
+struct pfm_arch_context {
+ u64 saved_real_iip; /* instr pointer of last NMI intr */
+ struct pfm_x86_ctx_flags flags; /* flags */
+};
+
+/*
+ * functions implemented as inline on x86
+ */
+
+/**
+ * pfm_arch_write_pmc - write a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ * @value: PMC 64-bit value
+ *
+ * in certain situations, ctx may be NULL
+ */
+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * we only write to the actual register when monitoring is
+ * active (pfm_start was issued)
+ */
+ if (ctx && ctx->flags.started == 0)
+ return;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_write_pmd - write a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ * @value: PMD 64-bit value
+ */
+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * to make sure the counter overflows, we set the
+ * upper bits. we also clear any other unimplemented
+ * bits as this may cause crash on some processors.
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
+ value = (value | ~pfm_pmu_conf->ovfl_mask)
+ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_read_pmd - read a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_read_pmc - read a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_is_active - return non-zero if monitoring has been started
+ * @ctx: context to check
+ *
+ * At certain points, perfmon needs to know if monitoring has been
+ * explicitly started.
+ *
+ * On x86, there is no other way but to use pfm_start/pfm_stop
+ * to activate monitoring, thus we can simply check flags.started
+ */
+static inline int pfm_arch_is_active(struct pfm_context *ctx)
+{
+ return ctx->flags.started;
+}
+
+
+/**
+ * pfm_arch_unload_context - detach context from thread or CPU
+ * @ctx: context to detach
+ *
+ * in system-wide ctx->task is NULL, otherwise it points to the
+ * attached thread
+ */
+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ if (ctx_arch->flags.insecure) {
+ PFM_DBG("clear cr4.pce");
+ clear_in_cr4(X86_CR4_PCE);
+ }
+
+ if (pmu_info->unload_context)
+ pmu_info->unload_context(ctx);
+}
+
+/**
+ * pfm_arch_load_context - attach context to thread or CPU
+ * @ctx: context to attach
+ */
+static inline int pfm_arch_load_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+ int ret = 0;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * RDPMC is authorized for per-thread self-monitoring.
+ * NOTE(review): this comment used to also list system-wide,
+ * but the test below never matches if ctx->task is NULL
+ * in system-wide mode -- confirm the intended behavior.
+ *
+ * RDPMC only gives access to counts. The context-switch
+ * code does not restore all the PMD registers
+ * (optimization), thus there is a possible leak of
+ * counts there in per-thread mode.
+ */
+ if (ctx->task == current) {
+ PFM_DBG("set cr4.pce");
+ set_in_cr4(X86_CR4_PCE);
+ ctx_arch->flags.insecure = 1;
+ }
+
+ if (pmu_info->load_context)
+ ret = pmu_info->load_context(ctx);
+
+ return ret;
+}
+
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
+
+/**
+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
+ * @ctx: current context
+ * @set: current event set
+ *
+ * called from __pfm_interrupt_handler().
+ * ctx is not NULL. ctx is locked. interrupts are masked
+ *
+ * The following actions must take place:
+ * - stop all monitoring to ensure handler has consistent view.
+ * - collect overflowed PMDs bitmask into povfls_pmds and
+ * npend_ovfls. If no interrupt detected then npend_ovfls
+ * must be set to zero.
+ */
+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ /*
+ * on X86, freezing is equivalent to stopping
+ */
+ pfm_arch_stop(current, ctx);
+
+ /*
+ * we mark monitoring as stopped to avoid
+ * certain side effects especially in
+ * pfm_switch_sets_from_intr() and
+ * pfm_arch_restore_pmcs()
+ */
+ ctx->flags.started = 0;
+}
+
+/**
+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
+ * @ctx: current context
+ *
+ * ctx may be NULL when dealing with spurious interrupts
+ *
+ * Must re-activate monitoring if context is not MASKED.
+ * interrupts are masked.
+ */
+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
+{
+ if (ctx == NULL)
+ return;
+
+ PFM_DBG_ovfl("state=%d", ctx->state);
+
+ /*
+ * restore flags.started, cleared in pfm_arch_intr_freeze_pmu().
+ * NOTE(review): done without testing ctx->state (MASKED) -- confirm
+ */
+ ctx->flags.started = 1;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
+ * @ctx: current context
+ * @cnum: PMD index
+ *
+ * On some CPUs, the upper bits of a counter must be set in order for the
+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
+ * and the upper bits are cleared. This function may be used to set them back.
+ *
+ * For x86, the current version loses whatever is remaining in the counter,
+ * which usually has a small count. In order not to lose this count,
+ * we do a read-modify-write to set the upper bits while preserving the
+ * low-order bits. This is slow but works.
+ */
+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 val;
+ val = pfm_arch_read_pmd(ctx, cnum);
+ pfm_arch_write_pmd(ctx, cnum, val);
+}
+
+/**
+ * pfm_arch_context_create - create context
+ * @ctx: newly created context
+ * @ctx_flags: context flags as passed by user
+ *
+ * called from __pfm_create_context()
+ */
+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
+{
+ return 0;
+}
+
+/**
+ * pfm_arch_context_free - free context
+ * @ctx: context to free
+ */
+static inline void pfm_arch_context_free(struct pfm_context *ctx)
+{}
+
+/*
+ * functions implemented in arch/x86/perfmon/perfmon.c
+ */
+int pfm_arch_init(void);
+void pfm_arch_resend_irq(struct pfm_context *ctx);
+
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
+
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
+void pfm_arch_pmu_config_remove(void);
+char *pfm_arch_get_pmu_module_name(void);
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
+void pfm_arch_pmu_release(void);
+
+static inline void pfm_arch_serialize(void)
+{}
+
+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
+{}
+
+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
+{}
+
+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
+/*
+ * x86 does not need extra alignment requirements for the sampling buffer
+ */
+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
+
+asmlinkage void pmu_interrupt(void);
+
+#endif /* CONFIG_PERFMON */
+
+#endif /* _ASM_X86_PERFMON_KERN_H_ */
Index: o/arch/x86/Kconfig
===================================================================
--- o.orig/arch/x86/Kconfig 2008-06-16 18:23:57.000000000 +0200
+++ o/arch/x86/Kconfig 2008-06-16 18:24:29.000000000 +0200
@@ -1306,6 +1306,8 @@
If unsure, say Y.
+source "arch/x86/perfmon/Kconfig"
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
Index: o/arch/x86/Makefile
===================================================================
--- o.orig/arch/x86/Makefile 2008-06-16 18:22:54.000000000 +0200
+++ o/arch/x86/Makefile 2008-06-16 18:23:26.000000000 +0200
@@ -176,6 +176,8 @@
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
+
# Remaining sub architecture files
core-y += $(mcore-y)
Index: o/include/asm-x86/Kbuild
===================================================================
--- o.orig/include/asm-x86/Kbuild 2008-06-16 18:24:46.000000000 +0200
+++ o/include/asm-x86/Kbuild 2008-06-16 18:25:00.000000000 +0200
@@ -11,6 +11,7 @@
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += perfmon.h
unifdef-y += e820.h
unifdef-y += ist.h
--
Hi Stephane,
On Tue, 17 Jun 2008 15:02:22 -0700 (PDT) [email protected] wrote:
>
> This patch adds the X86 generic perfmon2 code. It is in charge of
> implementing certain key functionalities required by the generic
> code such as read/write of the PMU registers, low-level interrupt
> handling.
After applying patches 1-5, building an x86_64 allmodconfig produces
these errors:
In file included from include/linux/perfmon_kern.h:167,
from arch/x86/perfmon/perfmon.c:25:
include/linux/perfmon_pmu.h:92: warning: ‘struct pfarg_pmc’ declared inside parameter list
include/linux/perfmon_pmu.h:92: warning: its scope is only this definition or declaration, which is probably not what you want
include/linux/perfmon_pmu.h:96: warning: ‘struct pfarg_pmd’ declared inside parameter list
Those structures aren't defined until patch 11.
In file included from arch/x86/perfmon/perfmon.c:25:
include/linux/perfmon_kern.h: In function ‘pfm_copy_thread’:
include/linux/perfmon_kern.h:199: error: ‘TIF_PERFMON_CTXSW’ undeclared (first use in this function)
arch/x86/perfmon/perfmon.c: In function ‘smp_pmu_interrupt’:
arch/x86/perfmon/perfmon.c:338: error: ‘LOCAL_PERFMON_VECTOR’ undeclared (first use in this function)
arch/x86/perfmon/perfmon.c: In function ‘pfm_arch_resend_irq’:
arch/x86/perfmon/perfmon.c:451: error: ‘LOCAL_PERFMON_VECTOR’ undeclared (first use in this function)
arch/x86/perfmon/perfmon.c: In function ‘pfm_arch_pmu_acquire_percpu’:
arch/x86/perfmon/perfmon.c:475: error: ‘LOCAL_PERFMON_VECTOR’ undeclared (first use in this function)
These are not defined until patch 7.
So I think some rearrangement is still required.
Also, some of the x86 files that you are modifying have had their 32 and
64 bit versions combined in linux-next (which is the stuff of the next
merge window) so just a heads up that there will be more merge work then.
Lastly, some of this will clash with the generic IPIs work.
I don't want to put you off, you have done great work here, thanks.
--
Cheers,
Stephen Rothwell [email protected]
http://www.canb.auug.org.au/~sfr/
Stephen,
The patch series is currently structured in such a way that the stack
is not buildable at each step.
I should add this to the introduction, but you have a first "plateau"
at perfmon-syscalls. You cannot
build before that point.
As for the linux-next, I do not use it currently, I am still on
mainline, maybe that's a mistake.
I will look into this tree.
As for IPI, I hope they have maintained a smp_call_function_single()
kind of functionality.
Thanks for your feedback and keep it coming.
On Mon, Jun 23, 2008 at 6:11 PM, Stephen Rothwell <[email protected]> wrote:
> Hi Stephane,
>
> On Tue, 17 Jun 2008 15:02:22 -0700 (PDT) [email protected] wrote:
>>
>> This patch adds the X86 generic perfmon2 code. It is in charge of
>> implementing certain key functionalities required by the generic
>> code such as read/write of the PMU registers, low-level interrupt
>> handling.
>
> After applying patches 1-5, building an x86_64 allmodconfig produces
> these errors:
>
> In file included from include/linux/perfmon_kern.h:167,
> from arch/x86/perfmon/perfmon.c:25:
> include/linux/perfmon_pmu.h:92: warning: 'struct pfarg_pmc' declared inside parameter list
> include/linux/perfmon_pmu.h:92: warning: its scope is only this definition or declaration, which is probably not what you want
> include/linux/perfmon_pmu.h:96: warning: 'struct pfarg_pmd' declared inside parameter list
>
> Those structures aren't defined until patch 11.
>
> In file included from arch/x86/perfmon/perfmon.c:25:
> include/linux/perfmon_kern.h: In function 'pfm_copy_thread':
> include/linux/perfmon_kern.h:199: error: 'TIF_PERFMON_CTXSW' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'smp_pmu_interrupt':
> arch/x86/perfmon/perfmon.c:338: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'pfm_arch_resend_irq':
> arch/x86/perfmon/perfmon.c:451: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'pfm_arch_pmu_acquire_percpu':
> arch/x86/perfmon/perfmon.c:475: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
>
> These are not defined until patch 7.
>
> So I think some rearrangement is still required.
>
> Also, some of the x86 files that you are modifying have had their 32 and
> 64 bit versions combined in linux-next (which is the stuff of the next
> merge window) so just a heads up that there will be more merge work then.
> Lastly, some of this will clash with the generic IPIs work.
>
> I don't want to put you off, you have done great work here, thanks.
>
> --
> Cheers,
> Stephen Rothwell [email protected]
> http://www.canb.auug.org.au/~sfr/
>
Stephen,
Ok, so I cloned your linux-next tree and ported my perfmon2 minimal patch
series to it. As you said, there were a handful of files which caused problems,
mostly because of the x86 merge. But the net effect is that the series is now a bit
smaller. I tried some simple perfmon tests and they worked fine. I did not go
all the way to the patch which needed IPI (i.e. system-wide support).
However, I had problems with this tree and my Broadcom BCM5754 network
card. It was not recognized by the tg3 driver, so it is hard to use this kernel.
On Mon, Jun 23, 2008 at 6:11 PM, Stephen Rothwell <[email protected]> wrote:
> Hi Stephane,
>
> On Tue, 17 Jun 2008 15:02:22 -0700 (PDT) [email protected] wrote:
>>
>> This patch adds the X86 generic perfmon2 code. It is in charge of
>> implementing certain key functionalities required by the generic
>> code such as read/write of the PMU registers, low-level interrupt
>> handling.
>
> After applying patches 1-5, building an x86_64 allmodconfig produces
> these errors:
>
> In file included from include/linux/perfmon_kern.h:167,
> from arch/x86/perfmon/perfmon.c:25:
> include/linux/perfmon_pmu.h:92: warning: 'struct pfarg_pmc' declared inside parameter list
> include/linux/perfmon_pmu.h:92: warning: its scope is only this definition or declaration, which is probably not what you want
> include/linux/perfmon_pmu.h:96: warning: 'struct pfarg_pmd' declared inside parameter list
>
> Those structures aren't defined until patch 11.
>
> In file included from arch/x86/perfmon/perfmon.c:25:
> include/linux/perfmon_kern.h: In function 'pfm_copy_thread':
> include/linux/perfmon_kern.h:199: error: 'TIF_PERFMON_CTXSW' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'smp_pmu_interrupt':
> arch/x86/perfmon/perfmon.c:338: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'pfm_arch_resend_irq':
> arch/x86/perfmon/perfmon.c:451: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
> arch/x86/perfmon/perfmon.c: In function 'pfm_arch_pmu_acquire_percpu':
> arch/x86/perfmon/perfmon.c:475: error: 'LOCAL_PERFMON_VECTOR' undeclared (first use in this function)
>
> These are not defined until patch 7.
>
> So I think some rearrangement is still required.
>
> Also, some of the x86 files that you are modifying have had their 32 and
> 64 bit versions combined in linux-next (which is the stuff of the next
> merge window) so just a heads up that there will be more merge work then.
> Lastly, some of this will clash with the generic IPIs work.
>
> I don't want to put you off, you have done great work here, thanks.
>
> --
> Cheers,
> Stephen Rothwell [email protected]
> http://www.canb.auug.org.au/~sfr/
>