This patch contains the core of perfmon2.
The core consists of:
- back-end to most system calls
- notification message queue management
- sampling buffer allocation
- support functions for sampling
- context allocation and destruction
- user level notification
- perfmon2 initialization
- permission checking
--- linux-2.6.17.9.base/include/linux/perfmon.h 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.9/include/linux/perfmon.h 2006-08-21 03:37:46.000000000 -0700
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ */
+
+#ifndef __LINUX_PERFMON_H__
+#define __LINUX_PERFMON_H__
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * user interface definitions
+ *
+ * Do not use directly for applications, use libpfm/libc provided
+ * header file instead.
+ */
+
+#define PFM_MAX_HW_PMCS 256
+#define PFM_MAX_HW_PMDS 256
+#define PFM_MAX_XTRA_PMCS 64
+#define PFM_MAX_XTRA_PMDS 64
+
+#define PFM_MAX_PMCS (PFM_MAX_HW_PMCS+PFM_MAX_XTRA_PMCS)
+#define PFM_MAX_PMDS (PFM_MAX_HW_PMDS+PFM_MAX_XTRA_PMDS)
+
+/*
+ * number of elements for each type of bitvector
+ * all bitvectors use u64 fixed size type on all architectures.
+ */
+#define PFM_BVSIZE(x) (((x)+(sizeof(u64)<<3)-1) / (sizeof(u64)<<3))
+#define PFM_HW_PMD_BV PFM_BVSIZE(PFM_MAX_HW_PMDS)
+#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS)
+#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS)
+
+/*
+ * custom sampling buffer identifier type
+ */
+typedef __u8 pfm_uuid_t[16];
+
+/*
+ * PMC/PMD flags to use with pfm_write_pmds() or pfm_write_pmcs()
+ *
+ * reg_flags layout:
+ * bit 00-15 : generic flags
+ * bit 16-23 : arch-specific flags
+ * bit 24-31 : error codes
+ */
+#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on overflow */
+#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize sampling interval */
+#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation for counter */
+
+/*
+ * event set flags layout:
+ * bit 00-15 : generic flags
+ * bit 16-31 : arch-specific flags
+ */
+#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */
+#define PFM_SETFL_TIME_SWITCH 0x02 /* switch set on timeout */
+#define PFM_SETFL_EXPL_NEXT 0x04 /* use set_id_next as the next set */
+#define PFM_SETFL_EXCL_IDLE 0x08 /* exclude idle task (syswide only) */
+
+/*
+ * PMD/PMC return flags in case of error (ignored on input)
+ *
+ * reg_flags layout:
+ * bit 00-15 : generic flags
+ * bit 16-23 : arch-specific flags
+ * bit 24-31 : error codes
+ *
+ * Output only: check them when a vector command (pfarg_pmc/pfarg_pmd)
+ * fails with EINVAL. Constants are unsigned so the shift into bit 31
+ * is well defined and the mask matches the 32-bit reg_flags field.
+ */
+#define PFM_REG_RETFL_NOTAVAIL (1U<<31) /* implemented but not available */
+#define PFM_REG_RETFL_EINVAL (1U<<30) /* entry is invalid */
+#define PFM_REG_RETFL_NOSET (1U<<29) /* event set does not exist */
+#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\
+ PFM_REG_RETFL_EINVAL|\
+ PFM_REG_RETFL_NOSET)
+
+#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0)
+
+typedef __u32 __bitwise pfm_flags_t;
+/*
+ * Request structure used to define a context
+ */
+struct pfarg_ctx {
+ pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use */
+ pfm_flags_t ctx_flags; /* noblock/block/syswide */
+ __s32 ctx_fd; /* ret arg: fd for context */
+ __u64 ctx_smpl_buf_size; /* ret arg: actual buffer size */
+ __u64 ctx_reserved3[12]; /* for future use */
+};
+/*
+ * context flags (ctx_flags)
+ *
+ */
+#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */
+#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */
+#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */
+#define PFM_FL_MAP_SETS 0x10 /* event sets are remapped */
+
+
+/*
+ * argument structure for pfm_write_pmcs()
+ */
+struct pfarg_pmc {
+ __u16 reg_num; /* which register */
+ __u16 reg_set; /* event set for this register */
+ pfm_flags_t reg_flags; /* input: flags, return: reg error */
+ __u64 reg_value; /* pmc value */
+ __u64 reg_reserved2[4]; /* for future use */
+};
+
+/*
+ * argument structure for pfm_write_pmds() and pfm_read_pmds()
+ */
+struct pfarg_pmd {
+ __u16 reg_num; /* which register */
+ __u16 reg_set; /* event set for this register */
+ pfm_flags_t reg_flags; /* input: flags, return: reg error */
+ __u64 reg_value; /* pmd value */
+ __u64 reg_long_reset; /* value to reload after notification */
+ __u64 reg_short_reset; /* reset after counter overflow */
+ __u64 reg_last_reset_val; /* return: PMD last reset value */
+ __u64 reg_ovfl_switch_cnt; /* #overflows before switch */
+ __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */
+ __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */
+ __u64 reg_smpl_eventid; /* opaque event identifier */
+ __u64 reg_random_mask; /* bitmask used to limit random value */
+ __u32 reg_random_seed; /* seed for randomization */
+ __u32 reg_reserved2[7]; /* for future use */
+};
+
+/*
+ * optional argument to pfm_start(), pass NULL if no arg needed
+ */
+struct pfarg_start {
+ __u16 start_set; /* event set to start with */
+ __u16 start_reserved1; /* for future use */
+ __u32 start_reserved2; /* for future use */
+ __u64 reserved3[3]; /* for future use */
+};
+
+/*
+ * argument to pfm_load_context()
+ */
+struct pfarg_load {
+ __u32 load_pid; /* thread to attach to */
+ __u16 load_set; /* set to load first */
+ __u16 load_reserved1; /* for future use */
+ __u64 load_reserved2[3]; /* for future use */
+};
+
+/*
+ * argument to pfm_create_evtsets()/pfm_delete_evtsets()
+ *
+ * max timeout: about 1h11mn35s (2^32 usecs, set_timeout is 32 bits)
+ */
+struct pfarg_setdesc {
+ __u16 set_id; /* which set */
+ __u16 set_id_next; /* next set to go to */
+ pfm_flags_t set_flags; /* input: flags, return: err flag */
+ __u32 set_timeout; /* req/eff switch timeout in usecs */
+ __u32 set_reserved1; /* for future use */
+ __u64 set_mmap_offset; /* ret arg: cookie for mmap offset */
+ __u64 reserved[5]; /* for future use */
+};
+
+/*
+ * argument to pfm_getinfo_evtsets()
+ */
+struct pfarg_setinfo {
+ __u16 set_id; /* which set */
+ __u16 set_id_next; /* out: next set to go to */
+ pfm_flags_t set_flags; /* out: flags or error */
+ __u64 set_ovfl_pmds[PFM_PMD_BV]; /* out: last ovfl PMDs */
+ __u64 set_runs; /* out: #times the set was active */
+ __u32 set_timeout; /* out: effective switch timeout in usecs */
+ __u32 set_reserved1; /* for future use */
+ __u64 set_act_duration; /* out: time set active (cycles) */
+ __u64 set_mmap_offset; /* cookie for mmap offset */
+ __u64 set_avail_pmcs[PFM_PMC_BV];/* PMC bitmask; NOTE(review): name says "avail", old comment said "unavailable" - confirm polarity */
+ __u64 set_avail_pmds[PFM_PMD_BV];/* PMD bitmask; NOTE(review): name says "avail", old comment said "unavailable" - confirm polarity */
+ __u64 reserved[4]; /* for future use */
+};
+
+/*
+ * default value for the user and group security parameters in
+ * /proc/sys/kernel/perfmon/sys_group
+ * /proc/sys/kernel/perfmon/task_group
+ */
+#define PFM_GROUP_PERM_ANY (-1) /* any user/group; parenthesized so it is safe inside expressions */
+
+/*
+ * remapped set view
+ *
+ * IMPORTANT: cannot be bigger than PAGE_SIZE
+ */
+struct pfm_set_view {
+ __u32 set_status; /* set status: active/inact */
+ __u32 set_reserved1; /* for future use */
+ __u64 set_runs; /* number of activations */
+ __u64 set_pmds[PFM_MAX_PMDS]; /* 64-bit value of PMDS */
+ volatile unsigned long set_seq; /* sequence number of updates */
+};
+
+/*
+ * pfm_set_view status flags
+ */
+#define PFM_SETVFL_ACTIVE 0x1 /* set is active */
+
+struct pfm_ovfl_msg {
+ __u32 msg_type; /* generic message header */
+ __u32 msg_ovfl_pid; /* process id */
+ __u64 msg_ovfl_pmds[PFM_HW_PMD_BV];/* overflowed PMDs */
+ __u16 msg_active_set; /* active set at overflow */
+ __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */
+ __u32 msg_ovfl_tid; /* kernel thread id */
+ __u64 msg_ovfl_ip; /* IP on PMU intr */
+};
+
+#define PFM_MSG_OVFL 1 /* an overflow happened */
+#define PFM_MSG_END 2 /* task to which context was attached ended */
+
+union pfm_msg {
+ __u32 type;
+ struct pfm_ovfl_msg pfm_ovfl_msg;
+};
+
+/*
+ * perfmon version number
+ */
+#define PFM_VERSION_MAJ 2U
+#define PFM_VERSION_MIN 2U
+#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\
+ (PFM_VERSION_MIN & 0xffff))
+#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
+#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
+
+
+/*
+ * This part of the header file is meant for kernel level code only including
+ * kernel modules
+ */
+#ifdef __KERNEL__
+
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+
+/*
+ * perfmon context state
+ */
+#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */
+#define PFM_CTX_LOADED 2 /* context is loaded onto a task */
+#define PFM_CTX_MASKED 3 /* context is loaded, monitoring is masked */
+#define PFM_CTX_ZOMBIE 4 /* context lost owner but is still attached */
+
+/*
+ * depth of message queue
+ */
+#define PFM_MAX_MSGS 8
+#define PFM_CTXQ_EMPTY(g) ((g)->msgq_head == (g)->msgq_tail)
+
+/*
+ * type of PMD reset for pfm_reset_pmds() or pfm_switch_sets()
+ */
+#define PFM_PMD_RESET_NONE 0 /* do not reset (pfm_switch_set) */
+#define PFM_PMD_RESET_SHORT 1 /* use short reset value */
+#define PFM_PMD_RESET_LONG 2 /* use long reset value */
+
+/*
+ * describe the content of the pfm_syst_info field
+ */
+#define PFM_CPUINFO_TIME_SWITCH 0x20 /* current set is time-switched */
+
+struct pfm_controls {
+ int debug; /* debugging via syslog */
+ int debug_ovfl; /* overflow handling debugging */
+ gid_t sys_group; /* gid to create a syswide context */
+ gid_t task_group; /* gid to create a per-task context */
+ size_t arg_size_max; /* maximum vector argument size */
+ size_t smpl_buf_size_max; /* max buf mem, -1 for infinity */
+ int pmd_read;
+};
+
+DECLARE_PER_CPU(struct task_struct *, pmu_owner);
+DECLARE_PER_CPU(struct pfm_context *, pmu_ctx);
+DECLARE_PER_CPU(unsigned long, pfm_syst_info);
+DECLARE_PER_CPU(u64, pmu_activation_number);
+DECLARE_PER_CPU(struct pfm_stats, pfm_stats);
+
+/*
+ * logging
+ */
+#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x)
+#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x)
+#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x)
+#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x)
+
+/*
+ * debugging
+ */
+#define PFM_DEBUGGING 1
+#ifdef PFM_DEBUGGING
+#define PFM_DBG(f, x...) \
+ do { \
+ if (unlikely(pfm_controls.debug >0)) { \
+ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \
+ __FUNCTION__, __LINE__, \
+ smp_processor_id(), current->pid , ## x); \
+ } \
+ } while (0)
+
+#define PFM_DBG_ovfl(f, x...) \
+ do { \
+ if (unlikely(pfm_controls.debug_ovfl >0)) { \
+ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \
+ __FUNCTION__, __LINE__, \
+ smp_processor_id(), current->pid , ## x); \
+ } \
+ } while (0)
+#else
+#define PFM_DBG(f, x...) do {} while(0)
+#define PFM_DBG_ovfl(f, x...) do {} while(0)
+#endif
+
+/*
+ * global information about all sessions
+ * mostly used to synchronize between system wide and per-process
+ */
+struct pfm_sessions {
+ u32 pfs_task_sessions;/* #num loaded per-thread sessions */
+ u32 pfs_sys_sessions; /* #num loaded system wide sessions */
+ size_t pfs_cur_smpl_buf_mem; /* current smpl buf mem usage */
+ cpumask_t pfs_sys_cpumask;/* bitmask of used cpus */
+};
+
+/*
+ * PMD information
+ * software maintained value is in the pfm_set_view structure.
+ */
+struct pfm_pmd {
+ u64 lval; /* last reset value */
+ u64 ovflsw_thres; /* #overflows left before switching */
+ u64 long_reset; /* reset value on sampling overflow */
+ u64 short_reset; /* reset value on overflow */
+ u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */
+ u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */
+ u64 mask; /* mask for generator */
+ u64 seed; /* seed for generator (must be 64 bits here) */
+ u32 flags; /* notify/do not notify */
+ u64 ovflsw_ref_thres; /* #overflows before switching to next set */
+ u64 eventid; /* overflow event identifier */
+};
+
+/*
+ * event set: per-set PMC/PMD state; a context keeps an ordered list of sets
+ */
+struct pfm_event_set {
+ u16 id;
+ u16 id_next; /* which set to go to from this one */
+ pfm_flags_t flags; /* public set flags */
+
+ struct list_head list; /* next in the ordered list */
+ struct pfm_event_set *sw_next; /* address of set to go to */
+ pfm_flags_t priv_flags; /* private flags */
+ u32 npend_ovfls; /* number of pending PMD overflow */
+
+ u64 used_pmds[PFM_PMD_BV]; /* used PMDs */
+ u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */
+ u64 ovfl_pmds[PFM_PMD_BV]; /* last overflowed PMDs */
+ u64 reset_pmds[PFM_PMD_BV]; /* PMDs to reset */
+ u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */
+ u64 pmcs[PFM_MAX_PMCS]; /* PMC values */
+
+ u16 nused_pmds; /* max number of used PMDs */
+ u16 nused_pmcs; /* max number of used PMCs */
+
+ struct pfm_pmd pmds[PFM_MAX_PMDS]; /* 64-bit SW PMDs */
+ struct pfm_set_view *view; /* pointer to view */
+ u64 switch_timeout; /* switch timeout */
+ u64 timeout; /* timeout remaining */
+ u64 duration_start; /* start cycles */
+ u64 duration; /* total active cycles */
+ off_t mmap_offset; /* view mmap offset */
+ u64 used_pmcs[PFM_PMC_BV]; /* used PMCs (keep for arbitration) */
+
+ unsigned long last_iip; /* last interrupt instruction pointer */
+ u64 last_ovfl_pmd_reset;/* reset of lowest idx of last overflowed pmds */
+ unsigned int last_ovfl_pmd; /* lowest idx of last overflowed pmds */
+};
+
+/*
+ * common private event set flags (priv_flags)
+ *
+ * upper 16 bits: for arch-specific use
+ * lower 16 bits: for common use
+ */
+#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */
+#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */
+#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */
+#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS | PFM_SETFL_PRIV_MOD_PMCS)
+
+/*
+ * context flags
+ */
+struct pfm_context_flags {
+ unsigned int block:1; /* task blocks on user notifications */
+ unsigned int system:1; /* do system wide monitoring */
+ unsigned int excl_idle:1; /* exclude idle task (syswide) */
+ unsigned int no_msg:1; /* no message sent on overflow */
+ unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */
+ unsigned int switch_ovfl:1; /* switch set on counter ovfl */
+ unsigned int switch_time:1; /* switch set on timeout */
+ unsigned int mapset:1; /* event sets are remapped */
+ unsigned int started:1; /* pfm_start() issued */
+ unsigned int trap_reason:2; /* reason for pfm_handle_work() */
+ unsigned int kapi:1; /* is kernel level context */
+ unsigned int reserved:20; /* for future use */
+};
+
+/*
+ * values for trap_reason
+ */
+#define PFM_TRAP_REASON_NONE 0x0 /* nothing to do */
+#define PFM_TRAP_REASON_BLOCK 0x1 /* block on overflow */
+#define PFM_TRAP_REASON_RESET 0x2 /* reset PMDs */
+#define PFM_TRAP_REASON_ZOMBIE 0x3 /* cleanup because of ZOMBIE */
+
+/*
+ * check_mask bitmask values for pfm_check_task_state()
+ */
+#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */
+#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */
+#define PFM_CMD_UNLOAD 0x04 /* command is unload */
+
+#include <linux/perfmon_pmu.h>
+#include <linux/perfmon_fmt.h>
+#include <linux/perfmon_kernel.h>
+
+/*
+ * perfmon context: encapsulates all the state of a monitoring session
+ */
+struct pfm_context {
+ spinlock_t lock; /* context protection */
+
+ struct file *filp; /* filp */
+
+ struct pfm_context_flags flags; /* flags */
+ u32 state; /* state */
+ struct task_struct *task; /* attached task */
+
+ struct completion restart_complete;/* block on notification */
+ u64 duration_start; /* last cycles at last activation */
+ u64 duration; /* total cycles context was active */
+ u64 last_act; /* last activation */
+ u32 last_cpu; /* last CPU used (SMP only) */
+ u32 cpu; /* cpu bound to context */
+ struct pfm_smpl_fmt *smpl_fmt; /* buffer format callbacks */
+ void *smpl_addr; /* smpl buffer base */
+ size_t smpl_size;
+
+ wait_queue_head_t msgq_wait; /* used when flags.kapi=0 */
+ struct completion *msgq_comp; /* when kapi=1 */
+ union pfm_msg msgq[PFM_MAX_MSGS];
+ int msgq_head;
+ int msgq_tail;
+
+ struct fasync_struct *async_queue;
+
+ struct pfm_event_set *active_set; /* active set */
+ struct list_head list; /* ordered list of sets */
+
+ /*
+ * save stack space by allocating temporary variables for
+ * pfm_overflow_handler() in pfm_context
+ */
+ struct pfm_ovfl_arg ovfl_arg;
+ u64 ovfl_ovfl_notify[PFM_PMD_BV];
+};
+
+#define pfm_ctx_arch(c) ((struct pfm_arch_context *)((c)+1))
+
+static inline void pfm_set_pmu_owner(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ BUG_ON(task && task->pid == 0); /* a pid 0 task must never be recorded as owner */
+ __get_cpu_var(pmu_owner) = task; /* record the owner task for this CPU */
+ __get_cpu_var(pmu_ctx) = ctx; /* ... and the context passed along with it */
+}
+
+static inline void pfm_inc_activation(void)
+{
+ __get_cpu_var(pmu_activation_number)++;
+}
+
+static inline void pfm_set_activation(struct pfm_context *ctx)
+{
+ ctx->last_act = __get_cpu_var(pmu_activation_number);
+}
+
+static inline void pfm_set_last_cpu(struct pfm_context *ctx, int cpu)
+{
+ ctx->last_cpu = cpu;
+}
+
+static inline void pfm_modview_begin(struct pfm_event_set *set)
+{
+ set->view->set_seq++; /* bump the view's update sequence number before modifying it */
+}
+
+static inline void pfm_modview_end(struct pfm_event_set *set)
+{
+ set->view->set_seq++; /* bump again; NOTE(review): begin/end look like a seqcount pair with no barrier here - confirm reader protocol */
+}
+
+/* macro, not a function: the error bits must land in the caller's flags lvalue */
+#define pfm_retflag_set(flags, val) \
+ do { \
+ (flags) = ((flags) & ~PFM_REG_RETFL_MASK) | (val); \
+ } while (0)
+
+extern struct _pfm_pmu_config *pfm_pmu_conf;
+extern struct pfm_controls pfm_controls;
+
+int pfm_get_args(void __user *, size_t, size_t, void *, void **);
+int pfm_get_smpl_arg(pfm_uuid_t, void __user *, size_t,
+ void **, struct pfm_smpl_fmt **);
+
+int pfm_alloc_fd(struct file **);
+
+int __pfm_write_pmcs(struct pfm_context *, struct pfarg_pmc *, int);
+int __pfm_write_pmds(struct pfm_context *, struct pfarg_pmd *, int, int);
+int __pfm_read_pmds(struct pfm_context *, struct pfarg_pmd *, int);
+void __pfm_reset_stats(void);
+int __pfm_load_context(struct pfm_context *, struct pfarg_load *);
+int __pfm_unload_context(struct pfm_context *, int);
+int __pfm_stop(struct pfm_context *);
+int __pfm_restart(struct pfm_context *);
+int __pfm_start(struct pfm_context *, struct pfarg_start *);
+int __pfm_delete_evtsets(struct pfm_context *, void *, int);
+int __pfm_getinfo_evtsets(struct pfm_context *, struct pfarg_setinfo *, int);
+int __pfm_create_evtsets(struct pfm_context *, struct pfarg_setdesc *, int);
+int __pfm_create_context(struct pfarg_ctx *, struct pfm_smpl_fmt *,
+ void *,
+ int,
+ struct completion *,
+ struct pfm_context **);
+int pfm_check_task_state(struct pfm_context *, int, unsigned long *);
+
+struct pfm_event_set *pfm_find_set(struct pfm_context *, u16, int);
+struct pfm_context * pfm_get_ctx(int);
+
+void pfm_context_free(struct pfm_context *);
+struct pfm_context *pfm_context_alloc(void);
+int pfm_pmu_conf_get(int);
+void pfm_pmu_conf_put(void);
+
+int pfm_reserve_session(struct pfm_context *, u32);
+int pfm_release_session(struct pfm_context *, u32);
+
+int pfm_smpl_buffer_alloc(struct pfm_context *, size_t);
+int pfm_reserve_buf_space(size_t);
+void pfm_release_buf_space(size_t);
+
+struct pfm_smpl_fmt *pfm_smpl_fmt_get(pfm_uuid_t);
+void pfm_smpl_fmt_put(struct pfm_smpl_fmt *);
+int pfm_use_smpl_fmt(pfm_uuid_t);
+
+int pfm_sysfs_init(void);
+ssize_t pfm_sysfs_session_show(char *, size_t , int);
+int pfm_sysfs_remove_pmu(struct _pfm_pmu_config *);
+int pfm_sysfs_add_pmu(struct _pfm_pmu_config *);
+
+int pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *);
+int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *);
+
+irqreturn_t pfm_interrupt_handler(int, void *, struct pt_regs *);
+void pfm_save_pmds_release(struct pfm_context *);
+
+void pfm_reset_pmds(struct pfm_context *, struct pfm_event_set *, int);
+
+void __pfm_handle_switch_timeout(void);
+int pfm_prepare_sets(struct pfm_context *, struct pfm_event_set *);
+int pfm_sets_init(void);
+int pfm_mmap_set(struct pfm_context *, struct vm_area_struct *, size_t);
+void pfm_free_sets(struct pfm_context *);
+void pfm_init_evtset(struct pfm_event_set *);
+void pfm_switch_sets(struct pfm_context *,
+ struct pfm_event_set *,
+ int,
+ int);
+
+void pfm_save_pmds(struct pfm_context *, struct pfm_event_set *);
+void pfm_mask_monitoring(struct pfm_context *);
+int pfm_ovfl_notify_user(struct pfm_context *,
+ struct pfm_event_set *,
+ unsigned long);
+
+int init_pfm_fs(void);
+int pfm_is_fd(struct file *);
+
+u64 carta_random32 (u64);
+
+int __pfm_close(struct pfm_context *, struct file *);
+ssize_t __pfmk_read(struct pfm_context *, union pfm_msg *, int);
+
+static inline void pfm_put_ctx(struct pfm_context *ctx)
+{
+ fput(ctx->filp); /* drop a reference on ctx->filp */
+}
+
+#define PFM_MAX_NUM_SETS 65536 /* one per possible u16 set id */
+#define PFM_SET_REMAP_SCALAR PAGE_SIZE
+#define PFM_SET_REMAP_OFFS 16384 /* number of pages to offset */
+#define PFM_SET_REMAP_BASE (PFM_SET_REMAP_OFFS*PAGE_SIZE) /* same offset, in bytes */
+#define PFM_SET_REMAP_OFFS_MAX (PFM_SET_REMAP_OFFS+\
+ PFM_MAX_NUM_SETS*PFM_SET_REMAP_SCALAR) /* NOTE(review): adds a page count (OFFS) to a byte count - confirm units */
+
+#define PFM_ONE_64 ((u64)1)
+
+struct pfm_stats {
+ u64 pfm_ovfl_intr_replay_count; /* replayed ovfl interrupts */
+ u64 pfm_ovfl_intr_regular_count;/* processed ovfl interrupts */
+ u64 pfm_ovfl_intr_all_count; /* total ovfl interrupts */
+ u64 pfm_ovfl_intr_cycles; /* cycles in ovfl interrupts */
+ u64 pfm_ovfl_intr_phase1; /* cycles in ovfl interrupts (phase 1) */
+ u64 pfm_ovfl_intr_phase2; /* cycles in ovfl interrupts (phase 2) */
+ u64 pfm_ovfl_intr_phase3; /* cycles in ovfl interrupts (phase 3) */
+ u64 pfm_fmt_handler_calls; /* # calls smpl buffer handler */
+ u64 pfm_fmt_handler_cycles; /* cycles in smpl format handler */
+ u64 pfm_set_switch_count; /* #set_switches on this CPU */
+ u64 pfm_set_switch_cycles; /* cycles for switching sets */
+ u64 pfm_ctxsw_count; /* #context switches on this CPU */
+ u64 pfm_ctxsw_cycles; /* cycles for context switches */
+ u64 pfm_handle_timeout_count; /* #count of set timeouts handled */
+ struct kobject kobj; /* for sysfs internal use only */
+};
+#define to_stats(n) container_of(n, struct pfm_stats, kobj)
+
+
+#include <asm/perfmon.h>
+
+extern struct file_operations pfm_file_ops;
+
+/*
+ * max vector argument elements for local storage (no kmalloc/kfree)
+ * The PFM_ARCH_PM*_ARG should be defined in the arch specific perfmon.h
+ * file. If not, default (conservative) values are used
+ */
+
+#ifndef PFM_ARCH_PMC_STK_ARG
+#define PFM_ARCH_PMC_STK_ARG 1
+#endif
+
+#ifndef PFM_ARCH_PMD_STK_ARG
+#define PFM_ARCH_PMD_STK_ARG 1
+#endif
+
+#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG
+#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG
+
+#define PFM_BPL 64
+#define PFM_LBPL 6 /* log2(BPL) */
+
+/*
+ * those operations are not provided by linux/bitmap.h.
+ * We do not need atomicity nor volatile accesses here.
+ * All bitmaps are 64-bit wide.
+ */
+static inline void pfm_bv_set(u64 *bv, unsigned int rnum)
+{
+ bv[rnum / PFM_BPL] |= PFM_ONE_64 << (rnum % PFM_BPL); /* word index, then bit within word */
+}
+
+static inline int pfm_bv_isset(u64 *bv, unsigned int rnum)
+{
+ return (int)((bv[rnum / PFM_BPL] >> (rnum % PFM_BPL)) & 1); /* 1 if bit rnum is set, else 0 */
+}
+
+static inline void pfm_bv_clear(u64 *bv, unsigned int rnum)
+{
+ bv[rnum / PFM_BPL] &= ~(PFM_ONE_64 << (rnum % PFM_BPL)); /* word index, then bit within word */
+}
+
+/*
+ * read a single PMD register. PMD register mapping is provided by the PMU
+ * description module. Some PMD registers require a special read
+ * handler (e.g., virtual PMD mapping to a SW resource).
+ */
+static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ /* common case: not a PFM_REG_V register, read through the arch layer */
+ if (likely(!(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)))
+ return pfm_arch_read_pmd(ctx, cnum);
+ return pfm_pmu_conf->pmd_sread(ctx, cnum);
+}
+
+static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value)
+{
+ /*
+ * PFM_REG_RO: write dropped; PFM_REG_V: pmd_swrite(); else arch write
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO)
+ return;
+
+ if (!(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)) {
+ pfm_arch_write_pmd(ctx, cnum, value);
+ return;
+ }
+ pfm_pmu_conf->pmd_swrite(ctx, cnum, value);
+}
+
+#define ulp(_x) ((unsigned long *)(_x)) /* cast _x to unsigned long *; argument parenthesized so expressions cast as a whole */
+
+#define PFM_NORMAL 0
+#define PFM_COMPAT 1
+#define PFM_KAPI 2
+
+/*
+ * kernel level interface
+ */
+int pfmk_create_context(struct pfarg_ctx *, void *,
+ size_t,
+ struct completion *,
+ void **,
+ void **);
+int pfmk_write_pmcs(void *, struct pfarg_pmc *, int);
+int pfmk_write_pmds(void *, struct pfarg_pmd *, int);
+int pfmk_read_pmds(void *, struct pfarg_pmd *, int);
+int pfmk_restart(void *);
+int pfmk_stop(void *);
+int pfmk_start(void *, struct pfarg_start *);
+int pfmk_load_context(void *, struct pfarg_load *);
+int pfmk_unload_context(void *);
+int pfmk_delete_evtsets(void *, struct pfarg_setinfo *, int); /* NOTE(review): create/delete are documented to take pfarg_setdesc - confirm the type */
+int pfmk_create_evtsets(void *, struct pfarg_setdesc *, int);
+int pfmk_getinfo_evtsets(void *, struct pfarg_setinfo *, int);
+int pfmk_close(void *);
+ssize_t pfmk_read(void *, union pfm_msg *, size_t, int);
+
+#endif /* __KERNEL__ */
+
+#endif /* CONFIG_PERFMON */
+
+#endif /* __LINUX_PERFMON_H__ */
--- linux-2.6.17.9.base/include/linux/perfmon_kernel.h 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.9/include/linux/perfmon_kernel.h 2006-08-21 03:37:46.000000000 -0700
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ *
+ * Kernel hooks for perfmon
+ */
+#ifndef __PERFMON_KERNEL_H__
+#define __PERFMON_KERNEL_H__ 1
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_PERFMON
+
+void __pfm_exit_thread(struct task_struct *);
+void __pfm_copy_thread(struct task_struct *);
+void __pfm_ctxsw(struct task_struct *, struct task_struct *);
+void __pfm_handle_work(struct task_struct *);
+void __pfm_handle_switch_timeout(void);
+void pfm_vector_init(void);
+
+static inline void pfm_exit_thread(struct task_struct *task)
+{
+ if (task->pfm_context)
+ __pfm_exit_thread(task);
+}
+
+static inline void pfm_handle_work(struct task_struct *task)
+{
+ if (task->pfm_context)
+ __pfm_handle_work(task);
+}
+
+static inline void pfm_copy_thread(struct task_struct *task)
+{
+ /*
+ * task is child state
+ * perfmon context is never shared with child tasks
+ */
+ task->pfm_context = NULL;
+ clear_tsk_thread_flag(task, TIF_PERFMON);
+}
+
+static inline void pfm_ctxsw(struct task_struct *p, struct task_struct *n)
+{
+ __pfm_ctxsw(p, n);
+}
+
+static inline void pfm_handle_switch_timeout(void)
+{
+ /* only CPUs flagged as running a time-switched set call the handler */
+ if (!(__get_cpu_var(pfm_syst_info) & PFM_CPUINFO_TIME_SWITCH))
+ return;
+ __pfm_handle_switch_timeout();
+}
+
+#define tsks_have_perfmon(p, n) /* true if either task has TIF_PERFMON set */ \
+ (test_tsk_thread_flag((p), TIF_PERFMON) \
+ || test_tsk_thread_flag((n), TIF_PERFMON))
+#else /* !CONFIG_PERFMON */
+
+#define tsks_have_perfmon(p, n) (0)
+
+#define pfm_exit_thread(_t) do { } while (0)
+#define pfm_handle_work(_t) do { } while (0)
+#define pfm_copy_thread(_t, _r...) do { } while (0) /* accepts the one-argument form used by the CONFIG_PERFMON inline */
+#define pfm_ctxsw(_p, _t) do { } while (0)
+#define pfm_handle_switch_timeout() do { } while (0)
+#define pfm_vector_init() do { } while (0)
+#define pfm_release_dbregs(_t) do { } while (0)
+#define pfm_use_dbregs(_t) (0)
+
+#endif /* CONFIG_PERFMON */
+
+#endif /* __KERNEL__ */
+
+#endif /* __PERFMON_KERNEL_H__ */
--- linux-2.6.17.9.base/perfmon/perfmon.c 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.9/perfmon/perfmon.c 2006-08-21 03:37:46.000000000 -0700
@@ -0,0 +1,2370 @@
+/*
+ * perfmon.c: perfmon2 core functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[email protected]>
+ * David Mosberger-Tang <[email protected]>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/sysctl.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/perfmon.h>
+
+/*
+ * internal variables
+ */
+static kmem_cache_t *pfm_ctx_cachep;
+
+
+/*
+ * external variables
+ */
+
+DEFINE_PER_CPU(unsigned long, pfm_syst_info); /* PFM_CPUINFO_* flags */
+DEFINE_PER_CPU(struct task_struct *, pmu_owner); /* task checked for direct PMU access */
+DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); /* context checked by pfm_flush_pmds() */
+DEFINE_PER_CPU(u64, pmu_activation_number);
+DEFINE_PER_CPU(struct pfm_stats, pfm_stats);
+
+#define PFM_INVALID_ACTIVATION ((u64)~0)
+
+/*
+ * Reset PMD register flags
+ */
+#define PFM_PMD_RESET_NONE 0 /* do not reset (pfm_switch_set) */
+#define PFM_PMD_RESET_SHORT 1 /* use short reset value */
+#define PFM_PMD_RESET_LONG 2 /* use long reset value */
+
+/*
+ * Reserve the next slot of the circular message queue. The queue is
+ * full, and NULL is returned, when advancing the tail would reach head.
+ */
+static union pfm_msg *pfm_get_new_msg(struct pfm_context *ctx)
+{
+	int slot = ctx->msgq_tail;
+	int new_tail = (slot + 1) % PFM_MAX_MSGS;
+
+	PFM_DBG("head=%d tail=%d", ctx->msgq_head, ctx->msgq_tail);
+
+	if (new_tail == ctx->msgq_head)
+		return NULL;
+
+	ctx->msgq_tail = new_tail;
+
+	PFM_DBG("head=%d tail=%d msg=%d",
+		ctx->msgq_head, ctx->msgq_tail, slot);
+	return ctx->msgq + slot;
+}
+/* empty the message queue by rewinding head and tail to slot 0 */
+static inline void pfm_reset_msgq(struct pfm_context *ctx)
+{
+ ctx->msgq_head = ctx->msgq_tail = 0;
+}
+/*
+ * free a context: its event sets, its sampling buffer (if any), then itself
+ */
+void pfm_context_free(struct pfm_context *ctx)
+{
+	/* fetched up front: ctx is already freed when the format is put */
+	struct pfm_smpl_fmt *smpl_fmt = ctx->smpl_fmt;
+	void *buf;
+
+	pfm_free_sets(ctx);
+
+	buf = ctx->smpl_addr;
+	if (buf != NULL) {
+		PFM_DBG("freeing sampling buffer @%p size=%zu",
+			ctx->smpl_addr, ctx->smpl_size);
+
+		/* buffer space is only reserved for non-kapi contexts */
+		if (!ctx->flags.kapi)
+			pfm_release_buf_space(ctx->smpl_size);
+
+		/* let the format clean up before the buffer goes away */
+		if (smpl_fmt->fmt_exit)
+			(*smpl_fmt->fmt_exit)(buf);
+
+		vfree(buf);
+	}
+
+	PFM_DBG("free ctx @%p", ctx);
+	kmem_cache_free(pfm_ctx_cachep, ctx);
+
+	/* drop the references on the PMU description and the format */
+	pfm_pmu_conf_put();
+	pfm_smpl_fmt_put(smpl_fmt);
+}
+
+/*
+ * Validate the format arguments, allocate the sampling buffer when the
+ * format requests one and run the format init callback. Only called for
+ * the current task. Returns 0 or the error of the failing step; when
+ * fmt_init fails the buffer is released and detached from ctx.
+ */
+static int pfm_setup_smpl_fmt(struct pfm_smpl_fmt *fmt, void *fmt_arg,
+			       struct pfm_context *ctx, u32 ctx_flags,
+			       int mode, struct file *filp)
+{
+	size_t size = 0;
+	int ret = 0;
+
+	/*
+	 * validate parameters
+	 */
+	if (fmt->fmt_validate) {
+		ret = (*fmt->fmt_validate)(ctx_flags, pfm_pmu_conf->num_pmds,
+					   fmt_arg);
+		PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * check if buffer format wants to use perfmon
+	 * buffer allocation/mapping service
+	 */
+	if (fmt->fmt_getsize) {
+		ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size);
+		if (ret) {
+			PFM_DBG("cannot get size ret=%d", ret);
+			goto error;
+		}
+	}
+
+	if (size) {
+#ifdef CONFIG_IA64_PERFMON_COMPAT
+		if (mode == PFM_COMPAT)
+			ret = pfm_smpl_buffer_alloc_old(ctx, size, filp);
+		else
+#endif
+			ret = pfm_smpl_buffer_alloc(ctx, size);
+		if (ret)
+			goto error;
+	}
+
+	if (fmt->fmt_init) {
+		ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags,
+				       pfm_pmu_conf->num_pmds,
+				       fmt_arg);
+		if (ret)
+			goto error_buffer;
+	}
+	return 0;
+
+error_buffer:
+	if (!ctx->flags.kapi)
+		pfm_release_buf_space(ctx->smpl_size);
+	/* we do not call fmt_exit, if init has failed */
+	vfree(ctx->smpl_addr);
+	/* forget the buffer so that pfm_context_free() cannot free it twice */
+	ctx->smpl_addr = NULL;
+	ctx->smpl_size = 0;
+error:
+	return ret;
+}
+
+
+/* save the active set's PMDs, mask monitoring and close the set's duration */
+void pfm_mask_monitoring(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 now_itc;
+
+ PFM_DBG_ovfl("masking monitoring");
+
+ now_itc = pfm_arch_get_itc();
+ set = ctx->active_set;
+
+ /*
+ * monitoring can only be masked as a result of a valid
+ * counter overflow. In UP and per-thread mode,
+ * it is possible that the current task may not be the
+ * one that generated the overflow because the overflow happened
+ * very close to the context switch point where interrupts are
+ * masked. In SMP per-thread, current is always the task that
+ * generated the overflow.
+ *
+ * For system-wide, the current task is always the one that
+ * generated the overflow.
+ *
+ * In any case, accessing the PMU directly is always safe
+ * given that we are only called from the overflow handler.
+ */
+ pfm_modview_begin(set);
+ pfm_save_pmds(ctx, set);
+ pfm_modview_end(set);
+ pfm_arch_mask_monitoring(ctx);
+
+ /*
+ * accumulate the set duration up to this point
+ */
+ set->duration += now_itc - set->duration_start;
+}
+
+/*
+ * Unmask monitoring of a MASKED context (any other state is a no-op).
+ * interrupts are masked when entering this function.
+ */
+static void pfm_unmask_monitoring(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 now_itc;
+
+ if (ctx->state != PFM_CTX_MASKED)
+ return;
+
+ PFM_DBG("unmasking monitoring");
+
+ set = ctx->active_set;
+
+ /*
+ * must be done before calling
+ * pfm_arch_unmask_monitoring()
+ */
+ ctx->state = PFM_CTX_LOADED;
+
+ pfm_arch_restore_pmds(ctx, set);
+
+ pfm_arch_unmask_monitoring(ctx);
+
+ now_itc = pfm_arch_get_itc();
+
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ /*
+ * reset set duration timer
+ */
+ set->duration_start = now_itc;
+}
+
+struct pfm_context *pfm_context_alloc(void)
+{
+	struct pfm_context *new_ctx;
+
+	/*
+	 * The slab object holds struct pfm_context immediately followed
+	 * by the architecture specific portion, accessible at
+	 * ctx_arch = (ctx+1). Both parts are zeroed before use.
+	 */
+	new_ctx = kmem_cache_alloc(pfm_ctx_cachep, SLAB_ATOMIC);
+	if (new_ctx == NULL)
+		return NULL;
+
+	memset(new_ctx, 0, sizeof(*new_ctx)+PFM_ARCH_CTX_SIZE);
+	PFM_DBG("alloc ctx @%p", new_ctx);
+	return new_ctx;
+}
+
+/*
+ * Allocate the kernel sampling buffer (new mode): only the kernel
+ * buffer is allocated here, an explicit mmap() is needed to remap
+ * it at the user level.
+ *
+ * Returns 0, the pfm_reserve_buf_space() error, or -ENOMEM.
+ */
+int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize)
+{
+	size_t size = PAGE_ALIGN(rsize);	/* requested size, page aligned */
+	void *addr;
+	int ret;
+
+	PFM_DBG("sampling buffer rsize=%zu size=%zu", rsize, size);
+
+	/* buffer space is reserved first, except for kapi contexts */
+	if (!ctx->flags.kapi) {
+		ret = pfm_reserve_buf_space(size);
+		if (ret)
+			return ret;
+	}
+
+	addr = vmalloc(size);
+	if (addr != NULL) {
+		memset(addr, 0, size);
+
+		ctx->smpl_addr = addr;
+		ctx->smpl_size = size;
+
+		PFM_DBG("kernel smpl buffer @%p", addr);
+
+		return 0;
+	}
+
+	PFM_DBG("cannot allocate sampling buffer");
+
+	/* undo the reservation taken above */
+	if (!ctx->flags.kapi)
+		pfm_release_buf_space(size);
+	return -ENOMEM;
+}
+/* pick the short/long reset value of a PMD, randomized if PFM_REGFL_RANDOM */
+static inline u64 pfm_new_pmd_value (struct pfm_pmd *reg, int reset_mode)
+{
+	u64 seed = reg->seed, rnd_mask = reg->mask;
+	u64 val, next_seed;
+
+	if (reset_mode == PFM_PMD_RESET_LONG)
+		val = reg->long_reset;
+	else
+		val = reg->short_reset;
+
+	if (reg->flags & PFM_REGFL_RANDOM) {
+		next_seed = carta_random32(seed);
+		/* counter values are negative numbers! */
+		val -= (seed & rnd_mask);
+		/* construct a full 64-bit random value: */
+		if ((rnd_mask >> 32) != 0)
+			next_seed |= (u64)carta_random32((u32)(seed >> 32)) << 32;
+		reg->seed = next_seed;
+	}
+	reg->lval = val;
+	return val;
+}
+/* reload every PMD flagged in set->reset_pmds with its short or long reset value */
+void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set,
+ int reset_mode)
+{
+ u64 ovfl_mask, hw_val;
+ u64 *cnt_mask, *reset_pmds;
+ u64 val;
+ unsigned int i, max_pmd, not_masked;
+
+ reset_pmds = set->reset_pmds;
+ max_pmd = pfm_pmu_conf->max_pmd;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_mask = pfm_pmu_conf->cnt_pmds;
+ not_masked = ctx->state != PFM_CTX_MASKED;
+
+ PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d",
+ reset_mode == PFM_PMD_RESET_LONG ? "long" : "short",
+ (unsigned long long)reset_pmds[0],
+ not_masked);
+
+ pfm_modview_begin(set);
+
+ for (i = 0; i < max_pmd; i++) {
+
+ if (pfm_bv_isset(reset_pmds, i)) {
+
+ val = pfm_new_pmd_value(set->pmds + i,
+ reset_mode);
+
+ set->view->set_pmds[i]= val;
+ /* the register itself is only written when not masked */
+ if (not_masked) {
+ if (pfm_bv_isset(cnt_mask, i)) {
+ hw_val = val & ovfl_mask;
+ } else {
+ hw_val = val;
+ }
+ pfm_write_pmd(ctx, i, hw_val);
+ }
+ PFM_DBG_ovfl("pmd%u set=%u sval=0x%llx",
+ i,
+ set->id,
+ (unsigned long long)val);
+ }
+ }
+
+ pfm_modview_end(set);
+
+ /*
+ * done with reset
+ */
+ bitmap_zero(ulp(reset_pmds), max_pmd);
+
+ /*
+ * make changes visible
+ */
+ if (not_masked)
+ pfm_arch_serialize();
+}
+
+
+
+/*
+ * Resume monitoring once an overflow has been processed. Called from
+ * pfm_handle_work() and __pfm_restart() for system-wide and per-thread
+ * context.
+ */
+static void pfm_resume_after_ovfl(struct pfm_context *ctx)
+{
+	struct pfm_smpl_fmt *fmt;
+	u32 rst_ctrl;
+	struct pfm_event_set *set;
+	u64 *reset_pmds;
+	void *hdr;
+	int state, ret;
+
+	hdr = ctx->smpl_addr;
+	fmt = ctx->smpl_fmt;
+	state = ctx->state;
+	set = ctx->active_set;
+	ret = 0;
+	/* with a sampling buffer the format's restart callback sets rst_ctrl */
+	if (hdr) {
+		rst_ctrl = 0;
+		prefetch(hdr);
+		if (fmt->fmt_restart)
+			ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED,
+						  &rst_ctrl, hdr);
+	} else {
+		rst_ctrl = PFM_OVFL_CTRL_RESET;
+	}
+	reset_pmds = set->reset_pmds;
+
+	PFM_DBG("restart=%d set=%u r_pmds=0x%llx switch=%d ctx_state=%d",
+		ret,
+		set->id,
+		(unsigned long long)reset_pmds[0],
+		!!(set->priv_flags & PFM_SETFL_PRIV_SWITCH),
+		state);
+
+	if (!ret) {
+		/*
+		 * switch set if needed
+		 */
+		if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) {
+			set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH;
+			pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0);
+			set = ctx->active_set;
+		} else if (rst_ctrl & PFM_OVFL_CTRL_RESET) {
+			if (!bitmap_empty(ulp(set->reset_pmds), pfm_pmu_conf->max_pmd))
+				pfm_reset_pmds(ctx, set, PFM_PMD_RESET_LONG);
+		}
+
+		if (!(rst_ctrl & PFM_OVFL_CTRL_MASK)) {
+			pfm_unmask_monitoring(ctx);
+		} else {
+			PFM_DBG("stopping monitoring?");
+		}
+		ctx->state = PFM_CTX_LOADED;
+	}
+	ctx->flags.can_restart = 0;
+}
+
+/*
+ * Fold the pending overflows of every event set into the 64-bit software
+ * PMD values. Always called after pfm_stop has been issued.
+ */
+void pfm_flush_pmds(struct task_struct *task, struct pfm_context *ctx)
+{
+	struct pfm_event_set *set;
+	u64 ovfl_mask;
+	u64 *ovfl_pmds;
+	u32 num_ovfls;
+	u16 i, first_cnt_pmd, max_pmd;
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	first_cnt_pmd = pfm_pmu_conf->first_cnt_pmd;
+	max_pmd = pfm_pmu_conf->max_pmd;
+
+	set = ctx->active_set;
+
+	/*
+	 * save active set
+	 * UP:
+	 * if not current task and due to lazy, state may
+	 * still be live
+	 * for system-wide, guaranteed to run on correct CPU
+	 */
+	if (__get_cpu_var(pmu_ctx) == ctx) {
+		/*
+		 * pending overflows have been saved by pfm_stop()
+		 */
+		pfm_modview_begin(set);
+		pfm_save_pmds(ctx, set);
+		pfm_modview_end(set);
+		pfm_set_pmu_owner(NULL, NULL);
+		PFM_DBG("released ownership");
+	}
+
+	/*
+	 * cleanup each set
+	 */
+	list_for_each_entry(set, &ctx->list, list) {
+		if (!set->npend_ovfls)
+			continue;
+
+		pfm_modview_begin(set);
+		/*
+		 * take care of overflow
+		 * no format handler is called here
+		 */
+		ovfl_pmds = set->povfl_pmds;
+		num_ovfls = set->npend_ovfls;
+		PFM_DBG("set%u first=%u novfls=%u",
+			set->id, first_cnt_pmd, num_ovfls);
+		/*
+		 * never scan past max_pmd, even if count and bitmap disagree
+		 */
+		for (i = first_cnt_pmd; num_ovfls && i < max_pmd; i++) {
+			if (pfm_bv_isset(ovfl_pmds, i)) {
+				set->view->set_pmds[i] += 1 + ovfl_mask;
+				num_ovfls--;
+				PFM_DBG("pmd%u overflowed", i);
+			}
+			PFM_DBG("pmd%u set=%u val=0x%llx",
+				i,
+				set->id,
+				(unsigned long long)set->view->set_pmds[i]);
+		}
+		pfm_modview_end(set);
+	}
+}
+
+/*
+ * pfm_handle_work() can be called with interrupts enabled
+ * (TIF_NEED_RESCHED) or disabled. wait_for_completion_interruptible()
+ * may sleep, therefore we must re-enable interrupts
+ * to avoid deadlocks. It is safe to do so because this function
+ * is called ONLY when returning to user level (PUStk=1), in which case
+ * there is no risk of kernel stack overflow due to deep
+ * interrupt nesting.
+ *
+ * input:
+ * - current task pointer
+ */
+void __pfm_handle_work(struct task_struct *task)
+{
+	struct pfm_context *ctx;
+	unsigned long flags, dummy_flags;
+	unsigned int reason;
+	int ret;
+
+	ctx = task->pfm_context;
+	if (ctx == NULL) {
+		PFM_ERR("handle_work [%d] has no ctx", task->pid);
+		return;
+	}
+
+	BUG_ON(ctx->flags.system);
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	clear_thread_flag(TIF_NOTIFY_RESUME);
+
+	/*
+	 * extract reason for being here and clear
+	 */
+	reason = ctx->flags.trap_reason;
+
+	if (reason == PFM_TRAP_REASON_NONE)
+		goto nothing_to_do;
+
+	ctx->flags.trap_reason = PFM_TRAP_REASON_NONE;
+
+	PFM_DBG("reason=%d state=%d", reason, ctx->state);
+
+	/*
+	 * must be done before we check for simple-reset mode
+	 */
+	if (ctx->state == PFM_CTX_ZOMBIE)
+		goto do_zombie;
+
+	if (reason == PFM_TRAP_REASON_RESET)
+		goto skip_blocking;
+
+	/*
+	 * restore interrupt mask to what it was on entry.
+	 * Could be enabled/disabled.
+	 */
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/*
+	 * force interrupt enable because the wait below may sleep
+	 */
+	local_irq_enable();
+
+	PFM_DBG("before block sleeping");
+
+	/*
+	 * may go through without blocking on SMP systems
+	 * if restart has been received already by the time we wait
+	 */
+	ret = wait_for_completion_interruptible(&ctx->restart_complete);
+
+	PFM_DBG("after block sleeping ret=%d", ret);
+
+	/*
+	 * lock context and mask interrupts again
+	 * We save flags into a dummy because we may have
+	 * altered interrupts mask compared to entry in this
+	 * function.
+	 */
+	spin_lock_irqsave(&ctx->lock, dummy_flags);
+
+	if (ctx->state == PFM_CTX_ZOMBIE)
+		goto do_zombie;
+
+	/*
+	 * if the wait was interrupted we don't restart anything
+	 */
+	if (ret < 0)
+		goto nothing_to_do;
+
+skip_blocking:
+	pfm_resume_after_ovfl(ctx);
+
+nothing_to_do:
+
+	/*
+	 * restore flags as they were upon entry
+	 */
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return;
+
+do_zombie:
+	PFM_DBG("context is zombie, bailing out");
+
+	__pfm_unload_context(ctx, 0);
+
+	/*
+	 * drop the lock before the context is freed: this balances the
+	 * spin_lock above, interrupts are dealt with separately below
+	 */
+	spin_unlock(&ctx->lock);
+
+	/* enable interrupt for vfree() */
+	local_irq_enable();
+
+	/* actual context free */
+	pfm_context_free(ctx);
+
+	/* restore interrupts as they were upon entry */
+	local_irq_restore(flags);
+}
+/*
+ * wake up whoever is waiting for messages of this context; always returns 0
+ */
+static int pfm_notify_user(struct pfm_context *ctx, union pfm_msg *msg)
+{
+	if (ctx->state == PFM_CTX_ZOMBIE) {
+		PFM_DBG("ignoring overflow notification, owner is zombie");
+		return 0;
+	}
+	PFM_DBG("waking up somebody");
+
+	if (!ctx->flags.kapi) {
+		wake_up_interruptible(&ctx->msgq_wait);
+
+		/*
+		 * it is safe to call kill_fasync() from an interrupt
+		 * handler. kill_fasync() grabs two RW locks (fasync_lock,
+		 * tasklist_lock) in read mode. There is conflict only in
+		 * case the PMU interrupt occurs during a write mode critical
+		 * section. This cannot happen because for both locks, the
+		 * write mode is always using interrupt masking (write_lock_irq).
+		 */
+		kill_fasync (&ctx->async_queue, SIGIO, POLL_IN);
+	} else {
+		complete(ctx->msgq_comp);
+	}
+	return 0;
+}
+/*
+ * Queue an overflow message, unless the context has no_msg set, and
+ * notify the user. Returns -1 if the message queue is full, otherwise
+ * the value of pfm_notify_user().
+ */
+int pfm_ovfl_notify_user(struct pfm_context *ctx,
+			struct pfm_event_set *set,
+			unsigned long ip)
+{
+	union pfm_msg *msg = NULL;
+	int nbits = pfm_pmu_conf->max_cnt_pmd;
+
+	if (!ctx->flags.no_msg) {
+		msg = pfm_get_new_msg(ctx);
+		if (msg == NULL) {
+			/*
+			 * when message queue fills up it is because the user
+			 * did not extract the message, yet issued
+			 * pfm_restart(). At this point, we stop sending
+			 * notification, thus the user will not be able to get
+			 * new samples when using the default format.
+			 */
+			PFM_DBG_ovfl("no more notification msgs");
+			return -1;
+		}
+
+		/* fill in the overflow message */
+		msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL;
+		msg->pfm_ovfl_msg.msg_active_set = set->id;
+		msg->pfm_ovfl_msg.msg_ovfl_ip = ip;
+		msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id();
+		msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid;
+		msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid;
+
+		/* report the overflowed PMDs (first max_cnt_pmd bits) */
+		bitmap_copy(ulp(msg->pfm_ovfl_msg.msg_ovfl_pmds),
+			    ulp(set->ovfl_pmds), nbits);
+	}
+
+	PFM_DBG("ovfl msg: ip=0x%lx o_pmds=0x%llx",
+		ip,
+		(unsigned long long)set->ovfl_pmds[0]);
+
+	return pfm_notify_user(ctx, msg);
+}
+/* queue a PFM_MSG_END message and notify the user; -1 if the queue is full */
+static int pfm_end_notify_user(struct pfm_context *ctx)
+{
+	union pfm_msg *end_msg = pfm_get_new_msg(ctx);
+
+	if (end_msg == NULL) {
+		PFM_ERR("%s no more msgs", __FUNCTION__);
+		return -1;
+	}
+
+	/* no leak */
+	memset(end_msg, 0, sizeof(*end_msg));
+
+	end_msg->type = PFM_MSG_END;
+
+	PFM_DBG("end msg: msg=%p no_msg=%d",
+		end_msg,
+		ctx->flags.no_msg);
+
+	return pfm_notify_user(ctx, end_msg);
+}
+
+/*
+ * called only from exit_thread(): task == current
+ * we come here only if current has a context
+ * attached (loaded or masked or zombie)
+ */
+void __pfm_exit_thread(struct task_struct *task)
+{
+	struct pfm_context *ctx = task->pfm_context;
+	unsigned long flags;
+	int must_free = 0;
+
+	BUG_ON(ctx->flags.system);
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	PFM_DBG("state=%d", ctx->state);
+
+	/*
+	 * __pfm_unload_context() cannot fail
+	 * in the context states we are interested in
+	 */
+	switch(ctx->state) {
+	case PFM_CTX_ZOMBIE:
+		/* a zombie context is freed once the lock is dropped */
+		__pfm_unload_context(ctx, 0);
+		must_free = 1;
+		break;
+	case PFM_CTX_LOADED:
+	case PFM_CTX_MASKED:
+		/* the context stays allocated: post an end message */
+		__pfm_unload_context(ctx, 0);
+		pfm_end_notify_user(ctx);
+		break;
+	default:
+		BUG_ON(ctx->state != PFM_CTX_LOADED);
+		break;
+	}
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/*
+	 * All memory free operations (especially for vmalloc'ed memory)
+	 * MUST be done with interrupts ENABLED.
+	 */
+	if (must_free)
+		pfm_context_free(ctx);
+}
+
+/*
+ * per-CPU initialization: pfm_init() runs it on each CPU via
+ * on_each_cpu(); pfm_pmu_conf is NULL at this point
+ */
+void __cpuinit pfm_init_percpu (void *dummy)
+{
+ pfm_arch_init_percpu();
+}
+
+/*
+ * global initialization routine, executed only once.
+ * Returns 0 on success, -1 as soon as one of the steps fails.
+ */
+int __init pfm_init(void)
+{
+	int ret;
+
+	PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+	/*
+	 * kmem_cache_create() takes the alignment before the flags:
+	 * no explicit alignment, SLAB_HWCACHE_ALIGN as flag
+	 */
+	pfm_ctx_cachep = kmem_cache_create("pfm_context",
+			sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
+			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (pfm_ctx_cachep == NULL) {
+		PFM_ERR("cannot initialize context slab");
+		goto error_disable;
+	}
+	ret = pfm_sets_init();
+	if (ret)
+		goto error_disable;
+
+	if (pfm_sysfs_init())
+		goto error_disable;
+
+	/* one time, global initialization */
+	if (pfm_arch_initialize())
+		goto error_disable;
+
+	init_pfm_fs();
+
+	/* per cpu initialization (interrupts must be enabled) */
+	on_each_cpu(pfm_init_percpu, NULL, 1, 1);
+
+	return 0;
+error_disable:
+	return -1;
+}
+/*
+ * must use subsys_initcall() to ensure that the perfmon2 core
+ * is initialized before any PMU description module when they are
+ * compiled in.
+ */
+subsys_initcall(pfm_init);
+/* start monitoring on a LOADED context, switching to start->start_set if given */
+int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start)
+{
+ struct task_struct *task, *owner_task;
+ struct pfm_event_set *new_set, *old_set;
+ u64 now_itc;
+ int is_self, flags;
+
+ task = ctx->task;
+
+ /*
+ * context must be loaded.
+ * we do not support starting while in MASKED state
+ * (mostly because of set switching issues)
+ */
+ if (ctx->state != PFM_CTX_LOADED)
+ return -EINVAL;
+
+ old_set = new_set = ctx->active_set;
+
+ /*
+ * always the case for system-wide
+ */
+ if (task == NULL)
+ task = current;
+
+ is_self = task == current;
+
+ /*
+ * argument is provided?
+ */
+ if (start) {
+ /*
+ * find the set to load first
+ */
+ new_set = pfm_find_set(ctx, start->start_set, 0);
+ if (new_set == NULL) {
+ PFM_DBG("event set%u does not exist",
+ start->start_set);
+ return -EINVAL;
+ }
+ }
+
+ PFM_DBG("cur_set=%u req_set=%u",
+ old_set->id,
+ new_set->id);
+
+ /*
+ * if we need to change the active set we need
+ * to check if we can access the PMU
+ */
+ if (new_set != old_set) {
+ owner_task = __get_cpu_var(pmu_owner);
+ /*
+ * system-wide: must run on the right CPU
+ * per-thread : must be the owner of the PMU context
+ *
+ * pfm_switch_sets() returns with monitoring stopped
+ */
+ if (is_self) {
+ pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1);
+ } else {
+ /*
+ * In a UP kernel, the PMU may contain the state
+ * of the task we want to operate on, yet the task
+ * may be switched out (lazy save). We need to save
+ * current state (old_set), switch active_set and
+ * mark it for reload.
+ */
+ if (owner_task == task) {
+ pfm_modview_begin(old_set);
+ pfm_save_pmds(ctx, old_set);
+ pfm_modview_end(old_set);
+ }
+ ctx->active_set = new_set;
+ new_set->view->set_status |= PFM_SETVFL_ACTIVE;
+ new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+ }
+ }
+ /*
+ * mark as started, must be done before calling
+ * pfm_arch_start()
+ */
+ ctx->flags.started = 1;
+
+ /*
+ * at this point, monitoring is:
+ * - stopped if we switched set (self-monitoring)
+ * - stopped if never started
+ * - started if calling pfm_start() in sequence
+ */
+ now_itc = pfm_arch_get_itc();
+ flags = new_set->flags;
+
+ if (is_self) {
+ unsigned long info;
+ if (flags & PFM_SETFL_TIME_SWITCH)
+ info = PFM_CPUINFO_TIME_SWITCH;
+ else
+ info = 0;
+
+ __get_cpu_var(pfm_syst_info) = info;
+ }
+ /*
+ * in system-wide, the new_set may be EXCL_IDLE, in which
+ * case pfm_start() must actually stop monitoring
+ */
+ if (current->pid == 0 && (flags & PFM_SETFL_EXCL_IDLE))
+ pfm_arch_stop(task, ctx, new_set);
+ else
+ pfm_arch_start(task, ctx, new_set);
+
+ /*
+ * we restart total duration even if context was
+ * already started. In that case, counts are simply
+ * reset.
+ *
+ * For system-wide, we start counting even when we exclude
+ * idle and pfm_start() called by idle.
+ *
+ * For per-thread, if not self-monitoring, the statement
+ * below will have no effect because thread is stopped.
+ * The field is reset on ctxsw in.
+ *
+ * if monitoring is masked (MASKED), this statement
+ * will be overridden in pfm_unmask_monitoring()
+ */
+ ctx->duration_start = now_itc;
+ new_set->duration_start = now_itc;
+
+ return 0;
+}
+/*
+ * Stop monitoring for a context and account the elapsed time when the
+ * monitored task is current. Returns -EINVAL if the context is
+ * unloaded, 0 otherwise.
+ */
+int __pfm_stop(struct pfm_context *ctx)
+{
+	u64 now = pfm_arch_get_itc();
+	int ctx_state = ctx->state;
+	struct pfm_event_set *active = ctx->active_set;
+	struct task_struct *target;
+
+	/*
+	 * context must be attached (zombie cannot happen)
+	 */
+	if (ctx_state == PFM_CTX_UNLOADED)
+		return -EINVAL;
+
+	target = ctx->task;
+
+	PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d",
+		target ? target->pid : -1,
+		ctx_state,
+		ctx->flags.system);
+
+	/*
+	 * this happens for system-wide context
+	 */
+	if (target == NULL)
+		target = current;
+
+	/*
+	 * compute elapsed time
+	 *
+	 * for non-self-monitoring, the thread is necessarily stopped
+	 * and total duration has already been computed in ctxsw out.
+	 */
+	if (target == current) {
+		ctx->duration += now - ctx->duration_start;
+		/*
+		 * don't update set duration if masked
+		 */
+		if (ctx_state == PFM_CTX_LOADED)
+			active->duration += now - active->duration_start;
+	}
+
+	pfm_arch_stop(target, ctx, active);
+
+	ctx->flags.started = 0;
+
+	return 0;
+}
+/* resume after an overflow notification; -EBUSY or -EINVAL on a bad state */
+int __pfm_restart(struct pfm_context *ctx)
+{
+ int state;
+
+ state = ctx->state;
+
+ switch(state) {
+ case PFM_CTX_MASKED:
+ break;
+ case PFM_CTX_LOADED:
+ if (ctx->smpl_addr && ctx->smpl_fmt->fmt_restart)
+ break;
+ /* fall through */
+ case PFM_CTX_UNLOADED:
+ case PFM_CTX_ZOMBIE:
+ PFM_DBG("invalid state=%d", state);
+ return -EBUSY;
+ default:
+ PFM_DBG("state=%d with no active_restart handler",
+ state);
+ return -EINVAL;
+ }
+ /*
+ * at this point, the context is either LOADED or MASKED
+ */
+
+ if (ctx->task == current || ctx->flags.system) {
+ pfm_resume_after_ovfl(ctx);
+ return 0;
+ }
+
+ /*
+ * restart another task
+ */
+
+ /*
+ * When PFM_CTX_MASKED, we cannot issue a restart before the previous
+ * one is seen by the task.
+ */
+ if (state == PFM_CTX_MASKED) {
+ if (!ctx->flags.can_restart) {
+ PFM_DBG("cannot restart can_restart=%d",
+ ctx->flags.can_restart);
+ return -EBUSY;
+ }
+ /*
+ * prevent subsequent restart before this one is
+ * seen by the task
+ */
+ ctx->flags.can_restart = 0;
+ }
+
+ /*
+ * if blocking, then post the completion if PFM_CTX_MASKED, i.e.
+ * the task is blocked or on its way to block. That's the normal
+ * restart path. If the monitoring is not masked, then the task
+ * can be actively monitoring and we cannot directly intervene.
+ * Therefore we use the trap mechanism to catch the task and
+ * force it to reset the buffer/reset PMDs.
+ *
+ * if non-blocking, then we ensure that the task will go into
+ * pfm_handle_work() before returning to user mode.
+ *
+ * We cannot explicitly reset another task, it MUST always
+ * be done by the task itself. This works for system wide because
+ * the tool that is controlling the session is logically doing
+ * "self-monitoring".
+ */
+ if (ctx->flags.block && state == PFM_CTX_MASKED) {
+ PFM_DBG("unblocking [%d]", ctx->task->pid);
+ complete(&ctx->restart_complete);
+ } else {
+ struct thread_info *info;
+
+ PFM_DBG("[%d] armed exit trap", ctx->task->pid);
+
+ /*
+ * mark work pending
+ */
+ ctx->flags.trap_reason = PFM_TRAP_REASON_RESET;
+
+ info = ctx->task->thread_info;
+ set_bit(TIF_NOTIFY_RESUME, &info->flags);
+
+ /*
+ * XXX: send reschedule if task runs on another CPU
+ */
+ }
+ return 0;
+}
+/* read the count PMDs described by req[]; -EINVAL at the first bad request */
+/*
+ * XXX: interrupts are masked yet monitoring may be active. Hence there
+ * might be a counter overflow during the call. It will be kept pending
+ * and we might return inconsistent values unless we check the state of the
+ * counter and compensate for the overflow. Note that we will not lose a
+ * sample when sampling, however, there may be an issue with simple
+ * counting and virtualization.
+ */
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count)
+{
+ u64 val = 0, lval, ovfl_mask, hw_val;
+ u64 sw_cnt;
+ u64 *impl_pmds;
+ struct pfm_event_set *set, *active_set;
+ int i, can_access_pmu = 0;
+ int error_code;
+ u16 cnum, pmd_type, set_id, prev_set_id, max_pmd;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ impl_pmds = pfm_pmu_conf->impl_pmds;
+ max_pmd = pfm_pmu_conf->max_pmd;
+ active_set = ctx->active_set;
+ set = NULL;
+ prev_set_id = 0;
+
+ if (likely(ctx->state == PFM_CTX_LOADED)) {
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
+ || ctx->flags.system;
+
+ if (can_access_pmu)
+ pfm_arch_serialize();
+ }
+ error_code = PFM_REG_RETFL_EINVAL;
+
+ /*
+ * on both UP and SMP, we can only read the PMD from the hardware
+ * register when the task is the owner of the local PMU.
+ */
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+ set_id = req->reg_set;
+
+ if (unlikely(cnum >= max_pmd || !pfm_bv_isset(impl_pmds, cnum)))
+ goto error;
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+ /*
+ * locate event set
+ */
+ if (i == 0 || set_id != prev_set_id) {
+ set = pfm_find_set(ctx, set_id, 0);
+ if (set == NULL) {
+ PFM_DBG("event set%u does not exist",
+ set_id);
+ error_code = PFM_REG_RETFL_NOSET;
+ goto error;
+ }
+ }
+ /*
+ * it is not possible to read a PMD which was not requested:
+ * - explicitly written via pfm_write_pmds()
+ * - provided as a reg_smpl_pmds[] to another PMD during
+ * pfm_write_pmds()
+ *
+ * This is motivated by security and for optimizations purposes:
+ * - on context switch restore, we can restore only what we
+ * use (except when regs directly readable at user level,
+ * e.g., IA-64 self-monitoring, I386 RDTSC).
+ * - do not need to maintain PMC -> PMD dependencies
+ */
+ if (unlikely(!pfm_bv_isset(set->used_pmds, cnum))) {
+ PFM_DBG("pmd%u cannot be read, because never "
+ "requested", cnum);
+ goto error;
+ }
+
+ /*
+ * it is possible to read PMD registers which have not
+ * explicitly been written by the application. In this case
+ * the default value is returned.
+ */
+ val = set->view->set_pmds[cnum];
+ lval = set->pmds[cnum].lval;
+
+ /*
+ * extract remaining ovfl to switch
+ */
+ sw_cnt = set->pmds[cnum].ovflsw_thres;
+
+ /*
+ * If the task is not the current one, then we check if the
+ * PMU state is still in the local live register due to lazy
+ * ctxsw. If true, then we read directly from the registers.
+ */
+ if (set == active_set && can_access_pmu) {
+ hw_val = pfm_read_pmd(ctx, cnum);
+ if (pmd_type & PFM_REG_C64)
+ val = (val & ~ovfl_mask) | (hw_val & ovfl_mask);
+ else
+ val = hw_val;
+ }
+
+ PFM_DBG("set%u pmd%u=0x%llx switch_thres=%llu",
+ set->id,
+ cnum,
+ (unsigned long long)val,
+ (unsigned long long)sw_cnt);
+
+ pfm_retflag_set(req->reg_flags, 0);
+ req->reg_value = val;
+ req->reg_last_reset_val = lval;
+ req->reg_ovfl_switch_cnt = sw_cnt;
+
+ prev_set_id = set_id;
+ }
+ return 0;
+
+error:
+ pfm_retflag_set(req->reg_flags, error_code);
+ return -EINVAL;
+}
+
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count,
+ int compat)
+{
+#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM | \
+ PFM_REGFL_OVFL_NOTIFY| \
+ PFM_REG_RETFL_MASK)
+
+ struct pfm_event_set *set, *active_set;
+ u64 value, hw_val, ovfl_mask;
+ u64 *smpl_pmds, *reset_pmds, *impl_pmds;
+ u32 req_flags, flags;
+ u16 cnum, pmd_type, max_pmd, max_pmc;
+ u16 set_id, prev_set_id;
+ int i, can_access_pmu;
+ int is_counting;
+ int ret, error_code;
+ pfm_pmd_check_t wr_func;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ active_set = ctx->active_set;
+ max_pmd = pfm_pmu_conf->max_pmd;
+ max_pmc = pfm_pmu_conf->max_pmc;
+ impl_pmds = pfm_pmu_conf->impl_pmds;
+ wr_func = pfm_pmu_conf->pmd_write_check;
+ set = NULL;
+
+ prev_set_id = 0;
+ can_access_pmu = 0;
+
+ /*
+ * we cannot access the actual PMD registers when monitoring is masked
+ */
+ if (likely(ctx->state == PFM_CTX_LOADED))
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
+ || ctx->flags.system;
+
+ error_code = PFM_REG_RETFL_EINVAL;
+ ret = -EINVAL;
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+ set_id = req->reg_set;
+ req_flags = req->reg_flags;
+ smpl_pmds = req->reg_smpl_pmds;
+ reset_pmds = req->reg_reset_pmds;
+ flags = 0;
+
+ if (unlikely(cnum >= max_pmd || !pfm_bv_isset(impl_pmds, cnum))) {
+ PFM_DBG("pmd%u is not implemented or not accessible",
+ cnum);
+ goto error;
+ }
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+ is_counting = pmd_type & PFM_REG_C64;
+
+ if (likely(!compat)) {
+ if (likely(is_counting)) {
+ /*
+ * ensure only valid flags are set
+ */
+ if (req_flags & ~(PFM_REGFL_PMD_ALL)) {
+ PFM_DBG("pmd%u: invalid flags=0x%x",
+ cnum, req_flags);
+ goto error;
+ }
+
+ if (req_flags & PFM_REGFL_OVFL_NOTIFY)
+ flags |= PFM_REGFL_OVFL_NOTIFY;
+ if (req_flags & PFM_REGFL_RANDOM)
+ flags |= PFM_REGFL_RANDOM;
+ /*
+ * verify validity of smpl_pmds
+ */
+ if (unlikely(!bitmap_subset(ulp(smpl_pmds),
+ ulp(impl_pmds),
+ max_pmd))) {
+ PFM_DBG("invalid smpl_pmds=0x%llx "
+ "for pmd%u",
+ (unsigned long long)smpl_pmds[0],
+ cnum);
+ goto error;
+ }
+ /*
+ * verify validity of reset_pmds
+ */
+ if (unlikely(!bitmap_subset(ulp(reset_pmds),
+ ulp(impl_pmds),
+ max_pmd))) {
+ PFM_DBG("invalid reset_pmds=0x%llx "
+ "for pmd%u",
+ (unsigned long long)reset_pmds[0],
+ cnum);
+ goto error;
+ }
+ }
+ }
+
+ /*
+ * locate event set
+ */
+ if (i == 0 || set_id != prev_set_id) {
+ set = pfm_find_set(ctx, set_id, 0);
+ if (set == NULL) {
+ PFM_DBG("event set%u does not exist",
+ set_id);
+ error_code = PFM_REG_RETFL_NOSET;
+ goto error;
+ }
+ }
+
+ /*
+ * execute write checker, if any
+ */
+ if (likely(wr_func && (pmd_type & PFM_REG_WC))) {
+ ret = (*wr_func)(ctx, set, req);
+ if (ret)
+ goto error;
+
+ }
+ hw_val = value = req->reg_value;
+
+ /*
+ * now commit changes to software state
+ */
+ pfm_modview_begin(set);
+
+ if (likely(is_counting)) {
+ if (likely(!compat)) {
+
+ set->pmds[cnum].flags = flags;
+
+ /*
+ * copy reset and sampling bitvectors
+ */
+ bitmap_copy(ulp(set->pmds[cnum].reset_pmds),
+ ulp(reset_pmds),
+ max_pmd);
+
+ bitmap_copy(ulp(set->pmds[cnum].smpl_pmds),
+ ulp(smpl_pmds),
+ max_pmd);
+
+ set->pmds[cnum].eventid = req->reg_smpl_eventid;
+
+ /*
+ * Mark reset/smpl PMDS as used.
+ *
+ * We do not keep track of PMC because we have to
+ * systematically restore ALL of them.
+ */
+ bitmap_or(ulp(set->used_pmds),
+ ulp(set->used_pmds),
+ ulp(reset_pmds), max_pmd);
+
+ bitmap_or(ulp(set->used_pmds),
+ ulp(set->used_pmds),
+ ulp(smpl_pmds), max_pmd);
+
+ /*
+ * we reprogrammed the PMD hence, clear any pending
+ * ovfl, switch based on the old value
+ * for restart we have already established new values
+ */
+ pfm_bv_clear(set->povfl_pmds, cnum);
+ pfm_bv_clear(set->ovfl_pmds, cnum);
+
+ /*
+ * update ovfl_notify
+ */
+ if (flags & PFM_REGFL_OVFL_NOTIFY)
+ pfm_bv_set(set->ovfl_notify, cnum);
+ else
+ pfm_bv_clear(set->ovfl_notify, cnum);
+ }
+ /*
+ * reset last value to new value
+ */
+ set->pmds[cnum].lval = value;
+
+ hw_val = value & ovfl_mask;
+
+ /*
+ * establish new switch count
+ */
+ set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt;
+ set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt;
+ }
+
+ /*
+ * update reset values (not just for counters)
+ */
+ set->pmds[cnum].long_reset = req->reg_long_reset;
+ set->pmds[cnum].short_reset = req->reg_short_reset;
+
+ /*
+ * update randomization parameters (not just for counters)
+ */
+ set->pmds[cnum].seed = req->reg_random_seed;
+ set->pmds[cnum].mask = req->reg_random_mask;
+
+ /*
+ * update set values
+ */
+ set->view->set_pmds[cnum] = value;
+
+ pfm_modview_end(set);
+
+ pfm_bv_set(set->used_pmds, cnum);
+
+ if (set == active_set) {
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS;
+ if (can_access_pmu)
+ pfm_write_pmd(ctx, cnum, hw_val);
+ }
+
+ /*
+ * update number of used PMD registers
+ */
+ set->nused_pmds = bitmap_weight(ulp(set->used_pmds), max_pmd);
+
+ pfm_retflag_set(req->reg_flags, 0);
+
+ prev_set_id = set_id;
+
+ PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d "
+ "hw_pmd=0x%llx ctx_pmd=0x%llx s_reset=0x%llx "
+ "l_reset=0x%llx u_pmds=0x%llx nu_pmds=%u "
+ "s_pmds=0x%llx r_pmds=0x%llx o_pmds=0x%llx "
+ "o_thres=%llu compat=%d eventid=%llx",
+ set->id,
+ cnum,
+ (unsigned long long)value,
+ set->pmds[cnum].flags,
+ can_access_pmu,
+ (unsigned long long)hw_val,
+ (unsigned long long)set->view->set_pmds[cnum],
+ (unsigned long long)set->pmds[cnum].short_reset,
+ (unsigned long long)set->pmds[cnum].long_reset,
+ (unsigned long long)set->used_pmds[0],
+ set->nused_pmds,
+ (unsigned long long)set->pmds[cnum].smpl_pmds[0],
+ (unsigned long long)set->pmds[cnum].reset_pmds[0],
+ (unsigned long long)set->ovfl_pmds[0],
+ (unsigned long long)set->pmds[cnum].ovflsw_thres,
+ compat,
+ (unsigned long long)set->pmds[cnum].eventid);
+ }
+
+ /*
+ * make changes visible
+ */
+ if (can_access_pmu)
+ pfm_arch_serialize();
+
+ return 0;
+
+error:
+ /*
+ * for now, we have only one possibility for error
+ */
+ pfm_retflag_set(req->reg_flags, error_code);
+ return ret;
+}
+
+/*
+ * store a PFM_REG_RETFL_* code into the return-flag bits of a PMC request
+ *
+ * pfm_retflag_set() receives the flags word by value, so it cannot report
+ * anything back through the request; the field is updated in place here.
+ */
+static inline void pfm_pmc_set_retflag(struct pfarg_pmc *req, u32 code)
+{
+	req->reg_flags = (req->reg_flags & ~PFM_REG_RETFL_MASK) | code;
+}
+
+/*
+ * __pfm_write_pmcs - program a vector of PMC registers
+ * @ctx: context to operate on
+ * @req: array of @count register requests
+ * @count: number of entries in @req
+ *
+ * Requests are processed in order. Each one is validated (register
+ * implemented, flags valid, event set exists, optional PMU write checker)
+ * before the value is committed to the event set and, when that set is the
+ * active one and the PMU is accessible, written to the register.
+ *
+ * Processing stops at the first failing request: the entries before it stay
+ * committed, and the failing entry gets a PFM_REG_RETFL_* code in the
+ * return-flag bits of its reg_flags. Entries that succeed get those bits
+ * cleared.
+ *
+ * Returns 0 on success, -EINVAL or the write checker's error otherwise.
+ */
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count)
+{
+#define PFM_REGFL_PMC_ALL	(PFM_REGFL_NO_EMUL64|PFM_REG_RETFL_MASK)
+	struct pfm_event_set *set, *active_set;
+	u64 value, dfl_val, rsvd_msk;
+	u64 *impl_pmcs;
+	int i, can_access_pmu;
+	int ret, error_code;
+	u16 set_id, prev_set_id;
+	u16 cnum, pmc_type, max_pmc;
+	u32 flags;
+	pfm_pmc_check_t wr_func;
+
+	active_set = ctx->active_set;
+
+	wr_func = pfm_pmu_conf->pmc_write_check;
+	max_pmc = pfm_pmu_conf->max_pmc;
+	impl_pmcs = pfm_pmu_conf->impl_pmcs;
+
+	set = NULL;
+	prev_set_id = 0;
+	can_access_pmu = 0;
+
+	/*
+	 * we cannot access the actual PMC registers when monitoring is masked
+	 */
+	if (likely(ctx->state == PFM_CTX_LOADED))
+		can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
+			       || ctx->flags.system;
+
+	error_code = PFM_REG_RETFL_EINVAL;
+
+	for (i = 0; i < count; i++, req++) {
+
+		/* default error for every check that does not set its own */
+		ret = -EINVAL;
+		cnum = req->reg_num;
+		set_id = req->reg_set;
+		value = req->reg_value;
+		flags = req->reg_flags;
+
+		/*
+		 * no access to unimplemented PMC register
+		 */
+		if (unlikely(cnum >= max_pmc || !pfm_bv_isset(impl_pmcs, cnum))) {
+			PFM_DBG("pmc%u is not implemented/unaccessible",
+				cnum);
+			error_code = PFM_REG_RETFL_NOTAVAIL;
+			goto error;
+		}
+
+		pmc_type = pfm_pmu_conf->pmc_desc[cnum].type;
+		dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
+		rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk;
+
+		/*
+		 * ensure only valid flags are set
+		 */
+		if (flags & ~(PFM_REGFL_PMC_ALL)) {
+			PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags);
+			goto error;
+		}
+
+		/*
+		 * locate event set, the lookup is skipped when the request
+		 * targets the same set as the previous one
+		 */
+		if (i == 0 || set_id != prev_set_id) {
+			set = pfm_find_set(ctx, set_id, 0);
+			if (set == NULL) {
+				PFM_DBG("event set%u does not exist",
+					set_id);
+				error_code = PFM_REG_RETFL_NOSET;
+				goto error;
+			}
+		}
+
+		/*
+		 * set reserved bits to default values
+		 */
+		value = (value & rsvd_msk) | (dfl_val & ~rsvd_msk);
+
+		if (flags & PFM_REGFL_NO_EMUL64) {
+			if (!(pmc_type & PFM_REG_NO64)) {
+				PFM_DBG("pmc%u no support "
+					"PFM_REGFL_NO_EMUL64", cnum);
+				goto error;
+			}
+			value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk;
+		}
+
+		/*
+		 * execute write checker, if any. The checker sees the
+		 * adjusted value through the request and may modify it.
+		 */
+		if (likely(wr_func && (pmc_type & PFM_REG_WC))) {
+			req->reg_value = value;
+			ret = (*wr_func)(ctx, set, req);
+			if (ret)
+				goto error;
+			value = req->reg_value;
+		}
+
+		/*
+		 * Now we commit the changes
+		 */
+
+		/*
+		 * mark PMC register as used
+		 * We do not track associated PMC register based on
+		 * the fact that they will likely need to be written
+		 * in order to become useful at which point the statement
+		 * below will catch that.
+		 *
+		 * The used_pmcs bitmask is only useful on architectures where
+		 * the PMC needs to be modified for particular bits, especially
+		 * on overflow or to stop/start.
+		 */
+		if (!pfm_bv_isset(set->used_pmcs, cnum)) {
+			pfm_bv_set(set->used_pmcs, cnum);
+			set->nused_pmcs++;
+		}
+
+		set->pmcs[cnum] = value;
+
+		if (set == active_set) {
+			set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+			if (can_access_pmu)
+				pfm_arch_write_pmc(ctx, cnum, value);
+		}
+
+		/* report success for this entry */
+		pfm_pmc_set_retflag(req, 0);
+
+		prev_set_id = set_id;
+
+		PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d "
+			"u_pmcs=0x%llx nu_pmcs=%u",
+			set->id,
+			cnum,
+			(unsigned long long)value,
+			can_access_pmu,
+			(unsigned long long)set->used_pmcs[0],
+			set->nused_pmcs);
+	}
+	/*
+	 * make sure the changes are visible
+	 */
+	if (can_access_pmu)
+		pfm_arch_serialize();
+
+	return 0;
+error:
+	/* req points at the entry that failed */
+	pfm_pmc_set_retflag(req, error_code);
+	return ret;
+}
+
+/*
+ * pfm_bad_permissions - check whether current is denied access to @task
+ * @task: task the caller wants to attach to
+ *
+ * The caller's uid must match the target's euid, suid and uid, and the
+ * caller's gid must match the target's egid, sgid and gid, unless the
+ * caller has CAP_SYS_PTRACE.
+ *
+ * should not call when task == current
+ *
+ * Returns non-zero when access must be refused, 0 otherwise.
+ */
+static int pfm_bad_permissions(struct task_struct *task)
+{
+	/* inspired by ptrace_attach() */
+	/* labels are listed in the same order as the arguments */
+	PFM_DBG("cur: euid=%d uid=%d gid=%d task: euid=%d "
+		"suid=%d uid=%d egid=%d sgid=%d cap:%d",
+		current->euid,
+		current->uid,
+		current->gid,
+		task->euid,
+		task->suid,
+		task->uid,
+		task->egid,
+		task->sgid, capable(CAP_SYS_PTRACE));
+
+	return ((current->uid != task->euid)
+		|| (current->uid != task->suid)
+		|| (current->uid != task->uid)
+		|| (current->gid != task->egid)
+		|| (current->gid != task->sgid)
+		|| (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
+}
+
+
+/*
+ * pfm_task_incompatible - check whether @ctx may be attached to @task
+ * @ctx: context about to be loaded
+ * @task: candidate task to monitor
+ *
+ * cannot attach if :
+ * 	- kernel task
+ * 	- task not owned by caller
+ * 	- task incompatible with context mode
+ * 	- task is a zombie or is dead
+ * 	- task is neither current nor stopped/traced
+ *
+ * For a task other than current, waits until the task is off any CPU.
+ *
+ * Returns 0 when the attach may proceed, -EPERM, -EINVAL or -EBUSY otherwise.
+ */
+static int pfm_task_incompatible(struct pfm_context *ctx,
+				 struct task_struct *task)
+{
+	/*
+	 * no kernel task or task not owned by caller
+	 */
+	if (!task->mm) {
+		PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
+		return -EPERM;
+	}
+
+	if (pfm_bad_permissions(task)) {
+		PFM_DBG("no permission to attach to [%d]", task->pid);
+		return -EPERM;
+	}
+
+	/*
+	 * cannot block in self-monitoring mode
+	 */
+	if (ctx->flags.block && task == current) {
+		PFM_DBG("cannot load a in blocking mode on self for [%d]",
+			task->pid);
+		return -EINVAL;
+	}
+
+	/*
+	 * EXIT_ZOMBIE and EXIT_DEAD are recorded in exit_state, they are
+	 * never stored in state
+	 */
+	if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) {
+		PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
+		return -EBUSY;
+	}
+
+	/*
+	 * always ok for self
+	 */
+	if (task == current)
+		return 0;
+
+	if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
+		PFM_DBG("cannot attach to non-stopped task [%d] state=%ld",
+			task->pid, task->state);
+		return -EBUSY;
+	}
+	PFM_DBG("before wait_inactive() task [%d] state=%ld",
+		task->pid, task->state);
+	/*
+	 * make sure the task is off any CPU
+	 */
+	wait_task_inactive(task);
+
+	PFM_DBG("after wait_inactive() task [%d] state=%ld",
+		task->pid, task->state);
+	/* more to come... */
+
+	return 0;
+}
+
+/*
+ * pfm_get_task - resolve @pid into a task that @ctx may be attached to
+ * @ctx: context about to be loaded
+ * @pid: pid of the target thread
+ * @task: receives the task on success, left untouched on failure
+ *
+ * A reference is taken on the task only when it is not current; on success
+ * the caller owns that reference.
+ *
+ * Returns 0 on success, -EPERM for pid < 2, -ESRCH when no such task
+ * exists, or the error reported by pfm_task_incompatible().
+ */
+static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
+			struct task_struct **task)
+{
+	struct task_struct *target;
+	int err;
+
+	/* XXX: need to add more checks here */
+	if (pid < 2)
+		return -EPERM;
+
+	if (pid == current->pid) {
+		/* self: no lookup and no extra reference */
+		target = current;
+	} else {
+		read_lock(&tasklist_lock);
+
+		target = find_task_by_pid(pid);
+
+		/* pin the task so it cannot go away while we operate on it */
+		if (target != NULL)
+			get_task_struct(target);
+
+		read_unlock(&tasklist_lock);
+
+		if (!target)
+			return -ESRCH;
+	}
+
+	err = pfm_task_incompatible(ctx, target);
+	if (err) {
+		/* hand back the reference taken above, if any */
+		if (target != current)
+			put_task_struct(target);
+		return err;
+	}
+
+	*task = target;
+	return 0;
+}
+
+/*
+ * pfm_check_task_exist - verify that some thread is still attached to @ctx
+ * @ctx: context to look for
+ *
+ * Walks the threads under tasklist_lock looking for one whose pfm_context
+ * is @ctx.
+ *
+ * Returns 0 when such a thread is found, -ESRCH otherwise.
+ */
+static int pfm_check_task_exist(struct pfm_context *ctx)
+{
+	struct task_struct *g, *t;
+	int ret = -ESRCH;
+
+	read_lock(&tasklist_lock);
+
+	do_each_thread (g, t) {
+		if (t->pfm_context == ctx) {
+			ret = 0;
+			/*
+			 * do_each_thread()/while_each_thread() is a nested
+			 * loop: a break would leave only the inner one and
+			 * the scan would go on over all remaining tasks
+			 */
+			goto out;
+		}
+	} while_each_thread (g, t);
+out:
+	read_unlock(&tasklist_lock);
+
+	PFM_DBG("ret=%d ctx=%p", ret, ctx);
+
+	return ret;
+}
+
+
+/*
+ * pfm_load_context_thread - attach a per-thread context to a task
+ * @ctx: context to load
+ * @pid: pid of the thread to monitor
+ * @set: event set to activate first
+ *
+ * Looks up and validates the target task, runs the arch load hook, reserves
+ * a session and installs @ctx as the task's pfm_context. For self-monitoring
+ * the PMU state is loaded right away; otherwise a full reload is forced for
+ * the next time the task is switched in.
+ *
+ * The reference pfm_get_task() takes on a task other than current is dropped
+ * before returning, on success as well as on failure.
+ *
+ * NOTE(review): pfm_arch_load_context() is not undone on the error paths
+ * that follow it - confirm that no arch-level cleanup is required.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int pfm_load_context_thread(struct pfm_context *ctx, pid_t pid,
+				   struct pfm_event_set *set)
+{
+	struct task_struct *task = NULL;
+	struct pfm_context *old;
+	u32 set_flags;
+	unsigned long info;
+	int ret;
+
+	set_flags = set->flags;
+
+	PFM_DBG("load_pid [%d] set=%u runs=%llu set_flags=0x%x",
+		pid,
+		set->id,
+		(unsigned long long)set->view->set_runs,
+		set_flags);
+
+	if (ctx->flags.block && pid == current->pid) {
+		PFM_DBG("cannot use blocking mode in while self-monitoring");
+		return -EINVAL;
+	}
+
+	ret = pfm_get_task(ctx, pid, &task);
+	if (ret) {
+		PFM_DBG("load_pid [%d] get_task=%d", pid, ret);
+		return ret;
+	}
+
+	/*
+	 * pfm_get_task() holds a reference only when task != current, the
+	 * common exit path below drops it under that same condition
+	 */
+	ret = pfm_arch_load_context(ctx, task);
+	if (ret)
+		goto error;
+
+	/*
+	 * now reserve the session
+	 */
+	ret = pfm_reserve_session(ctx, -1);
+	if (ret)
+		goto error;
+
+	/*
+	 * task is necessarily stopped at this point.
+	 *
+	 * If the previous context was zombie, then it got removed in
+	 * pfm_ctxswout_thread(). Therefore we should not see it here.
+	 * If we see a context, then this is an active context
+	 *
+	 */
+	PFM_DBG("before cmpxchg() old_ctx=%p new_ctx=%p",
+		task->pfm_context, ctx);
+
+	ret = -EEXIST;
+
+	old = cmpxchg(&task->pfm_context, NULL, ctx);
+	if (old != NULL) {
+		PFM_DBG("load_pid [%d] has already a context "
+			"old=%p new=%p cur=%p",
+			pid,
+			old,
+			ctx,
+			task->pfm_context);
+		goto error_unres;
+	}
+
+	/*
+	 * link context to task
+	 */
+	ctx->task = task;
+	set_tsk_thread_flag(task, TIF_PERFMON);
+
+	/*
+	 * commit active set
+	 */
+	ctx->active_set = set;
+
+	pfm_modview_begin(set);
+
+	set->view->set_runs++;
+
+	set->view->set_status |= PFM_SETVFL_ACTIVE;
+
+	/*
+	 * self-monitoring
+	 */
+	if (task == current) {
+#ifndef CONFIG_SMP
+		struct pfm_context *ctxp;
+
+		/*
+		 * in UP per-thread, due to lazy save
+		 * there could be a context from another
+		 * task. We need to push it first before
+		 * installing our new state
+		 */
+		ctxp = __get_cpu_var(pmu_ctx);
+		if (ctxp) {
+			struct pfm_event_set *setp;
+			setp = ctxp->active_set;
+			pfm_modview_begin(setp);
+			pfm_save_pmds(ctxp, setp);
+			setp->view->set_status &= ~PFM_SETVFL_ACTIVE;
+			pfm_modview_end(setp);
+			/*
+			 * do not clear ownership because we rewrite
+			 * right away
+			 */
+		}
+#endif
+		pfm_set_last_cpu(ctx, smp_processor_id());
+		pfm_inc_activation();
+		pfm_set_activation(ctx);
+
+		/*
+		 * setting PFM_CPUINFO_TIME_SWITCH, triggers
+		 * further checking if __pfm_handle_switch_timeout().
+		 * switch timeout is effectively decremented only once
+		 * monitoring has been activated via pfm_start() or
+		 * any user level equivalent.
+		 */
+		if (set_flags & PFM_SETFL_TIME_SWITCH) {
+			info = PFM_CPUINFO_TIME_SWITCH;
+			__get_cpu_var(pfm_syst_info) = info;
+		}
+		/*
+		 * load all PMD from set
+		 * load all PMC from set
+		 */
+		pfm_arch_restore_pmds(ctx, set);
+		pfm_arch_restore_pmcs(ctx, set);
+
+		/*
+		 * set new ownership
+		 */
+		pfm_set_pmu_owner(task, ctx);
+
+		PFM_DBG("context loaded on PMU for [%d] TIF=%d", task->pid, test_tsk_thread_flag(task, TIF_PERFMON));
+	} else {
+
+		/* force a full reload */
+		ctx->last_act = PFM_INVALID_ACTIVATION;
+		pfm_set_last_cpu(ctx, -1);
+		set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+		PFM_DBG("context loaded next ctxswin for [%d]", task->pid);
+	}
+
+	pfm_modview_end(set);
+
+	ret = 0;
+
+error_unres:
+	if (ret)
+		pfm_release_session(ctx, -1);
+error:
+	/*
+	 * release task, there is now a link with the context
+	 */
+	if (task != current) {
+		put_task_struct(task);
+
+		/*
+		 * on success, make sure the task is still linked to the
+		 * context now that the reference is gone
+		 */
+		if (!ret) {
+			ret = pfm_check_task_exist(ctx);
+			if (ret) {
+				ctx->state = PFM_CTX_UNLOADED;
+				ctx->task = NULL;
+			}
+		}
+	}
+	return ret;
+}
+
+/*
+ * pfm_load_context_sys - attach a system-wide context to the current CPU
+ * @ctx: system-wide context to load
+ * @set: event set to activate first
+ *
+ * Runs the arch load hook, reserves the session for this CPU, makes @set
+ * the active set, restores its PMDs and PMCs and records @ctx as PMU owner.
+ *
+ * Returns 0 on success, or the error from pfm_arch_load_context() or
+ * pfm_reserve_session().
+ */
+static int pfm_load_context_sys(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+	u32 this_cpu = smp_processor_id();
+	u32 flags_of_set = set->flags;
+	int err;
+
+	err = pfm_arch_load_context(ctx, NULL);
+	if (err)
+		return err;
+
+	PFM_DBG("cpu=%d set=%u runs=%llu set_flags=0x%x",
+		smp_processor_id(),
+		set->id,
+		(unsigned long long)set->view->set_runs,
+		flags_of_set);
+
+	/*
+	 * the session must be ours before the context is bound.
+	 *
+	 * NOTE(review): pfm_arch_load_context() is not undone when the
+	 * reservation fails - confirm no arch-level cleanup is required.
+	 */
+	err = pfm_reserve_session(ctx, this_cpu);
+	if (err)
+		return err;
+
+	/* a system-wide context is tied to a CPU, not to a task */
+	ctx->cpu = this_cpu;
+	ctx->task = NULL;
+
+	pfm_modview_begin(set);
+
+	set->view->set_runs++;
+
+	/* @set becomes the active set of the context */
+	ctx->active_set = set;
+	set->view->set_status |= PFM_SETVFL_ACTIVE;
+
+	/* push the registers of the set to the PMU */
+	pfm_arch_restore_pmds(ctx, set);
+	pfm_arch_restore_pmcs(ctx, set);
+
+	pfm_modview_end(set);
+
+	set_thread_flag(TIF_PERFMON);
+
+	set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+	PFM_DBG("context loaded on CPU%d", this_cpu);
+
+	pfm_set_pmu_owner(NULL, ctx);
+
+	return 0;
+}
+
+/*
+ * __pfm_load_context - attach a context to a thread or to the current CPU
+ * @ctx: context to load, must be in the PFM_CTX_UNLOADED state
+ * @req: load request, provides the first event set (load_set) and, for a
+ *       per-thread context, the target thread (load_pid)
+ *
+ * Returns 0 on success, -EBUSY when the context is not unloaded, -EINVAL
+ * when the event set does not exist or the sets fail pfm_prepare_sets(),
+ * or the error reported by the system-wide/per-thread load routine.
+ */
+int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req)
+{
+	struct pfm_event_set *first;
+	int err;
+
+	/* loading is only possible from the unloaded state */
+	if (ctx->state != PFM_CTX_UNLOADED) {
+		PFM_DBG("context already loaded");
+		return -EBUSY;
+	}
+
+	/* the set the caller wants to be active first */
+	first = pfm_find_set(ctx, req->load_set, 0);
+	if (!first) {
+		PFM_DBG("event set%u does not exist", req->load_set);
+		return -EINVAL;
+	}
+
+	/* sanity-check the event sets and initialize their state */
+	if (pfm_prepare_sets(ctx, first)) {
+		PFM_DBG("invalid next field pointers in the sets");
+		return -EINVAL;
+	}
+
+	err = ctx->flags.system
+		? pfm_load_context_sys(ctx, first)
+		: pfm_load_context_thread(ctx, req->load_pid, first);
+	if (err)
+		return err;
+
+	/* start from an empty message queue */
+	pfm_reset_msgq(ctx);
+
+	ctx->duration = 0;
+	ctx->flags.started = 0;
+	ctx->flags.trap_reason = PFM_TRAP_REASON_NONE;
+	ctx->flags.can_restart = 0;
+	ctx->state = PFM_CTX_LOADED;
+
+	return 0;
+}
+
+/*
+ * __pfm_unload_context - detach a context from its task or CPU
+ * @ctx: context to unload
+ * @defer_release: when non-zero, the session is not released here
+ *
+ * Does nothing when the context is already unloaded. Otherwise stops
+ * monitoring, marks the active set inactive, flushes the PMDs, runs the
+ * arch unload hook and either unlinks the monitored task or, when no task
+ * is attached, clears TIF_PERFMON on the current thread.
+ *
+ * Returns 0 on success or the error reported by __pfm_stop().
+ */
+int __pfm_unload_context(struct pfm_context *ctx, int defer_release)
+{
+	struct pfm_event_set *active;
+	struct task_struct *owner;
+	int prev_state, on_self, err;
+
+	prev_state = ctx->state;
+
+	/* nothing to undo */
+	if (prev_state == PFM_CTX_UNLOADED)
+		return 0;
+
+	owner = ctx->task;
+	active = ctx->active_set;
+	on_self = ctx->flags.system || owner == current;
+
+	PFM_DBG("ctx_state=%d task [%d]", prev_state, owner ? owner->pid : -1);
+
+	/* monitoring must be stopped first */
+	err = __pfm_stop(ctx);
+	if (err)
+		return err;
+
+	pfm_modview_begin(active);
+	active->view->set_status &= ~PFM_SETVFL_ACTIVE;
+	pfm_modview_end(active);
+
+	ctx->state = PFM_CTX_UNLOADED;
+
+	/*
+	 * wipe what may be left in pfm_syst_info; when not monitoring
+	 * ourselves this is left to pfm_ctxswout_thread.
+	 */
+	if (on_self)
+		__get_cpu_var(pfm_syst_info) = 0;
+
+	/* save PMDs into the context and give up ownership */
+	pfm_flush_pmds(owner, ctx);
+
+	pfm_arch_unload_context(ctx, owner);
+
+	/*
+	 * the PMU is no longer needed, hand the session back unless
+	 * the caller deferred it. A ZOMBIE context released it already.
+	 */
+	if (prev_state != PFM_CTX_ZOMBIE && !defer_release)
+		pfm_release_session(ctx, ctx->cpu);
+
+	/*
+	 * with a task attached: break the task <-> context link.
+	 * without one: ctx->cpu is kept, it may still be needed after
+	 * the unload to release the session.
+	 */
+	if (owner == NULL) {
+		clear_thread_flag(TIF_PERFMON);
+	} else {
+		owner->pfm_context = NULL;
+		ctx->task = NULL;
+		clear_tsk_thread_flag(owner, TIF_PERFMON);
+	}
+
+	PFM_DBG("done, state was %d", prev_state);
+	return 0;
+}
+
+/*
+ * pfm_ctx_flags_sane - reject inconsistent context flag combinations
+ * @ctx_flags: PFM_FL_* flags from the creation request
+ *
+ * Returns -EINVAL when blocking notification is requested together with
+ * system-wide mode, 0 otherwise.
+ */
+static int pfm_ctx_flags_sane(u32 ctx_flags)
+{
+	/* only system-wide contexts are restricted here */
+	if (!(ctx_flags & PFM_FL_SYSTEM_WIDE))
+		return 0;
+
+	if (!(ctx_flags & PFM_FL_NOTIFY_BLOCK))
+		return 0;
+
+	PFM_DBG("cannot use blocking mode in syswide mode");
+	return -EINVAL;
+}
+
+/*
+ * pfm_ctx_permissions - check that the caller may create a context
+ * @ctx_flags: PFM_FL_* flags from the creation request
+ *
+ * A system-wide request is refused when pfm_controls.sys_group is set and
+ * the caller is not in that group. Any request not refused that way is
+ * then checked against pfm_controls.task_group in the same manner.
+ *
+ * NOTE(review): a system-wide request that passes the sys_group test is
+ * still subject to the task_group test - confirm this is intended.
+ *
+ * Returns 0 when creation is allowed, -EPERM otherwise.
+ */
+static inline int pfm_ctx_permissions(u32 ctx_flags)
+{
+	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
+		if (pfm_controls.sys_group != PFM_GROUP_PERM_ANY
+		    && !in_group_p(pfm_controls.sys_group)) {
+			PFM_DBG("user group not allowed to create a syswide ctx");
+			return -EPERM;
+		}
+	}
+
+	/* no restriction configured for task contexts */
+	if (pfm_controls.task_group == PFM_GROUP_PERM_ANY)
+		return 0;
+
+	if (in_group_p(pfm_controls.task_group))
+		return 0;
+
+	PFM_DBG("user group not allowed to create a task context");
+	return -EPERM;
+}
+
+/*
+ * __pfm_create_context - allocate and initialize a new perfmon context
+ * @req: creation request; ctx_flags is read, ctx_smpl_buf_size and ctx_fd
+ *       are filled in on success
+ * @fmt: sampling format to attach, NULL when no sampling is wanted
+ * @fmt_arg: argument passed along with @fmt to pfm_setup_smpl_fmt()
+ * @mode: PFM_KAPI for a kernel-level caller; any other value makes the
+ *        function allocate a file descriptor for the context
+ * @c: completion stored in the context as msgq_comp, mandatory with PFM_KAPI
+ * @new_ctx: receives the new context on success
+ *
+ * Returns 0 on success, a negative error code otherwise. On failure the
+ * resources acquired so far are handed back, either here or through
+ * pfm_context_free().
+ */
+int __pfm_create_context(struct pfarg_ctx *req,
+			 struct pfm_smpl_fmt *fmt,
+			 void *fmt_arg,
+			 int mode,
+			 struct completion *c,
+			 struct pfm_context **new_ctx)
+{
+	const int user_mode = (mode != PFM_KAPI);
+	struct pfm_context *nctx;
+	struct file *file = NULL;
+	u32 cflags = req->ctx_flags;
+	int fd = 0;
+	int err;
+
+	/* a kernel-level caller must come with a completion */
+	if (!user_mode && c == NULL)
+		return -EINVAL;
+
+	/* take a reference on the PMU description */
+	err = pfm_pmu_conf_get(user_mode);
+	if (err < 0)
+		goto drop_fmt;
+
+	err = pfm_ctx_flags_sane(cflags);
+	if (err < 0)
+		goto drop_conf;
+
+	err = pfm_ctx_permissions(cflags);
+	if (err < 0)
+		goto drop_conf;
+
+	nctx = pfm_context_alloc();
+	if (nctx == NULL) {
+		err = -ENOMEM;
+		goto drop_conf;
+	}
+
+	/*
+	 * the format is linked before anything else so that
+	 * pfm_context_free() does the right thing on error
+	 */
+	nctx->smpl_fmt = fmt;
+
+	if (user_mode) {
+		fd = pfm_alloc_fd(&file);
+		if (fd < 0) {
+			err = -ENFILE;
+			goto free_ctx;
+		}
+	}
+
+	/* a new context starts unloaded */
+	nctx->state = PFM_CTX_UNLOADED;
+
+	/* the flags have to be in place before pfm_find_set() is called */
+	nctx->flags.block = !!(cflags & PFM_FL_NOTIFY_BLOCK);
+	nctx->flags.system = !!(cflags & PFM_FL_SYSTEM_WIDE);
+	nctx->flags.no_msg = !!(cflags & PFM_FL_OVFL_NO_MSG);
+	nctx->flags.mapset = !!(cflags & PFM_FL_MAP_SETS);
+	nctx->flags.trap_reason = PFM_TRAP_REASON_NONE;
+	nctx->flags.kapi = !user_mode;
+
+	INIT_LIST_HEAD(&nctx->list);
+
+	/*
+	 * the arch-specific part is set up ahead of fmt_init()
+	 *
+	 * XXX: fix dependency with fmt_init()
+	 */
+	err = pfm_arch_context_initialize(nctx, cflags);
+	if (err)
+		goto undo_file;
+
+	/* every context owns at least set0 */
+	if (pfm_find_set(nctx, 0, 1) == NULL) {
+		err = -ENOMEM;
+		goto undo_file;
+	}
+
+	pfm_init_evtset(list_entry(nctx->list.next, struct pfm_event_set, list));
+
+	/* sampling requested? */
+	if (fmt) {
+		err = pfm_setup_smpl_fmt(fmt, fmt_arg, nctx, cflags,
+					 mode, file);
+		if (err)
+			goto undo_file;
+	}
+
+	req->ctx_smpl_buf_size = nctx->smpl_size;
+
+	/* make the context reachable from the file */
+	if (file)
+		file->private_data = nctx;
+
+	spin_lock_init(&nctx->lock);
+	init_completion(&nctx->restart_complete);
+
+	nctx->last_act = PFM_INVALID_ACTIVATION;
+	pfm_set_last_cpu(nctx, -1);
+
+	/* empty notification message queue */
+	nctx->msgq_head = nctx->msgq_tail = 0;
+	init_waitqueue_head(&nctx->msgq_wait);
+	nctx->msgq_comp = c;
+
+	PFM_DBG("ctx=%p flags=0x%x system=%d notify_block=%d no_msg=%d"
+		" use_fmt=%d remap=%d ctx_fd=%d mode=%d",
+		nctx,
+		cflags,
+		nctx->flags.system,
+		nctx->flags.block,
+		nctx->flags.no_msg,
+		fmt != NULL,
+		nctx->flags.mapset,
+		fd, mode);
+
+	*new_ctx = nctx;
+
+	/*
+	 * fd_install() comes last, once nothing can fail anymore:
+	 * neither put_filp() nor put_unused_fd() undoes its effect.
+	 */
+	if (user_mode)
+		fd_install(fd, file);
+
+	req->ctx_fd = fd;
+
+	return 0;
+
+undo_file:
+	if (user_mode) {
+		put_filp(file);
+		put_unused_fd(fd);
+	}
+free_ctx:
+	/* calls the right *_put() functions */
+	pfm_context_free(nctx);
+	return err;
+
+drop_conf:
+	pfm_pmu_conf_put();
+drop_fmt:
+	pfm_smpl_fmt_put(fmt);
+	return err;
+}
+
+
--- linux-2.6.17.9.base/perfmon/Makefile 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.17.9/perfmon/Makefile 2006-08-21 03:37:46.000000000 -0700
@@ -0,0 +1,7 @@
+#
+# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <[email protected]>
+# Object files of the perfmon2 core, listed under obj-$(CONFIG_PERFMON).
+obj-$(CONFIG_PERFMON) = perfmon.o perfmon_res.o perfmon_fmt.o perfmon_pmu.o \
+ perfmon_sysfs.o perfmon_syscalls.o perfmon_file.o perfmon_ctxsw.o \
+ perfmon_intr.o perfmon_dfl_smpl.o perfmon_kapi.o perfmon_sets.o
On Wed, 23 Aug 2006 01:06:03 -0700
Stephane Eranian <[email protected]> wrote:
> This patch the core of perfmon2.
>
> The core consists of:
> - back-end to most system calls
> - notification message queue management
> - sampling buffer allocation
> - support functions for sampling
> - context allocation and destruction
> - user level notification
> - perfmon2 initialization
> - permission checking
>
Remind us again why this doesn't use relay files?
> --- linux-2.6.17.9.base/include/linux/perfmon.h 1969-12-31 16:00:00.000000000 -0800
> +++ linux-2.6.17.9/include/linux/perfmon.h 2006-08-21 03:37:46.000000000 -0700
> @@ -0,0 +1,749 @@
> +/*
> + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
> + * Contributed by Stephane Eranian <[email protected]>
> + */
I'm a bit surprised to not see explicit licensing info within the perfmon
files.
> +/*
> + * custom sampling buffer identifier type
> + */
> +typedef __u8 pfm_uuid_t[16];
What does this do, and why is a UUID used?
> +typedef __u32 __bitwise pfm_flags_t;
grumble.
> +/*
> + * Request structure used to define a context
> + */
> +struct pfarg_ctx {
> + pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use */
> + pfm_flags_t ctx_flags; /* noblock/block/syswide */
> + __s32 ctx_fd; /* ret arg: fd for context */
> + __u64 ctx_smpl_buf_size; /* ret arg: actual buffer size */
> + __u64 ctx_reserved3[12]; /* for future use */
> +};
It helps if the comments explicitly identify those structures which are
shared with userspace.
I suspect this structure _is_ shared with userspace, and I wonder about the
alignment of those u64's. It looks to be OK, as long as pfm_flags_t
remains 32-bit.
Given this, and the fact that the type of pfm_flags_t is cast in stone (if
it is indeed exported to userspace), there really is little point in using
a typedef - we won't be changing it. Sometimes there's a clarity case to
be made for a typedef, but usually not.
> +/*
> + * context flags (ctx_flags)
> + *
> + */
> +#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */
> +#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */
> +#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */
> +#define PFM_FL_MAP_SETS 0x10 /* event sets are remapped */
> +
> +
> +/*
> + * argument structure for pfm_write_pmcs()
> + */
> +struct pfarg_pmc {
> + __u16 reg_num; /* which register */
> + __u16 reg_set; /* event set for this register */
> + pfm_flags_t reg_flags; /* input: flags, return: reg error */
> + __u64 reg_value; /* pmc value */
> + __u64 reg_reserved2[4]; /* for future use */
> +};
>
> +/*
> + * argument structure for pfm_write_pmds() and pfm_read_pmds()
> + */
> +struct pfarg_pmd {
> + __u16 reg_num; /* which register */
> + __u16 reg_set; /* event set for this register */
> + pfm_flags_t reg_flags; /* input: flags, return: reg error */
> + __u64 reg_value; /* initial pmc/pmd value */
> + __u64 reg_long_reset; /* value to reload after notification */
> + __u64 reg_short_reset; /* reset after counter overflow */
> + __u64 reg_last_reset_val; /* return: PMD last reset value */
> + __u64 reg_ovfl_switch_cnt; /* #overflows before switch */
> + __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */
> + __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */
> + __u64 reg_smpl_eventid; /* opaque event identifier */
> + __u64 reg_random_mask; /* bitmask used to limit random value */
> + __u32 reg_random_seed; /* seed for randomization */
> + __u32 reg_reserved2[7]; /* for future use */
> +};
OK, these both seem to be cunningly designed to avoid alignment problems
and compiler changes.
Perhaps declaring all these `packed' might provide additional safety there;
not sure.
The "reserved for future use" field is pretty useless unless there is also
version information somewhere. Is there?
Are the reserved-for-future-use fields guaranteed to be zero when the
kernel provides them? (They should be).
Does the kernel check that the reserved-for-future-use fields are all-zero
when userspace provides them? (Perhaps it should?)
> +/*
> + * optional argument to pfm_start(), pass NULL if no arg needed
> + */
> +struct pfarg_start {
> + __u16 start_set; /* event set to start with */
> + __u16 start_reserved1; /* for future use */
> + __u32 start_reserved2; /* for future use */
> + __u64 reserved3[3]; /* for future use */
> +};
> +
> +/*
> + * argument to pfm_load_context()
> + */
> +struct pfarg_load {
> + __u32 load_pid; /* thread to attach to */
> + __u16 load_set; /* set to load first */
> + __u16 load_reserved1; /* for future use */
> + __u64 load_reserved2[3]; /* for future use */
> +};
> +
> +/*
> + * argument to pfm_create_evtsets()/pfm_delete_evtsets()
> + *
> + * max timeout: 1h11mn33s (2<<32 usecs)
> + */
> +struct pfarg_setdesc {
> + __u16 set_id; /* which set */
> + __u16 set_id_next; /* next set to go to */
> + pfm_flags_t set_flags; /* input: flags, return: err flag */
> + __u32 set_timeout; /* req/eff switch timeout in usecs */
> + __u32 set_reserved1; /* for future use */
> + __u64 set_mmap_offset; /* ret arg: cookie for mmap offset */
> + __u64 reserved[5]; /* for future use */
> +};
Why microseconds? 64-bit nanoseconds would be more typical, and perhaps
more useful. (Except people have gone and shipped it now, so it gets
messy, yes?)
> +/*
> + * argument to pfm_getinfo_evtsets()
> + */
> +struct pfarg_setinfo {
> + __u16 set_id; /* which set */
> + __u16 set_id_next; /* out: next set to go to */
> + pfm_flags_t set_flags; /* out:flags or error */
> + __u64 set_ovfl_pmds[PFM_PMD_BV]; /* out: last ovfl PMDs */
> + __u64 set_runs; /* out: #times the set was active */
> + __u32 set_timeout; /* out: effective switch timeout in usecs */
> + __u32 set_reserved1; /* for future use */
> + __u64 set_act_duration; /* out: time set active (cycles) */
> + __u64 set_mmap_offset; /* cookie to for mmap offset */
> + __u64 set_avail_pmcs[PFM_PMC_BV];/* unavailable PMCs */
> + __u64 set_avail_pmds[PFM_PMD_BV];/* unavailable PMDs */
> + __u64 reserved[4]; /* for future use */
> +};
> +
> +/*
> + * default value for the user and group security parameters in
> + * /proc/sys/kernel/perfmon/sys_group
> + * /proc/sys/kernel/perfmon/task_group
> + */
> +#define PFM_GROUP_PERM_ANY -1 /* any user/group */
> +
> +/*
> + * remapped set view
> + *
> + * IMPORTANT: cannot be bigger than PAGE_SIZE
> + */
> +struct pfm_set_view {
> + __u32 set_status; /* set status: active/inact */
> + __u32 set_reserved1; /* for future use */
> + __u64 set_runs; /* number of activations */
> + __u64 set_pmds[PFM_MAX_PMDS]; /* 64-bit value of PMDS */
> + volatile unsigned long set_seq; /* sequence number of updates */
> +};
What's that volatile doing in there?
> +/*
> + * pfm_set_view status flags
> + */
> +#define PFM_SETVFL_ACTIVE 0x1 /* set is active */
> +
> +struct pfm_ovfl_msg {
> + __u32 msg_type; /* generic message header */
> + __u32 msg_ovfl_pid; /* process id */
> + __u64 msg_ovfl_pmds[PFM_HW_PMD_BV];/* overflowed PMDs */
> + __u16 msg_active_set; /* active set at overflow */
> + __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */
> + __u32 msg_ovfl_tid; /* kernel thread id */
> + __u64 msg_ovfl_ip; /* IP on PMU intr */
> +};
> +
> +#define PFM_MSG_OVFL 1 /* an overflow happened */
> +#define PFM_MSG_END 2 /* task to which context was attached ended */
> +
> +union pfm_msg {
> + __u32 type;
> + struct pfm_ovfl_msg pfm_ovfl_msg;
> +};
> +
> +/*
> + * perfmon version number
> + */
> +#define PFM_VERSION_MAJ 2U
> +#define PFM_VERSION_MIN 2U
> +#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\
> + (PFM_VERSION_MIN & 0xffff))
> +#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
> +#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
There's a version.
> +#define pfm_ctx_arch(c) ((struct pfm_arch_context *)((c)+1))
ick, so we can do pfm_ctx_arch(42) and there will be no compiler warnings.
Suggest that this be converted to an inline function.
> +
> +static inline void pfm_inc_activation(void)
> +{
> + __get_cpu_var(pmu_activation_number)++;
> +}
> +
> +static inline void pfm_set_activation(struct pfm_context *ctx)
> +{
> + ctx->last_act = __get_cpu_var(pmu_activation_number);
> +}
> +
> +static inline void pfm_set_last_cpu(struct pfm_context *ctx, int cpu)
> +{
> + ctx->last_cpu = cpu;
> +}
> +
> +static inline void pfm_modview_begin(struct pfm_event_set *set)
> +{
> + set->view->set_seq++;
> +}
> +
> +static inline void pfm_modview_end(struct pfm_event_set *set)
> +{
> + set->view->set_seq++;
> +}
> +
> +static inline void pfm_retflag_set(u32 flags, u32 val)
> +{
> + flags &= ~PFM_REG_RETFL_MASK;
> + flags |= (val);
> +}
All the above need caller-provided locking. It would be nice to add a
comment describing what it is.
> +int __pfm_write_pmcs(struct pfm_context *, struct pfarg_pmc *, int);
> +int __pfm_write_pmds(struct pfm_context *, struct pfarg_pmd *, int, int);
> +int __pfm_read_pmds(struct pfm_context *, struct pfarg_pmd *, int);
Prototypes are more useful if the programmer fills in the (well-chosen)
argument identifiers.
> +u64 carta_random32 (u64);
This declaration shouldn't be in this header, should it?
> +static inline void pfm_put_ctx(struct pfm_context *ctx)
> +{
> + fput(ctx->filp);
> +}
This wrapper makes conversion to fput_light() a bit more complex.
> +#define PFM_ONE_64 ((u64)1)
heh, OK, C sucks
> +#define PFM_BPL 64
> +#define PFM_LBPL 6 /* log2(BPL) */
#define PFM_BPL (1 << PFM_LBPL)
> +
> +/*
> + * those operations are not provided by linux/bitmap.h.
Please, add them there then.
> + * We do not need atomicity nor volatile accesses here.
> + * All bitmaps are 64-bit wide.
> + */
> +static inline void pfm_bv_set(u64 *bv, unsigned int rnum)
> +{
> + bv[rnum>>PFM_LBPL] |= PFM_ONE_64 << (rnum&(PFM_BPL-1));
> +}
> +
> +static inline int pfm_bv_isset(u64 *bv, unsigned int rnum)
> +{
> + return bv[rnum>>PFM_LBPL] & (PFM_ONE_64 <<(rnum&(PFM_BPL-1))) ? 1 : 0;
> +}
> +
> +static inline void pfm_bv_clear(u64 *bv, unsigned int rnum)
> +{
> + bv[rnum>>PFM_LBPL] &= ~(PFM_ONE_64 << (rnum&(PFM_BPL-1)));
> +}
> +
> +/*
> + * read a single PMD register. PMD register mapping is provided by PMU
> + * description module. Some PMD registers are require a special read
> + * handler (e.g., virtual PMD mapping to a SW resource).
> + */
> +static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum)
> +{
> + if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V))
> + return pfm_pmu_conf->pmd_sread(ctx, cnum);
> +
> + return pfm_arch_read_pmd(ctx, cnum);
> +}
> +
> +static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value)
> +{
> + /*
> + * PMD writes are ignored for read-only registers
> + */
> + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO)
> + return;
> +
> + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) {
> + pfm_pmu_conf->pmd_swrite(ctx, cnum, value);
> + return;
> + }
> + pfm_arch_write_pmd(ctx, cnum, value);
> +}
> +
> +#define ulp(_x) ((unsigned long *)_x)
OK this is related to bitmap API shortcomings. Let's try to get those
shortcomings fixed and then make all this go away.
> +
> +static union pfm_msg *pfm_get_new_msg(struct pfm_context *ctx)
> +{
> + int idx, next;
> +
> + next = (ctx->msgq_tail+1) % PFM_MAX_MSGS;
> +
> + PFM_DBG("head=%d tail=%d", ctx->msgq_head, ctx->msgq_tail);
> +
> + if (next == ctx->msgq_head)
> + return NULL;
> +
> + idx = ctx->msgq_tail;
> + ctx->msgq_tail = next;
> +
> + PFM_DBG("head=%d tail=%d msg=%d",
> + ctx->msgq_head,
> + ctx->msgq_tail, idx);
> +
> + return ctx->msgq+idx;
> +}
This is the inferior way of doing a ringbuffer.
It's better to let `head' and `tail' wrap all the way through 0xffffffff and
to only mask them off when actually using them as offsets. That way,
(head - tail == 0): empty
(head - tail == PFM_MAX_MSGS): full
(head - tail): number-of-items
which is nicer. It requires that PFM_MAX_MSGS be a power of two, which is
reasonable.
> +
> +/*
> + * only called in for the current task
> + */
> +static int pfm_setup_smpl_fmt(struct pfm_smpl_fmt *fmt, void *fmt_arg,
> + struct pfm_context *ctx, u32 ctx_flags,
> + int mode, struct file *filp)
> +{
> + size_t size = 0;
> + int ret = 0;
> +
> + /*
> + * validate parameters
> + */
> + if (fmt->fmt_validate) {
> + ret = (*fmt->fmt_validate)(ctx_flags, pfm_pmu_conf->num_pmds,
> + fmt_arg);
> + PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret);
> + if (ret)
> + goto error;
> + }
> +
> + /*
> + * check if buffer format wants to use perfmon
> + * buffer allocation/mapping service
> + */
> + size = 0;
We already did that.
> + if (fmt->fmt_getsize) {
> + ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size);
> + if (ret) {
> + PFM_DBG("cannot get size ret=%d", ret);
> + goto error;
> + }
> + }
> +
> + if (size) {
> +#ifdef CONFIG_IA64_PERFMON_COMPAT
> + if (mode == PFM_COMPAT)
> + ret = pfm_smpl_buffer_alloc_old(ctx, size, filp);
> + else
> +#endif
> + {
> + ret = pfm_smpl_buffer_alloc(ctx, size);
> + }
Would be better to create more per-arch helpers and get the IA64 stuff out
of perfmon/perfmon.c.
> + if (ret)
> + goto error;
> +
> + }
> +
> + if (fmt->fmt_init) {
> + ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags,
> + pfm_pmu_conf->num_pmds,
> + fmt_arg);
> + if (ret)
> + goto error_buffer;
> + }
> + return 0;
> +
> +error_buffer:
> + if (!ctx->flags.kapi)
> + pfm_release_buf_space(ctx->smpl_size);
> + /*
> + * we do not call fmt_exit, if init has failed
> + */
> + vfree(ctx->smpl_addr);
> +error:
> + return ret;
> +}
> +
>
> ...
>
> +struct pfm_context *pfm_context_alloc(void)
> +{
> + struct pfm_context *ctx;
> +
> + /*
> + * allocate context structure
> + * the architecture specific portion is allocated
> + * right after the struct pfm_context struct. It is
> + * accessible at ctx_arch = (ctx+1)
> + */
> + ctx = kmem_cache_alloc(pfm_ctx_cachep, SLAB_ATOMIC);
> + if (ctx) {
> + memset(ctx, 0, sizeof(*ctx)+PFM_ARCH_CTX_SIZE);
> + PFM_DBG("alloc ctx @%p", ctx);
> + }
> + return ctx;
> +}
Can we avoid the unreliable SLAB_ATOMIC here?
> +/*
> + * in new mode, we only allocate the kernel buffer, an explicit mmap()
> + * is needed to remap the buffer at the user level
> + */
> +int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize)
> +{
> + void *addr;
> + size_t size;
> + int ret;
> +
> + /*
> + * the fixed header + requested size and align to page boundary
> + */
> + size = PAGE_ALIGN(rsize);
> +
> + PFM_DBG("sampling buffer rsize=%zu size=%zu", rsize, size);
> +
> + if (!ctx->flags.kapi) {
> + ret = pfm_reserve_buf_space(size);
> + if (ret) return ret;
newline, please.
> + }
> +
> + addr = vmalloc(size);
> + if (addr == NULL) {
> + PFM_DBG("cannot allocate sampling buffer");
> + goto unres;
> + }
> +
> + memset(addr, 0, size);
> +
> + ctx->smpl_addr = addr;
> + ctx->smpl_size = size;
> +
> + PFM_DBG("kernel smpl buffer @%p", addr);
> +
> + return 0;
> +unres:
> + if (!ctx->flags.kapi)
> + pfm_release_buf_space(size);
> + return -ENOMEM;
> +}
> +
> +static inline u64 pfm_new_pmd_value (struct pfm_pmd *reg, int reset_mode)
> +{
> + u64 val, mask;
> + u64 new_seed, old_seed;
> +
> + val = reset_mode == PFM_PMD_RESET_LONG ? reg->long_reset : reg->short_reset;
> + old_seed = reg->seed;
> + mask = reg->mask;
> +
> + if (reg->flags & PFM_REGFL_RANDOM) {
> + new_seed = carta_random32(old_seed);
> +
> + /* counter values are negative numbers! */
> + val -= (old_seed & mask);
> + if ((mask >> 32) != 0)
> + /* construct a full 64-bit random value: */
> + new_seed |= (u64)carta_random32((u32)(old_seed >> 32)) << 32;
carta_random32 already returns u64. I think neither cast is needed here.
> + reg->seed = new_seed;
> + }
> + reg->lval = val;
> + return val;
> +}
>
> ...
>
> +void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set,
> + int reset_mode)
> +{
> + u64 ovfl_mask, hw_val;
> + u64 *cnt_mask, *reset_pmds;
> + u64 val;
> + unsigned int i, max_pmd, not_masked;
> +
> + reset_pmds = set->reset_pmds;
> + max_pmd = pfm_pmu_conf->max_pmd;
> +
> + ovfl_mask = pfm_pmu_conf->ovfl_mask;
> + cnt_mask = pfm_pmu_conf->cnt_pmds;
> + not_masked = ctx->state != PFM_CTX_MASKED;
> +
> + PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d",
> + reset_mode == PFM_PMD_RESET_LONG ? "long" : "short",
> + (unsigned long long)reset_pmds[0],
> + not_masked);
> +
> + pfm_modview_begin(set);
> +
> + for (i = 0; i < max_pmd; i++) {
> +
> + if (pfm_bv_isset(reset_pmds, i)) {
> +
Unneeded newline here (lots of places)
> + val = pfm_new_pmd_value(set->pmds + i,
> + reset_mode);
val = pfm_new_pmd_value(set->pmds + i, reset_mode);
> + set->view->set_pmds[i]= val;
> +
> + if (not_masked) {
> + if (pfm_bv_isset(cnt_mask, i)) {
> + hw_val = val & ovfl_mask;
> + } else {
> + hw_val = val;
> + }
Unneeded braces.
> + pfm_write_pmd(ctx, i, hw_val);
> + }
> + PFM_DBG_ovfl("pmd%u set=%u sval=0x%llx",
> + i,
> + set->id,
> + (unsigned long long)val);
> + }
> + }
> +
> + pfm_modview_end(set);
> +
> + /*
> + * done with reset
> + */
> + bitmap_zero(ulp(reset_pmds), max_pmd);
Let's fix the bitmap code.
> + /*
> + * make changes visible
> + */
> + if (not_masked)
> + pfm_arch_serialize();
> +}
> +
>
> ...
>
> +/*
> + * called from pfm_handle_work() and __pfm_restart()
> + * for system-wide and per-thread context.
> + */
> +static void pfm_resume_after_ovfl(struct pfm_context *ctx)
> +{
> + struct pfm_smpl_fmt *fmt;
> + u32 rst_ctrl;
> + struct pfm_event_set *set;
> + u64 *reset_pmds;
> + void *hdr;
> + int state, ret;
> +
> + hdr = ctx->smpl_addr;
> + fmt = ctx->smpl_fmt;
> + state = ctx->state;
> + set = ctx->active_set;
> + ret = 0;
> +
> + if (hdr) {
> + rst_ctrl = 0;
> + prefetch(hdr);
> + if (fmt->fmt_restart)
> + ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED,
> + &rst_ctrl, hdr);
> + } else {
> + rst_ctrl= PFM_OVFL_CTRL_RESET;
> + }
> + reset_pmds = set->reset_pmds;
> +
> + PFM_DBG("restart=%d set=%u r_pmds=0x%llx switch=%d ctx_state=%d",
> + ret,
> + set->id,
> + (unsigned long long)reset_pmds[0],
> + !(set->priv_flags & PFM_SETFL_PRIV_SWITCH),
> + state);
> +
> + if (!ret) {
> + /*
> + * switch set if needed
> + */
> + if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) {
> + set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH;
> + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0);
> + set = ctx->active_set;
I assume there's some locking in place for all of this. If so, it's useful
to mention that in the function's introductory comment block - it's rather
important.
Or stick an assert_spin_locked() in there, which is rather stronger...
> + } else if (rst_ctrl & PFM_OVFL_CTRL_RESET) {
> + if (!bitmap_empty(ulp(set->reset_pmds), pfm_pmu_conf->max_pmd))
> + pfm_reset_pmds(ctx, set, PFM_PMD_RESET_LONG);
> + }
> +
> + if (!(rst_ctrl & PFM_OVFL_CTRL_MASK)) {
> + pfm_unmask_monitoring(ctx);
> + } else {
> + PFM_DBG("stopping monitoring?");
> + }
braces...
> + ctx->state = PFM_CTX_LOADED;
> + }
> + ctx->flags.can_restart = 0;
> +}
> +
>
> ...
>
> +/*
> + * pfm_handle_work() can be called with interrupts enabled
> + * (TIF_NEED_RESCHED) or disabled. The down_interruptible
> + * call may sleep, therefore we must re-enable interrupts
> + * to avoid deadlocks. It is safe to do so because this function
> + * is called ONLY when returning to user level (PUStk=1), in which case
> + * there is no risk of kernel stack overflow due to deep
> + * interrupt nesting.
> + *
> + * input:
> + * - current task pointer
> + */
What's PUStk?
> +void __pfm_handle_work(struct task_struct *task)
> +{
> + struct pfm_context *ctx;
> + unsigned long flags, dummy_flags;
> + unsigned int reason;
> + int ret;
> +
> + ctx = task->pfm_context;
> + if (ctx == NULL) {
> + PFM_ERR("handle_work [%d] has no ctx", task->pid);
> + return;
> + }
> +
> + BUG_ON(ctx->flags.system);
> +
> + spin_lock_irqsave(&ctx->lock, flags);
> +
> + clear_thread_flag(TIF_NOTIFY_RESUME);
> +
> + /*
> + * extract reason for being here and clear
> + */
> + reason = ctx->flags.trap_reason;
> +
> + if (reason == PFM_TRAP_REASON_NONE)
> + goto nothing_to_do;
> +
> + ctx->flags.trap_reason = PFM_TRAP_REASON_NONE;
> +
> + PFM_DBG("reason=%d state=%d", reason, ctx->state);
> +
> + /*
> + * must be done before we check for simple-reset mode
> + */
> + if (ctx->state == PFM_CTX_ZOMBIE)
> + goto do_zombie;
> +
> + if (reason == PFM_TRAP_REASON_RESET)
> + goto skip_blocking;
> +
> + /*
> + * restore interrupt mask to what it was on entry.
> + * Could be enabled/diasbled.
> + */
> + spin_unlock_irqrestore(&ctx->lock, flags);
> +
> + /*
> + * force interrupt enable because of down_interruptible()
> + */
> + local_irq_enable();
> +
> + PFM_DBG("before block sleeping");
> +
> + /*
> + * may go through without blocking on SMP systems
> + * if restart has been received already by the time we call down()
> + */
> + ret = wait_for_completion_interruptible(&ctx->restart_complete);
> +
> + PFM_DBG("after block sleeping ret=%d", ret);
> +
> + /*
> + * lock context and mask interrupts again
> + * We save flags into a dummy because we may have
> + * altered interrupts mask compared to entry in this
> + * function.
> + */
> + spin_lock_irqsave(&ctx->lock, dummy_flags);
> +
> + if (ctx->state == PFM_CTX_ZOMBIE)
> + goto do_zombie;
> +
> + /*
> + * in case of interruption of down() we don't restart anything
> + */
> + if (ret < 0)
> + goto nothing_to_do;
> +
> +skip_blocking:
> + pfm_resume_after_ovfl(ctx);
> +
> +nothing_to_do:
> +
> + /*
> + * restore flags as they were upon entry
> + */
> + spin_unlock_irqrestore(&ctx->lock, flags);
> + return;
> +
> +do_zombie:
> + PFM_DBG("context is zombie, bailing out");
> +
> + __pfm_unload_context(ctx, 0);
> +
> + /*
> + * enable interrupt for vfree()
> + */
> + local_irq_enable();
> +
> + /*
> + * actual context free
> + */
> + pfm_context_free(ctx);
> +
> + /*
> + * restore interrupts as they were upon entry
> + */
> + local_irq_restore(flags);
> +}
Yeah, the local_irq handling here is unpleasing.
> +/*
> + * called only from exit_thread(): task == current
> + * we come here only if current has a context
> + * attached (loaded or masked or zombie)
> + */
> +void __pfm_exit_thread(struct task_struct *task)
> +{
> + struct pfm_context *ctx;
> + unsigned long flags;
> + int free_ok = 0;
> +
> + ctx = task->pfm_context;
> +
> + BUG_ON(ctx->flags.system);
> +
> + spin_lock_irqsave(&ctx->lock, flags);
> +
> + PFM_DBG("state=%d", ctx->state);
> +
> + /*
> + * __pfm_unload_context() cannot fail
> + * in the context states we are interested in
> + */
> + switch(ctx->state) {
^ space, please.
> + case PFM_CTX_LOADED:
> + case PFM_CTX_MASKED:
> + __pfm_unload_context(ctx, 0);
> + pfm_end_notify_user(ctx);
> + break;
> + case PFM_CTX_ZOMBIE:
> + __pfm_unload_context(ctx, 0);
> + free_ok = 1;
> + break;
> + default:
> + BUG_ON(ctx->state != PFM_CTX_LOADED);
> + break;
> + }
We normally indent switch statement bodies one tabstop less than this.
> + spin_unlock_irqrestore(&ctx->lock, flags);
> +
> + /*
> + * All memory free operations (especially for vmalloc'ed memory)
> + * MUST be done with interrupts ENABLED.
> + */
> + if (free_ok)
> + pfm_context_free(ctx);
> +}
> +
> +/*
> + * this function is called from pfm_init()
> + * pfm_pmu_conf is NULL at this point
> + */
> +void __cpuinit pfm_init_percpu (void *dummy)
^ no space ;)
> +{
> + pfm_arch_init_percpu();
> +}
> +
> +/*
> + * global initialization routine, executed only once
> + */
> +int __init pfm_init(void)
> +{
> + int ret;
> +
> + PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN);
> +
> + pfm_ctx_cachep = kmem_cache_create("pfm_context",
> + sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
> + SLAB_HWCACHE_ALIGN, 0, NULL, NULL);
> + if (pfm_ctx_cachep == NULL) {
> + PFM_ERR("cannot initialize context slab");
> + goto error_disable;
> + }
> + ret = pfm_sets_init();
> + if (ret)
> + goto error_disable;
> +
> +
> + if (pfm_sysfs_init())
> + goto error_disable;
> +
> + /*
> + * one time, global initialization
> + */
> + if (pfm_arch_initialize())
> + goto error_disable;
> +
> + init_pfm_fs();
> +
> + /*
> + * per cpu initialization (interrupts must be enabled)
> + */
> + on_each_cpu(pfm_init_percpu, NULL, 1, 1);
> +
> + return 0;
> +error_disable:
> + return -1;
> +}
Three of these error paths will leak *pfm_ctx_cachep. The kernel will
panic next time the module is loaded (if this is a loadable module..)
> +/*
> + * must use subsys_initcall() to ensure that the perfmon2 core
> + * is initialized before any PMU description module when they are
> + * compiled in.
> + */
> +subsys_initcall(pfm_init);
> +
> +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start)
> +{
> + struct task_struct *task, *owner_task;
> + struct pfm_event_set *new_set, *old_set;
> + u64 now_itc;
> + int is_self, flags;
> +
> + task = ctx->task;
> +
> + /*
> + * context must be loaded.
> + * we do not support starting while in MASKED state
> + * (mostly because of set switching issues)
> + */
> + if (ctx->state != PFM_CTX_LOADED)
> + return -EINVAL;
> +
> + old_set = new_set = ctx->active_set;
> +
> + /*
> + * always the case for system-wide
> + */
> + if (task == NULL)
> + task = current;
> +
> + is_self = task == current;
> +
> + /*
> + * argument is provided?
> + */
> + if (start) {
> + /*
> + * find the set to load first
> + */
> + new_set = pfm_find_set(ctx, start->start_set, 0);
> + if (new_set == NULL) {
> + PFM_DBG("event set%u does not exist",
> + start->start_set);
> + return -EINVAL;
> + }
> + }
> +
> + PFM_DBG("cur_set=%u req_set=%u",
> + old_set->id,
> + new_set->id);
> +
> + /*
> + * if we need to change the active set we need
> + * to check if we can access the PMU
> + */
> + if (new_set != old_set) {
> + owner_task = __get_cpu_var(pmu_owner);
From this I'll assume that either a) this function is always called under
some locking (but which?) or b) it hasn't been tested with
CONFIG_DEBUG_PREEMPT.
> + /*
> + * system-wide: must run on the right CPU
> + * per-thread : must be the owner of the PMU context
> + *
> + * pfm_switch_sets() returns with monitoring stopped
> + */
> + if (is_self) {
> + pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1);
> + } else {
> + /*
> + * In a UP kernel, the PMU may contain the state
> + * of the task we want to operate on, yet the task
> + * may be switched out (lazy save). We need to save
> + * current state (old_set), switch active_set and
> + * mark it for reload.
> + */
> + if (owner_task == task) {
> + pfm_modview_begin(old_set);
> + pfm_save_pmds(ctx, old_set);
> + pfm_modview_end(old_set);
> + }
> + ctx->active_set = new_set;
> + new_set->view->set_status |= PFM_SETVFL_ACTIVE;
> + new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
> + }
> + }
> + /*
> + * mark as started, must be done before calling
> + * pfm_arch_start()
> + */
> + ctx->flags.started = 1;
> +
> + /*
> + * at this point, monitoring is:
> + * - stopped if we switched set (self-monitoring)
> + * - stopped if never started
> + * - started if calling pfm_start() in sequence
> + */
> + now_itc = pfm_arch_get_itc();
> + flags = new_set->flags;
> +
> + if (is_self) {
> + unsigned long info;
> + if (flags & PFM_SETFL_TIME_SWITCH)
> + info = PFM_CPUINFO_TIME_SWITCH;
> + else
> + info = 0;
> +
> + __get_cpu_var(pfm_syst_info) = info;
> + }
> + /*
> + * in system-wide, the new_set may EXCL_IDLE, in which
> + * case pfm_start() must actually stop monitoring
> + */
> + if (current->pid == 0 && (flags & PFM_SETFL_EXCL_IDLE))
> + pfm_arch_stop(task, ctx, new_set);
> + else
> + pfm_arch_start(task, ctx, new_set);
> +
> + /*
> + * we restart total duration even if context was
> + * already started. In that case, counts are simply
> + * reset.
> + *
> + * For system-wide, we start counting even when we exclude
> + * idle and pfm_start() called by idle.
> + *
> + * For per-thread, if not self-monitoring, the statement
> + * below will have no effect because thread is stopped.
> + * The field is reset of ctxsw in.
> + *
> + * if monitoring is masked (MASKED), this statement
> + * will be overriden in pfm_unmask_monitoring()
> + */
> + ctx->duration_start = now_itc;
> + new_set->duration_start = now_itc;
> +
> + return 0;
> +}
>
> ...
>
> +/*
> + * XXX: interrupts are masked yet monitoring may be active. Hence they
> + * might be a counter overflow during the call. It will be kept pending
> + * and we might return inconsistent unless we check the state of the counter
> + * and compensate for the overflow. Note that we will not loose a sample
> + * when sampling, however, there may be an issue with simple counting and
> + * virtualization.
> + */
What issue?
> +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count)
> +{
> +
>
> ...
>
> +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count,
> + int compat)
> +{
>
> ...
>
> + bitmap_or(ulp(set->used_pmds),
> + ulp(set->used_pmds),
> + ulp(reset_pmds), max_pmd);
argh.
> +/*
> + * should not call when task == current
> + */
> +static int pfm_bad_permissions(struct task_struct *task)
> +{
> + /* inspired by ptrace_attach() */
> + PFM_DBG("cur: euid=%d uid=%d gid=%d task: euid=%d "
> + "suid=%d uid=%d egid=%d cap:%d sgid=%d",
> + current->euid,
> + current->uid,
> + current->gid,
> + task->euid,
> + task->suid,
> + task->uid,
> + task->egid,
> + task->sgid, capable(CAP_SYS_PTRACE));
> +
> + return ((current->uid != task->euid)
> + || (current->uid != task->suid)
> + || (current->uid != task->uid)
> + || (current->gid != task->egid)
> + || (current->gid != task->sgid)
> + || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE);
> +}
A comment which describes the design decisions behind this permission check
would be a pretty important improvement.
I wonder if selinux wants to get this deep into things.
> +
> +/*
> + * cannot attach if :
> + * - kernel task
> + * - task not owned by caller
> + * - task incompatible with context mode
> + */
there's a comment.
What does "incompatible with context mode" mean?
> +static int pfm_task_incompatible(struct pfm_context *ctx,
> + struct task_struct *task)
> +{
> + /*
> + * no kernel task or task not owned by caller
> + */
> + if (!task->mm) {
> + PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
> + return -EPERM;
> + }
> +
> + if (pfm_bad_permissions(task)) {
> + PFM_DBG("no permission to attach to [%d]", task->pid);
> + return -EPERM;
> + }
> +
> + /*
> + * cannot block in self-monitoring mode
> + */
> + if (ctx->flags.block && task == current) {
> + PFM_DBG("cannot load a in blocking mode on self for [%d]",
tpyo.
> + task->pid);
> + return -EINVAL;
> + }
> +
> + if (task->state == EXIT_ZOMBIE || task->state == EXIT_DEAD) {
That isn't right. These things are recorded in task_struct.exit_state, not in
task_struct.state.
> + PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
> + return -EBUSY;
> + }
> +
> + /*
> + * always ok for self
> + */
> + if (task == current)
> + return 0;
> +
> + if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) {
> + PFM_DBG("cannot attach to non-stopped task [%d] state=%ld",
> + task->pid, task->state);
> + return -EBUSY;
> + }
> + PFM_DBG("before wait_inactive() task [%d] state=%ld",
> + task->pid, task->state);
> + /*
> + * make sure the task is off any CPU
> + */
> + wait_task_inactive(task);
There it is again. This is a busywait.
> + PFM_DBG("after wait_inactive() task [%d] state=%ld",
> + task->pid, task->state);
> + /* more to come... */
> +
> + return 0;
> +}
> +static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
> + struct task_struct **task)
> +{
> + struct task_struct *p = current;
> + int ret;
> +
> + /* XXX: need to add more checks here */
> + if (pid < 2)
> + return -EPERM;
;)
What are we actually trying to do here?
> + if (pid != current->pid) {
> +
> + read_lock(&tasklist_lock);
> +
> + p = find_task_by_pid(pid);
> +
> + /* make sure task cannot go away while we operate on it */
> + if (p)
> + get_task_struct(p);
> +
> + read_unlock(&tasklist_lock);
> +
> + if (p == NULL)
> + return -ESRCH;
> + }
> +
> + ret = pfm_task_incompatible(ctx, p);
> + if (!ret) {
> + *task = p;
> + } else if (p != current) {
> + put_task_struct(p);
> + }
braces.
> + return ret;
> +}
> +
> +static int pfm_check_task_exist(struct pfm_context *ctx)
> +{
> + struct task_struct *g, *t;
> + int ret = -ESRCH;
> +
> + read_lock(&tasklist_lock);
> +
> + do_each_thread (g, t) {
^ space
> + if (t->pfm_context == ctx) {
> + ret = 0;
> + break;
> + }
> + } while_each_thread (g, t);
again
> + read_unlock(&tasklist_lock);
> +
> + PFM_DBG("ret=%d ctx=%p", ret, ctx);
> +
> + return ret;
> +}
These functions are expensive. Hopefully not called much?
> +
> +static int pfm_load_context_thread(struct pfm_context *ctx, pid_t pid,
> + struct pfm_event_set *set)
> +{
> + struct task_struct *task = NULL;
> + struct pfm_context *old;
> + u32 set_flags;
> + unsigned long info;
> + int ret, state;
> +
> + state = ctx->state;
> + set_flags = set->flags;
> +
> + PFM_DBG("load_pid [%d] set=%u runs=%llu set_flags=0x%x",
> + pid,
> + set->id,
> + (unsigned long long)set->view->set_runs,
> + set_flags);
> +
> + if (ctx->flags.block && pid == current->pid) {
> + PFM_DBG("cannot use blocking mode in while self-monitoring");
> + return -EINVAL;
> + }
> +
> + ret = pfm_get_task(ctx, pid, &task);
> + if (ret) {
> + PFM_DBG("load_pid [%d] get_task=%d", pid, ret);
> + return ret;
> + }
> +
> + ret = pfm_arch_load_context(ctx, task);
> + if (ret) {
> + put_task_struct(task);
Further down, we only do put_task_struct() if task!=current.
> + return ret;
> + }
> +
> + /*
> + * now reserve the session
> + */
> + ret = pfm_reserve_session(ctx, -1);
> + if (ret)
> + goto error;
> +
> + /*
> + * task is necessarily stopped at this point.
> + *
> + * If the previous context was zombie, then it got removed in
> + * pfm_ctxswout_thread(). Therefore we should not see it here.
> + * If we see a context, then this is an active context
> + *
> + */
> + PFM_DBG("before cmpxchg() old_ctx=%p new_ctx=%p",
> + task->pfm_context, ctx);
> +
> + ret = -EEXIST;
> +
> + old = cmpxchg(&task->pfm_context, NULL, ctx);
> + if (old != NULL) {
> + PFM_DBG("load_pid [%d] has already a context "
> + "old=%p new=%p cur=%p",
> + pid,
> + old,
> + ctx,
> + task->pfm_context);
> + goto error_unres;
> + }
> +
> + /*
> + * link context to task
> + */
> + ctx->task = task;
> + set_tsk_thread_flag(task, TIF_PERFMON);
> +
> + /*
> + * commit active set
> + */
> + ctx->active_set = set;
> +
> + pfm_modview_begin(set);
> +
> + set->view->set_runs++;
Locking for this increment?
> + set->view->set_status |= PFM_SETVFL_ACTIVE;
and for this?
> + /*
> + * self-monitoring
> + */
> + if (task == current) {
> +#ifndef CONFIG_SMP
> + struct pfm_context *ctxp;
> +
> + /*
> + * in UP per-thread, due to lazy save
> + * there could be a context from another
> + * task. We need to push it first before
> + * installing our new state
> + */
> + ctxp = __get_cpu_var(pmu_ctx);
This code does smp_processor_id() a lot. Hopefully it's all preempt-correct..
> + if (ctxp) {
> + struct pfm_event_set *setp;
> + setp = ctxp->active_set;
> + pfm_modview_begin(setp);
> + pfm_save_pmds(ctxp, setp);
> + setp->view->set_status &= ~PFM_SETVFL_ACTIVE;
> + pfm_modview_end(setp);
> + /*
> + * do not clear ownership because we rewrite
> + * right away
> + */
> + }
> +#endif
> + pfm_set_last_cpu(ctx, smp_processor_id());
> + pfm_inc_activation();
> + pfm_set_activation(ctx);
> +
> + /*
> + * setting PFM_CPUINFO_TIME_SWITCH, triggers
> + * further checking if __pfm_handle_switch_timeout().
> + * switch timeout is effectively decremented only once
> + * monitoring has been activated via pfm_start() or
> + * any user level equivalent.
> + */
> + if (set_flags & PFM_SETFL_TIME_SWITCH) {
> + info = PFM_CPUINFO_TIME_SWITCH;
> + __get_cpu_var(pfm_syst_info) = info;
> + }
> + /*
> + * load all PMD from set
> + * load all PMC from set
> + */
> + pfm_arch_restore_pmds(ctx, set);
> + pfm_arch_restore_pmcs(ctx, set);
> +
> + /*
> + * set new ownership
> + */
> + pfm_set_pmu_owner(task, ctx);
> +
> + PFM_DBG("context loaded on PMU for [%d] TIF=%d", task->pid, test_tsk_thread_flag(task, TIF_PERFMON));
> + } else {
> +
> + /* force a full reload */
> + ctx->last_act = PFM_INVALID_ACTIVATION;
> + pfm_set_last_cpu(ctx, -1);
> + set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
> + PFM_DBG("context loaded next ctxswin for [%d]", task->pid);
> + }
> +
> + pfm_modview_end(set);
> +
> + ret = 0;
> +
> +error_unres:
> + if (ret)
> + pfm_release_session(ctx, -1);
> +error:
> + /*
> + * release task, there is now a link with the context
> + */
> + if (task != current) {
> + put_task_struct(task);
> +
> + if (!ret) {
> + ret = pfm_check_task_exist(ctx);
> + if (ret) {
> + ctx->state = PFM_CTX_UNLOADED;
> + ctx->task = NULL;
> + }
> + }
> + }
> + return ret;
> +}
>
> ...
>
> +int __pfm_unload_context(struct pfm_context *ctx, int defer_release)
It's fairly unfruitful reviewing functions when you don't know what they
do. Comments really help.