2009-07-01 09:45:40

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [GIT-PULL -tip][PATCH 0/6] perf_counter patches

Ingo,

Please pull perf_counter patches:
The following changes since commit 092304de242705abf24edcb0fc7beed4c4276865:
Ingo Molnar (1):
Merge branch 'perfcounters/urgent'

are available in the git repository at:

git://git.kernel.org/pub/scm/linux/kernel/git/jaswinder/linux-2.6-tip.git master

Jaswinder Singh Rajput (6):
perf stat: define MATCH_EVENT for easy attrs checking
perf stat: treat same behaviour for all CYCLES and CLOCKS
perf_counter: Add Generalized Hardware vectored co-processor support for AMD
perf_counter: Add Generalized Hardware interrupt support for AMD
perf_counter: Add hardware vector events for nehalem
perf_counter: Add hardware interrupt events for nehalem, core2 and atom

arch/x86/kernel/cpu/perf_counter.c | 95 ++++++++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 27 ++++++++++
kernel/perf_counter.c | 2 +
tools/perf/builtin-stat.c | 60 ++++++++++++++---------
tools/perf/util/parse-events.c | 73 +++++++++++++++++++++++++++
5 files changed, 233 insertions(+), 24 deletions(-)

Complete diff:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4cf4ce..4ef1838 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,42 @@ static const u64 atom_hw_cache_event_ids
},
};

+/*
+ * Generalized hw vectored co-processor event table
+ */
+
+static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];
+
+static const u64 nehalem_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0214, /* ARITH.MUL */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0,
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
+};
+
+/*
+ * Generalized hw interrupt event table
+ */
+
+static u64 __read_mostly hw_interrupt_event_ids[PERF_COUNT_HW_INTERRUPT_MAX];
+
+static const u64 nehalem_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x011D, /* HW_INT.RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x021D, /* HW_INT.CYCLES_MASKED */
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x041D, /* HW_INT.CYCLES_PENDING_AND_MASKED*/
+};
+
+static const u64 core2_atom_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00C8, /* HW_INT_RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x01C6, /* CYCLES_INT_MASKED.CYCLES_INT_MASKED*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x02C6, /* CYCLES_INT_MASKED.CYCLES_INT_PENDING_AND_MASKED*/
+};
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +517,25 @@ static const u64 amd_hw_cache_event_ids
},
};

+static const u64 amd_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0400, /* Dispatched FPU Store */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0FCB, /* Retired x87|(MMX & 3Dnow)
+ |(SSE & SSE2) Instructions */
+};
+
+
+static const u64 amd_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00CF, /* Interrupts Taken */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x00CD, /* Interrupts-Masked Cycles*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x00CE, /* Int Mask+Pending Cycles */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +714,28 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_VECTOR_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_vector_event_ids[attr->config];
+
+ return 0;
+}
+
+static inline int
+set_hw_interrupt_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_INTERRUPT_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_interrupt_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +793,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_VECTOR)
+ return set_hw_vector_attr(hwc, attr);
+
+ if (attr->type == PERF_TYPE_HW_INTERRUPT)
+ return set_hw_interrupt_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1437,6 +1520,8 @@ static int intel_pmu_init(void)
case 29: /* six-core 45 nm xeon "Dunnington" */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Core2 events, ");
break;
@@ -1444,12 +1529,18 @@ static int intel_pmu_init(void)
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, nehalem_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Atom events, ");
break;
@@ -1468,6 +1559,10 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, amd_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7..c7165b9 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,8 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_VECTOR = 5,
+ PERF_TYPE_HW_INTERRUPT = 6,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +91,31 @@ enum perf_hw_cache_op_result_id {
};

/*
+ * Generalized hardware vectored co-processor counters:
+ */
+enum perf_hw_vector_id {
+ PERF_COUNT_HW_VECTOR_ADD = 0,
+ PERF_COUNT_HW_VECTOR_MULTIPLY = 1,
+ PERF_COUNT_HW_VECTOR_DIVIDE = 2,
+ PERF_COUNT_HW_VECTOR_IDLE_CYCLES = 3,
+ PERF_COUNT_HW_VECTOR_STALL_CYCLES = 4,
+ PERF_COUNT_HW_VECTOR_OPS = 5,
+
+ PERF_COUNT_HW_VECTOR_MAX, /* non-ABI */
+};
+
+/*
+ * Generalized hardware interrupt counters:
+ */
+enum perf_hw_interrupt_id {
+ PERF_COUNT_HW_INTERRUPT = 0,
+ PERF_COUNT_HW_INTERRUPT_MASK = 1,
+ PERF_COUNT_HW_INTERRUPT_PENDING_MASK = 2,
+
+ PERF_COUNT_HW_INTERRUPT_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50d..7a529a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3838,6 +3838,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_VECTOR:
+ case PERF_TYPE_HW_INTERRUPT:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2e03524..af61c29 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -96,6 +96,10 @@ static u64 walltime_nsecs_noise;
static u64 runtime_cycles_avg;
static u64 runtime_cycles_noise;

+#define MATCH_EVENT(t, c, counter) \
+ (attrs[counter].type == PERF_TYPE_##t && \
+ attrs[counter].config == PERF_COUNT_##c)
+
#define ERR_PERF_OPEN \
"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"

@@ -132,13 +136,31 @@ static void create_perf_stat_counter(int counter, int pid)
*/
static inline int nsec_counter(int counter)
{
- if (attrs[counter].type != PERF_TYPE_SOFTWARE)
- return 0;
+ if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
+ MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ return 1;
+
+ return 0;
+}

- if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
+/*
+ * Does the counter have cycles as a unit?
+ */
+static inline int cycle_counter(int counter)
+{
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter) ||
+ MATCH_EVENT(HARDWARE, HW_BUS_CYCLES, counter))
return 1;

- if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ return 0;
+}
+
+/*
+ * Does the counter have instructions as a unit?
+ */
+static inline int instruction_counter(int counter)
+{
+ if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter))
return 1;

return 0;
@@ -192,11 +214,9 @@ static void read_counter(int counter)
/*
* Save the full runtime - to allow normalization during printout:
*/
- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ if (nsec_counter(counter))
runtime_nsecs[run_idx] = count[0];
- if (attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
+ else if (cycle_counter(counter))
runtime_cycles[run_idx] = count[0];
}

@@ -290,13 +310,10 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)

fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));

- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+ if (nsec_counter(counter) && walltime_nsecs_avg)
+ fprintf(stderr, " # %10.3f CPUs ",
+ (double)count[0] / (double)walltime_nsecs_avg);

- if (walltime_nsecs_avg)
- fprintf(stderr, " # %10.3f CPUs ",
- (double)count[0] / (double)walltime_nsecs_avg);
- }
print_noise(count, noise);
}

@@ -304,18 +321,13 @@ static void abs_printout(int counter, u64 *count, u64 *noise)
{
fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter));

- if (runtime_cycles_avg &&
- attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
+ if (instruction_counter(counter) && runtime_cycles_avg)
fprintf(stderr, " # %10.3f IPC ",
(double)count[0] / (double)runtime_cycles_avg);
- } else {
- if (runtime_nsecs_avg) {
- fprintf(stderr, " # %10.3f M/sec",
- (double)count[0]/runtime_nsecs_avg*1000.0);
- }
- }
+ else if (runtime_nsecs_avg)
+ fprintf(stderr, " # %10.3f M/sec",
+ (double)count[0]/runtime_nsecs_avg*1000.0);
+
print_noise(count, noise);
}

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4d042f1..5ea4c12 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,25 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};

+#define CHVECTOR(x) .type = PERF_TYPE_HW_VECTOR, .config = PERF_COUNT_HW_VECTOR_##x
+
+static struct event_symbol vector_event_symbols[] = {
+ { CHVECTOR(ADD), "vec-adds", "add" },
+ { CHVECTOR(MULTIPLY), "vec-muls", "multiply" },
+ { CHVECTOR(DIVIDE), "vec-divs", "divide" },
+ { CHVECTOR(IDLE_CYCLES), "vec-idle-cycles", "vec-empty-cycles"},
+ { CHVECTOR(STALL_CYCLES), "vec-stall-cycles", "vec-busy-cycles"},
+ { CHVECTOR(OPS), "vec-ops", "vec-operations"},
+};
+
+#define CHINT(x) .type = PERF_TYPE_HW_INTERRUPT, .config = PERF_COUNT_HW_##x
+
+static struct event_symbol interrupt_event_symbols[] = {
+ { CHINT(INTERRUPT), "interrupts", "interrupt" },
+ { CHINT(INTERRUPT_MASK), "int-mask-cycles", "masked" },
+ { CHINT(INTERRUPT_PENDING_MASK),"int-pending-mask-cycles", "" },
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -172,6 +191,16 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}

+ case PERF_TYPE_HW_VECTOR:
+ if (config < PERF_COUNT_HW_VECTOR_MAX)
+ return vector_event_symbols[config].symbol;
+ return "unknown-vector";
+
+ case PERF_TYPE_HW_INTERRUPT:
+ if (config < PERF_COUNT_HW_INTERRUPT_MAX)
+ return interrupt_event_symbols[config].symbol;
+ return "unknown-interrupt";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -250,6 +279,32 @@ static int check_events(const char *str, unsigned int i)
return 0;
}

+static int check_vector_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, vector_event_symbols[i].symbol,
+ strlen(vector_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(vector_event_symbols[i].alias))
+ if (!strncmp(str, vector_event_symbols[i].alias,
+ strlen(vector_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
+static int check_interrupt_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, interrupt_event_symbols[i].symbol,
+ strlen(interrupt_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(interrupt_event_symbols[i].alias))
+ if (!strncmp(str, interrupt_event_symbols[i].alias,
+ strlen(interrupt_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
@@ -297,6 +352,24 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
}
}

+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++) {
+ if (check_vector_events(str, i)) {
+ attr->type = vector_event_symbols[i].type;
+ attr->config = vector_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(interrupt_event_symbols); i++) {
+ if (check_interrupt_events(str, i)) {
+ attr->type = interrupt_event_symbols[i].type;
+ attr->config = interrupt_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
return parse_generic_hw_symbols(str, attr);
}



2009-07-01 09:45:58

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 1/6 -tip] perf stat: define MATCH_EVENT for easy attrs checking


MATCH_EVENT is useful:
1. for multiple attrs checking
2. avoid repetition of PERF_TYPE_ and PERF_COUNT_ and save space
3. avoids line breakage

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
tools/perf/builtin-stat.c | 27 ++++++++++-----------------
1 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2e03524..6bf2b80 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -96,6 +96,10 @@ static u64 walltime_nsecs_noise;
static u64 runtime_cycles_avg;
static u64 runtime_cycles_noise;

+#define MATCH_EVENT(t, c, counter) \
+ (attrs[counter].type == PERF_TYPE_##t && \
+ attrs[counter].config == PERF_COUNT_##c)
+
#define ERR_PERF_OPEN \
"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"

@@ -132,13 +136,8 @@ static void create_perf_stat_counter(int counter, int pid)
*/
static inline int nsec_counter(int counter)
{
- if (attrs[counter].type != PERF_TYPE_SOFTWARE)
- return 0;
-
- if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
- return 1;
-
- if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
+ MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
return 1;

return 0;
@@ -192,11 +191,9 @@ static void read_counter(int counter)
/*
* Save the full runtime - to allow normalization during printout:
*/
- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
runtime_nsecs[run_idx] = count[0];
- if (attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
runtime_cycles[run_idx] = count[0];
}

@@ -290,9 +287,7 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)

fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));

- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
-
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
if (walltime_nsecs_avg)
fprintf(stderr, " # %10.3f CPUs ",
(double)count[0] / (double)walltime_nsecs_avg);
@@ -305,9 +300,7 @@ static void abs_printout(int counter, u64 *count, u64 *noise)
fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter));

if (runtime_cycles_avg &&
- attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
+ MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
fprintf(stderr, " # %10.3f IPC ",
(double)count[0] / (double)runtime_cycles_avg);
} else {
--
1.6.0.6


2009-07-01 09:47:05

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 2/6 -tip] perf stat: treat same behaviour for all CYCLES and CLOCKS


For normalization also added SW_CPU_CLOCK and HW_BUS_CYCLES

For nsec_printout also added SW_CPU_CLOCK

Added helper functions to check counter unit as cycles and instructions

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
tools/perf/builtin-stat.c | 49 +++++++++++++++++++++++++++++++-------------
1 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6bf2b80..af61c29 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -144,6 +144,29 @@ static inline int nsec_counter(int counter)
}

/*
+ * Does the counter have cycles as a unit?
+ */
+static inline int cycle_counter(int counter)
+{
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter) ||
+ MATCH_EVENT(HARDWARE, HW_BUS_CYCLES, counter))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Does the counter have instructions as a unit?
+ */
+static inline int instruction_counter(int counter)
+{
+ if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter))
+ return 1;
+
+ return 0;
+}
+
+/*
* Read out the results of a single counter:
*/
static void read_counter(int counter)
@@ -191,9 +214,9 @@ static void read_counter(int counter)
/*
* Save the full runtime - to allow normalization during printout:
*/
- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ if (nsec_counter(counter))
runtime_nsecs[run_idx] = count[0];
- if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
+ else if (cycle_counter(counter))
runtime_cycles[run_idx] = count[0];
}

@@ -287,11 +310,10 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)

fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));

- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
- if (walltime_nsecs_avg)
- fprintf(stderr, " # %10.3f CPUs ",
- (double)count[0] / (double)walltime_nsecs_avg);
- }
+ if (nsec_counter(counter) && walltime_nsecs_avg)
+ fprintf(stderr, " # %10.3f CPUs ",
+ (double)count[0] / (double)walltime_nsecs_avg);
+
print_noise(count, noise);
}

@@ -299,16 +321,13 @@ static void abs_printout(int counter, u64 *count, u64 *noise)
{
fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter));

- if (runtime_cycles_avg &&
- MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
+ if (instruction_counter(counter) && runtime_cycles_avg)
fprintf(stderr, " # %10.3f IPC ",
(double)count[0] / (double)runtime_cycles_avg);
- } else {
- if (runtime_nsecs_avg) {
- fprintf(stderr, " # %10.3f M/sec",
- (double)count[0]/runtime_nsecs_avg*1000.0);
- }
- }
+ else if (runtime_nsecs_avg)
+ fprintf(stderr, " # %10.3f M/sec",
+ (double)count[0]/runtime_nsecs_avg*1000.0);
+
print_noise(count, noise);
}

--
1.6.0.6


2009-07-01 09:48:29

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 3/6 -tip] perf_counter: Add Generalized Hardware vectored co-processor support for AMD


$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

4218 vec-adds (scaled from 66.60%)
7426 vec-muls (scaled from 66.67%)
5441 vec-divs (scaled from 66.29%)
821982187 vec-idle-cycles (scaled from 66.45%)
2681 vec-stall-cycles (scaled from 67.11%)
7887 vec-ops (scaled from 66.88%)

0.417614573 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3

Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':

17552264 vec-adds (scaled from 66.28%)
19715258 vec-muls (scaled from 66.63%)
15862733 vec-divs (scaled from 66.82%)
23735187095 vec-idle-cycles (scaled from 66.89%)
11353159 vec-stall-cycles (scaled from 66.90%)
36628571 vec-ops (scaled from 66.48%)

298.350012843 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv

Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':

20177177044 vec-adds (scaled from 66.63%)
34101687027 vec-muls (scaled from 66.64%)
3984060862 vec-divs (scaled from 66.71%)
26349684710 vec-idle-cycles (scaled from 66.65%)
9052001905 vec-stall-cycles (scaled from 66.66%)
76440734242 vec-ops (scaled from 66.71%)

272.523058097 seconds time elapsed

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 33 +++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 15 ++++++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 38 ++++++++++++++++++++++++++++++++++++
4 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4cf4ce..8092200 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
},
};

+/*
+ * Generalized hw vectored co-processor event table
+ */
+
+static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +487,17 @@ static const u64 amd_hw_cache_event_ids
},
};

+static const u64 amd_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0400, /* Dispatched FPU Store */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0FCB, /* Retired x87|(MMX & 3Dnow)
+ |(SSE & SSE2) Instructions */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +676,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_VECTOR_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_vector_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +744,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_VECTOR)
+ return set_hw_vector_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1468,6 +1499,8 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7..e91b712 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_VECTOR = 5,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +90,20 @@ enum perf_hw_cache_op_result_id {
};

/*
+ * Generalized hardware vectored co-processor counters:
+ */
+enum perf_hw_vector_id {
+ PERF_COUNT_HW_VECTOR_ADD = 0,
+ PERF_COUNT_HW_VECTOR_MULTIPLY = 1,
+ PERF_COUNT_HW_VECTOR_DIVIDE = 2,
+ PERF_COUNT_HW_VECTOR_IDLE_CYCLES = 3,
+ PERF_COUNT_HW_VECTOR_STALL_CYCLES = 4,
+ PERF_COUNT_HW_VECTOR_OPS = 5,
+
+ PERF_COUNT_HW_VECTOR_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50d..dd3848a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3838,6 +3838,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_VECTOR:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4d042f1..5e5d17e 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,17 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};

+#define CHVECTOR(x) .type = PERF_TYPE_HW_VECTOR, .config = PERF_COUNT_HW_VECTOR_##x
+
+static struct event_symbol vector_event_symbols[] = {
+ { CHVECTOR(ADD), "vec-adds", "add" },
+ { CHVECTOR(MULTIPLY), "vec-muls", "multiply" },
+ { CHVECTOR(DIVIDE), "vec-divs", "divide" },
+ { CHVECTOR(IDLE_CYCLES), "vec-idle-cycles", "vec-empty-cycles"},
+ { CHVECTOR(STALL_CYCLES), "vec-stall-cycles", "vec-busy-cycles"},
+ { CHVECTOR(OPS), "vec-ops", "vec-operations"},
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -172,6 +183,11 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}

+ case PERF_TYPE_HW_VECTOR:
+ if (config < PERF_COUNT_HW_VECTOR_MAX)
+ return vector_event_symbols[config].symbol;
+ return "unknown-vector";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -250,6 +266,19 @@ static int check_events(const char *str, unsigned int i)
return 0;
}

+static int check_vector_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, vector_event_symbols[i].symbol,
+ strlen(vector_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(vector_event_symbols[i].alias))
+ if (!strncmp(str, vector_event_symbols[i].alias,
+ strlen(vector_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
@@ -297,6 +326,15 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
}
}

+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++) {
+ if (check_vector_events(str, i)) {
+ attr->type = vector_event_symbols[i].type;
+ attr->config = vector_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
return parse_generic_hw_symbols(str, attr);
}

--
1.6.0.6


2009-07-01 09:48:42

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD


$ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

377 interrupts
53429936 int-mask-cycles
1119 int-pending-mask-cycles

0.371457539 seconds time elapsed

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 30 ++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 12 ++++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 35 +++++++++++++++++++++++++++++++++++
4 files changed, 78 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8092200..487df5c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -378,6 +378,12 @@ static const u64 atom_hw_cache_event_ids

static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];

+/*
+ * Generalized hw interrupt event table
+ */
+
+static u64 __read_mostly hw_interrupt_event_ids[PERF_COUNT_HW_INTERRUPT_MAX];
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -498,6 +504,14 @@ static const u64 amd_hw_vector_event_ids[] =
|SSE & SSE2) Instructions */
};

+
+static const u64 amd_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00CF, /* Interrupts Taken */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x00CD, /* Interrupts-Masked Cycles*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x00CE, /* Int Mask+Pending Cycles */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -687,6 +701,17 @@ set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_interrupt_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_INTERRUPT_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_interrupt_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -747,6 +772,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_VECTOR)
return set_hw_vector_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_INTERRUPT)
+ return set_hw_interrupt_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1501,6 +1529,8 @@ static int amd_pmu_init(void)
sizeof(hw_cache_event_ids));
memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, amd_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e91b712..c7165b9 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -32,6 +32,7 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_HW_VECTOR = 5,
+ PERF_TYPE_HW_INTERRUPT = 6,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -104,6 +105,17 @@ enum perf_hw_vector_id {
};

/*
+ * Generalized hardware interrupt counters:
+ */
+enum perf_hw_interrupt_id {
+ PERF_COUNT_HW_INTERRUPT = 0,
+ PERF_COUNT_HW_INTERRUPT_MASK = 1,
+ PERF_COUNT_HW_INTERRUPT_PENDING_MASK = 2,
+
+ PERF_COUNT_HW_INTERRUPT_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index dd3848a..7a529a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3839,6 +3839,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
case PERF_TYPE_HW_VECTOR:
+ case PERF_TYPE_HW_INTERRUPT:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5e5d17e..5ea4c12 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -51,6 +51,14 @@ static struct event_symbol vector_event_symbols[] = {
{ CHVECTOR(OPS), "vec-ops", "vec-operations"},
};

+#define CHINT(x) .type = PERF_TYPE_HW_INTERRUPT, .config = PERF_COUNT_HW_##x
+
+static struct event_symbol interrupt_event_symbols[] = {
+ { CHINT(INTERRUPT), "interrupts", "interrupt" },
+ { CHINT(INTERRUPT_MASK), "int-mask-cycles", "masked" },
+ { CHINT(INTERRUPT_PENDING_MASK),"int-pending-mask-cycles", "" },
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -188,6 +196,11 @@ char *event_name(int counter)
return vector_event_symbols[config].symbol;
return "unknown-vector";

+ case PERF_TYPE_HW_INTERRUPT:
+ if (config < PERF_COUNT_HW_INTERRUPT_MAX)
+ return interrupt_event_symbols[config].symbol;
+ return "unknown-interrupt";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -279,6 +292,19 @@ static int check_vector_events(const char *str, unsigned int i)
return 0;
}

+static int check_interrupt_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, interrupt_event_symbols[i].symbol,
+ strlen(interrupt_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(interrupt_event_symbols[i].alias))
+ if (!strncmp(str, interrupt_event_symbols[i].alias,
+ strlen(interrupt_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
@@ -335,6 +361,15 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
}
}

+ for (i = 0; i < ARRAY_SIZE(interrupt_event_symbols); i++) {
+ if (check_interrupt_events(str, i)) {
+ attr->type = interrupt_event_symbols[i].type;
+ attr->config = interrupt_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
return parse_generic_hw_symbols(str, attr);
}

--
1.6.0.6


2009-07-01 09:49:33

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 5/6 -tip] perf_counter: Add hardware vector events for nehalem


Add hardware vector events for nehalem

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 12 ++++++++++++
1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 487df5c..8f05226 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -378,6 +378,16 @@ static const u64 atom_hw_cache_event_ids

static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];

+static const u64 nehalem_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0214, /* ARITH.MUL */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0,
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
+};
+
/*
* Generalized hw interrupt event table
*/
@@ -1503,6 +1513,8 @@ static int intel_pmu_init(void)
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
--
1.6.0.6


2009-07-01 09:49:44

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 6/6 -tip] perf_counter: Add hardware interrupt events for nehalem, core2 and atom


Add hardware interrupt events for nehalem, core2 and atom

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++++++++++
1 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8f05226..4ef1838 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -394,6 +394,20 @@ static const u64 nehalem_hw_vector_event_ids[] =

static u64 __read_mostly hw_interrupt_event_ids[PERF_COUNT_HW_INTERRUPT_MAX];

+static const u64 nehalem_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x011D, /* HW_INT.RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x021D, /* HW_INT.CYCLES_MASKED */
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x041D, /* HW_INT.CYCLES_PENDING_AND_MASKED*/
+};
+
+static const u64 core2_atom_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00C8, /* HW_INT_RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x01C6, /* CYCLES_INT_MASKED.CYCLES_INT_MASKED*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x02C6, /* CYCLES_INT_MASKED.CYCLES_INT_PENDING_AND_MASKED*/
+};
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -1506,6 +1520,8 @@ static int intel_pmu_init(void)
case 29: /* six-core 45 nm xeon "Dunnington" */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Core2 events, ");
break;
@@ -1515,12 +1531,16 @@ static int intel_pmu_init(void)
sizeof(hw_cache_event_ids));
memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, nehalem_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Atom events, ");
break;
--
1.6.0.6


2009-07-01 11:21:59

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 3/6 -tip] perf_counter: Add Generalized Hardware vectored co-processor support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> $ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
>
> Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
>
> 20177177044 vec-adds (scaled from 66.63%)
> 34101687027 vec-muls (scaled from 66.64%)
> 3984060862 vec-divs (scaled from 66.71%)
> 26349684710 vec-idle-cycles (scaled from 66.65%)
> 9052001905 vec-stall-cycles (scaled from 66.66%)
> 76440734242 vec-ops (scaled from 66.71%)
>
> 272.523058097 seconds time elapsed

Ok, this looks very nice now - a highly generic and still very
useful looking categorization of FPU/MMX/SSE related co-processor hw
events.

I'm still waiting for feedback from Paulus, BenH and Anton, whether
this kind of generic enumeration fits PowerPC well enough.

I think from a pure logic/math/physics POV this categorization is
pretty complete: a modern co-processor has three fundamental states
we are interested in: idle, busy and busy-stalled. It has an 'ops'
metric that counts instructions, plus the main operations are add,
mul and div.

Cell is, I guess, a complication to be solved, as there the various
vector units have separate decoders and separate thread state. This
above abstraction only covers the portion of CPU designs where there
are vector operations in the main ALU decoder stream of instructions.

One thing that might be worth exposing is vectored loads/stores in
general. But we don't have those in the generic ALU enumeration yet,
and if we add them, it should be done together.

Also, the Nehalem bits need to be tested, i'll try to find time for
that.

Good stuff.

Ingo

2009-07-01 11:24:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

>
> $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
>
> Performance counter stats for 'ls -lR /usr/include/':
>
> 377 interrupts
> 53429936 int-mask-cycles
> 1119 int-pending-mask-cycles
>
> 0.371457539 seconds time elapsed

Agreed, this is another useful generalization - and the 'cycles
pending' metrics are not retrievable via any software means.

We could and should probably add a software counter for hardirqs as
well. That would allow the vector/irqnr information to be passed in,
and it would allow architectures without irq metrics in the PMU to
have this counter too.

This way we could profile based on a specific interrupt source only
- say based on the networking card.

Ingo

2009-07-01 11:27:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 3/6 -tip] perf_counter: Add Generalized Hardware vectored co-processor support for AMD


* Ingo Molnar <[email protected]> wrote:

> > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> >
> > 20177177044 vec-adds (scaled from 66.63%)
> > 34101687027 vec-muls (scaled from 66.64%)
> > 3984060862 vec-divs (scaled from 66.71%)
> > 26349684710 vec-idle-cycles (scaled from 66.65%)
> > 9052001905 vec-stall-cycles (scaled from 66.66%)
> > 76440734242 vec-ops (scaled from 66.71%)
> >
> > 272.523058097 seconds time elapsed

btw., the 'perf list' bits are missing - any new counter added
should be listed by 'perf list' as well - otherwise people won't know
what we have and what to use.

Ingo

2009-07-01 11:31:19

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [tip:perfcounters/urgent] perf stat: Define MATCH_EVENT for easy attr checking

Commit-ID: b9ebdcc0ce1c676ebf5dc4f6df6b440d8fcf88ab
Gitweb: http://git.kernel.org/tip/b9ebdcc0ce1c676ebf5dc4f6df6b440d8fcf88ab
Author: Jaswinder Singh Rajput <[email protected]>
AuthorDate: Wed, 1 Jul 2009 15:05:09 +0530
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 1 Jul 2009 13:28:38 +0200

perf stat: Define MATCH_EVENT for easy attr checking

MATCH_EVENT is useful:

1. for multiple attrs checking
2. avoid repetition of PERF_TYPE_ and PERF_COUNT_ and save space
3. avoids line breakage

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
Cc: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
tools/perf/builtin-stat.c | 27 ++++++++++-----------------
1 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 095a90e..01cc07e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -96,6 +96,10 @@ static u64 walltime_nsecs_noise;
static u64 runtime_cycles_avg;
static u64 runtime_cycles_noise;

+#define MATCH_EVENT(t, c, counter) \
+ (attrs[counter].type == PERF_TYPE_##t && \
+ attrs[counter].config == PERF_COUNT_##c)
+
#define ERR_PERF_OPEN \
"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"

@@ -133,13 +137,8 @@ static void create_perf_stat_counter(int counter, int pid)
*/
static inline int nsec_counter(int counter)
{
- if (attrs[counter].type != PERF_TYPE_SOFTWARE)
- return 0;
-
- if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
- return 1;
-
- if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
+ MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
return 1;

return 0;
@@ -194,11 +193,9 @@ static void read_counter(int counter)
/*
* Save the full runtime - to allow normalization during printout:
*/
- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
runtime_nsecs[run_idx] = count[0];
- if (attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
runtime_cycles[run_idx] = count[0];
}

@@ -292,9 +289,7 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)

fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));

- if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
- attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
-
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
if (walltime_nsecs_avg)
fprintf(stderr, " # %10.3f CPUs ",
(double)count[0] / (double)walltime_nsecs_avg);
@@ -307,9 +302,7 @@ static void abs_printout(int counter, u64 *count, u64 *noise)
fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter));

if (runtime_cycles_avg &&
- attrs[counter].type == PERF_TYPE_HARDWARE &&
- attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
+ MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
fprintf(stderr, " # %10.3f IPC ",
(double)count[0] / (double)runtime_cycles_avg);
} else {

2009-07-01 11:39:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/6 -tip] perf stat: treat same behaviour for all CYCLES and CLOCKS


* Jaswinder Singh Rajput <[email protected]> wrote:

> For normalization also added SW_CPU_CLOCK and HW_BUS_CYCLES
>
> For nsec_printout also added SW_CPU_CLOCK
>
> Added helper functions to check counter unit as cycles and instructions
>
> Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> ---
> tools/perf/builtin-stat.c | 49 +++++++++++++++++++++++++++++++-------------
> 1 files changed, 34 insertions(+), 15 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 6bf2b80..af61c29 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -144,6 +144,29 @@ static inline int nsec_counter(int counter)
> }
>
> /*
> + * Does the counter have cycles as a unit?
> + */
> +static inline int cycle_counter(int counter)
> +{
> + if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter) ||
> + MATCH_EVENT(HARDWARE, HW_BUS_CYCLES, counter))
> + return 1;
> +
> + return 0;
> +}
> +
> +/*
> + * Does the counter have instructions as a unit?
> + */
> +static inline int instruction_counter(int counter)
> +{
> + if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter))
> + return 1;
> +
> + return 0;
> +}

This should be done a bit differently. Each event should have a
'unit type' index in its descriptor table, which links back to
_another_ event, specifying its unit.

For example:

(PERF_COUNT_HW_INSTRUCTIONS, -1 , "instructions")
(PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_INSTRUCTIONS)
(PERF_COUNT_HW_CACHE_MISSES, PERF_COUNT_HW_INSTRUCTIONS)

'-1' signals an event that has itself as a unit, and a string field
gives us the pretty-print form of the unit.

The same could be done for other types of events as well, such as
software events:

(PERF_COUNT_SW_CPU_CLOCK, -1 , "nsecs")
(PERF_COUNT_SW_TASK_CLOCK, PERF_COUNT_SW_CPU_CLOCK )

This way normalization can be fully automated: say if we print out
PERF_COUNT_HW_CACHE_MISSES, we see that it is in units of
PERF_COUNT_HW_INSTRUCTIONS so we can print out that unit and can
normalize it to that metric.

the 'IPC' (Instructions Per Cycle) field is special, and if you are
interested in this then i think it should be implemented as a
special 'compound' event: it is represented by the division of two
events.

( If it's implemented like that then IPC will be printed in a
separate line, not as part of the instructions line - but that's
OK. )

Other 'compound' events might be possible too: for example a new
cache-hits field could be cache-refs minus cache-misses.

I.e. the simplest model for 'compound' events would be:

X = A / B
X = A - B
X = A + B

We could list them in the event table, with a flag that specifies
which arithmetic operation connects two 'atomic' counters.

Then the adding of a new compound event would only be the matter of
adding one more line to the event table.

Can you see any problems with this approach?

Ingo

2009-07-01 11:41:40

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 3/6 -tip] perf_counter: Add Generalized Hardware vectored co-processor support for AMD

On Wed, 2009-07-01 at 13:27 +0200, Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
> > > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > >
> > > 20177177044 vec-adds (scaled from 66.63%)
> > > 34101687027 vec-muls (scaled from 66.64%)
> > > 3984060862 vec-divs (scaled from 66.71%)
> > > 26349684710 vec-idle-cycles (scaled from 66.65%)
> > > 9052001905 vec-stall-cycles (scaled from 66.66%)
> > > 76440734242 vec-ops (scaled from 66.71%)
> > >
> > > 272.523058097 seconds time elapsed
>
> btw., the 'perf list' bits are missing - any new counter added
> should be listed by 'perf list' as well - otherwise people won't know
> what we have and what to use.
>

Even cache is not available for 'perf list'. Should I also resend patch
for adding cache along with vector and interrupt?

Thanks,
--
JSR

2009-07-01 11:46:31

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT-PULL -tip][PATCH 0/6] perf_counter patches


* Jaswinder Singh Rajput <[email protected]> wrote:

> Ingo,
>
> Please pull perf_counter patches :
> The following changes since commit 092304de242705abf24edcb0fc7beed4c4276865:
> Ingo Molnar (1):
> Merge branch 'perfcounters/urgent'
>
> are available in the git repository at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/jaswinder/linux-2.6-tip.git master
>
> Jaswinder Singh Rajput (6):
> perf stat: define MATCH_EVENT for easy attrs checking
> perf stat: treat same behaviour for all CYCLES and CLOCKS
> perf_counter: Add Generalized Hardware vectored co-processor support for AMD
> perf_counter: Add Generalized Hardware interrupt support for AMD
> perf_counter: Add hardware vector events for nehalem
> perf_counter: Add hardware interrupt events for nehalem, core2 and atom

A patch nit-picking sidenote, please try to use more consistent
capitalization in commit titles. I fixed the first commit's title
to be:

b9ebdcc: perf stat: Define MATCH_EVENT for easy attr checking

Also, the way we want to refer to the above Intel CPU models is
"Corei7/Nehalem, Core2 and Atom".

Ingo

2009-07-01 11:51:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 3/6 -tip] perf_counter: Add Generalized Hardware vectored co-processor support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Wed, 2009-07-01 at 13:27 +0200, Ingo Molnar wrote:
> > * Ingo Molnar <[email protected]> wrote:
> >
> > > > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > > >
> > > > 20177177044 vec-adds (scaled from 66.63%)
> > > > 34101687027 vec-muls (scaled from 66.64%)
> > > > 3984060862 vec-divs (scaled from 66.71%)
> > > > 26349684710 vec-idle-cycles (scaled from 66.65%)
> > > > 9052001905 vec-stall-cycles (scaled from 66.66%)
> > > > 76440734242 vec-ops (scaled from 66.71%)
> > > >
> > > > 272.523058097 seconds time elapsed
> >
> > btw., the 'perf list' bits are missing - any new counter added
> > should be listed by 'perf list' as well - otherwise people won't know
> > what we have and what to use.
> >
>
> Even cache is not available for 'perf list'. Should I also resend
> patch for adding cache along with vector and interrupt?

I'd suggest for you to send a separate patch for the cache bits
first (that way it's not held up by other dependencies) - and keep
the vector and irq bits in their respective patches.

I.e. when we add new generic events, we want to enable it in the
full tool-space in a single patch.

Ingo

2009-07-02 09:45:45

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem


This output is from AMD box:

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

4218 vec-adds (scaled from 66.60%)
7426 vec-muls (scaled from 66.67%)
5441 vec-divs (scaled from 66.29%)
821982187 vec-idle-cycles (scaled from 66.45%)
2681 vec-stall-cycles (scaled from 67.11%)
7887 vec-ops (scaled from 66.88%)

0.417614573 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3

Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':

17552264 vec-adds (scaled from 66.28%)
19715258 vec-muls (scaled from 66.63%)
15862733 vec-divs (scaled from 66.82%)
23735187095 vec-idle-cycles (scaled from 66.89%)
11353159 vec-stall-cycles (scaled from 66.90%)
36628571 vec-ops (scaled from 66.48%)

298.350012843 seconds time elapsed

$ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv

Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':

20177177044 vec-adds (scaled from 66.63%)
34101687027 vec-muls (scaled from 66.64%)
3984060862 vec-divs (scaled from 66.71%)
26349684710 vec-idle-cycles (scaled from 66.65%)
9052001905 vec-stall-cycles (scaled from 66.66%)
76440734242 vec-ops (scaled from 66.71%)

272.523058097 seconds time elapsed

$ ./perf list shows vector events like :

vec-adds OR add [Hardware vector event]
vec-muls OR multiply [Hardware vector event]
vec-divs OR divide [Hardware vector event]
vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
vec-ops OR vec-operations [Hardware vector event]

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++
include/linux/perf_counter.h | 15 ++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 55 ++++++++++++++++++++++++++++++++++++
4 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 36c3dc7..48f28b7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,22 @@ static const u64 atom_hw_cache_event_ids
},
};

+/*
+ * Generalized hw vectored co-processor event table
+ */
+
+static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];
+
+static const u64 nehalem_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0214, /* ARITH.MUL */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0,
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
+};
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +497,17 @@ static const u64 amd_hw_cache_event_ids
},
};

+static const u64 amd_hw_vector_event_ids[] =
+{
+ [PERF_COUNT_HW_VECTOR_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0400, /* Dispatched FPU Store */
+ [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_VECTOR_OPS] = 0x0FCB, /* Retired x87|(MMX & 3Dnow)
+ |SSE & SSE2) Instructions */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +686,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_VECTOR_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_vector_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +754,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_VECTOR)
+ return set_hw_vector_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1444,6 +1485,8 @@ static int intel_pmu_init(void)
case 26:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
@@ -1468,6 +1511,8 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
+ sizeof(hw_vector_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7..e91b712 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_VECTOR = 5,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +90,20 @@ enum perf_hw_cache_op_result_id {
};

/*
+ * Generalized hardware vectored co-processor counters:
+ */
+enum perf_hw_vector_id {
+ PERF_COUNT_HW_VECTOR_ADD = 0,
+ PERF_COUNT_HW_VECTOR_MULTIPLY = 1,
+ PERF_COUNT_HW_VECTOR_DIVIDE = 2,
+ PERF_COUNT_HW_VECTOR_IDLE_CYCLES = 3,
+ PERF_COUNT_HW_VECTOR_STALL_CYCLES = 4,
+ PERF_COUNT_HW_VECTOR_OPS = 5,
+
+ PERF_COUNT_HW_VECTOR_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50d..dd3848a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3838,6 +3838,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_VECTOR:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5184959..8213dfb 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,17 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};

+#define CHVECTOR(x) .type = PERF_TYPE_HW_VECTOR, .config = PERF_COUNT_HW_VECTOR_##x
+
+static struct event_symbol vector_event_symbols[] = {
+ { CHVECTOR(ADD), "vec-adds", "add" },
+ { CHVECTOR(MULTIPLY), "vec-muls", "multiply" },
+ { CHVECTOR(DIVIDE), "vec-divs", "divide" },
+ { CHVECTOR(IDLE_CYCLES), "vec-idle-cycles", "vec-empty-cycles"},
+ { CHVECTOR(STALL_CYCLES), "vec-stall-cycles", "vec-busy-cycles"},
+ { CHVECTOR(OPS), "vec-ops", "vec-operations"},
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -172,6 +183,11 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}

+ case PERF_TYPE_HW_VECTOR:
+ if (config < PERF_COUNT_HW_VECTOR_MAX)
+ return vector_event_symbols[config].symbol;
+ return "unknown-vector";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -280,6 +296,21 @@ static int check_events(const char *str, unsigned int i)
return 0;
}

+static int check_vector_events(const char *str, unsigned int i)
+{
+ int n;
+
+ n = strlen(vector_event_symbols[i].symbol);
+ if (!strncmp(str, vector_event_symbols[i].symbol, n))
+ return n;
+
+ n = strlen(vector_event_symbols[i].alias);
+ if (n)
+ if (!strncmp(str, vector_event_symbols[i].alias, n))
+ return n;
+ return 0;
+}
+
static int
parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
{
@@ -296,6 +327,17 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
return 1;
}
}
+
+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++) {
+ n = check_vector_events(str, i);
+ if (n > 0) {
+ attr->type = vector_event_symbols[i].type;
+ attr->config = vector_event_symbols[i].config;
+ *strp = str + n;
+ return 1;
+ }
+ }
+
return 0;
}

@@ -420,6 +462,7 @@ static const char * const event_type_descriptors[] = {
"Software event",
"Tracepoint event",
"Hardware cache event",
+ "Hardware vector event",
};

/*
@@ -468,6 +511,18 @@ void print_events(void)
}

fprintf(stderr, "\n");
+ syms = vector_event_symbols;
+ type = syms->type;
+ for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++, syms++) {
+ if (strlen(syms->alias))
+ sprintf(name, "%s OR %s", syms->symbol, syms->alias);
+ else
+ strcpy(name, syms->symbol);
+ fprintf(stderr, " %-40s [%s]\n", name,
+ event_type_descriptors[type]);
+ }
+
+ fprintf(stderr, "\n");
fprintf(stderr, " %-40s [raw hardware event descriptor]\n",
"rNNN");
fprintf(stderr, "\n");
--
1.6.0.6


2009-07-02 09:46:52

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 2/2 -tip] perf_counter: Add generalized hardware interrupt support for AMD and Intel Corei7/Nehalem, Core2 and Atom


$ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

377 interrupts
53429936 int-mask-cycles
1119 int-pending-mask-cycles

0.371457539 seconds time elapsed

$ ./perf list shows interrupt events like :

interrupts OR interrupt [Hardware interrupt event]
int-mask-cycles OR masked [Hardware interrupt event]
int-pending-mask-cycles [Hardware interrupt event]

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 12 ++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 51 ++++++++++++++++++++++++++++++++++++
4 files changed, 114 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 48f28b7..43b24ad 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -388,6 +388,26 @@ static const u64 nehalem_hw_vector_event_ids[] =
[PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
};

+/*
+ * Generalized hw interrupt event table
+ */
+
+static u64 __read_mostly hw_interrupt_event_ids[PERF_COUNT_HW_INTERRUPT_MAX];
+
+static const u64 nehalem_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x011D, /* HW_INT.RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x021D, /* HW_INT.CYCLES_MASKED */
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x041D, /* HW_INT.CYCLES_PENDING_AND_MASKED*/
+};
+
+static const u64 core2_atom_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00C8, /* HW_INT_RCV */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x01C6, /* CYCLES_INT_MASKED.CYCLES_INT_MASKED*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x02C6, /* CYCLES_INT_MASKED.CYCLES_INT_PENDING_AND_MASKED*/
+};
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -508,6 +528,14 @@ static const u64 amd_hw_vector_event_ids[] =
|SSE & SSE2) Instructions */
};

+
+static const u64 amd_hw_interrupt_event_ids[] =
+{
+ [PERF_COUNT_HW_INTERRUPT] = 0x00CF, /* Interrupts Taken */
+ [PERF_COUNT_HW_INTERRUPT_MASK] = 0x00CD, /* Interrupts-Masked Cycles*/
+ [PERF_COUNT_HW_INTERRUPT_PENDING_MASK]= 0x00CE, /* Int Mask+Pending Cycles */
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -697,6 +725,17 @@ set_hw_vector_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_interrupt_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_INTERRUPT_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_interrupt_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -757,6 +796,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_VECTOR)
return set_hw_vector_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_INTERRUPT)
+ return set_hw_interrupt_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1478,6 +1520,8 @@ static int intel_pmu_init(void)
case 29: /* six-core 45 nm xeon "Dunnington" */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Core2 events, ");
break;
@@ -1487,12 +1531,16 @@ static int intel_pmu_init(void)
sizeof(hw_cache_event_ids));
memcpy(hw_vector_event_ids, nehalem_hw_vector_event_ids,
sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, nehalem_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Nehalem/Corei7 events, ");
break;
case 28:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_interrupt_event_ids, core2_atom_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

pr_cont("Atom events, ");
break;
@@ -1513,6 +1561,8 @@ static int amd_pmu_init(void)
sizeof(hw_cache_event_ids));
memcpy(hw_vector_event_ids, amd_hw_vector_event_ids,
sizeof(hw_vector_event_ids));
+ memcpy(hw_interrupt_event_ids, amd_hw_interrupt_event_ids,
+ sizeof(hw_interrupt_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e91b712..a53081b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -32,6 +32,7 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_HW_VECTOR = 5,
+ PERF_TYPE_HW_INTERRUPT = 6,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -104,6 +105,17 @@ enum perf_hw_vector_id {
};

/*
+ * Generalized hardware interrupt counters:
+ */
+enum perf_hw_interrupt_id {
+ PERF_COUNT_HW_INTERRUPT = 0,
+ PERF_COUNT_HW_INTERRUPT_MASK = 1,
+ PERF_COUNT_HW_INTERRUPT_PENDING_MASK = 2,
+
+ PERF_COUNT_HW_INTERRUPT_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index dd3848a..7a529a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3839,6 +3839,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
case PERF_TYPE_HW_VECTOR:
+ case PERF_TYPE_HW_INTERRUPT:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 8213dfb..d085b8f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -51,6 +51,14 @@ static struct event_symbol vector_event_symbols[] = {
{ CHVECTOR(OPS), "vec-ops", "vec-operations"},
};

+#define CHINT(x) .type = PERF_TYPE_HW_INTERRUPT, .config = PERF_COUNT_HW_##x
+
+static struct event_symbol interrupt_event_symbols[] = {
+ { CHINT(INTERRUPT), "interrupts", "interrupt" },
+ { CHINT(INTERRUPT_MASK), "int-mask-cycles", "masked" },
+ { CHINT(INTERRUPT_PENDING_MASK),"int-pending-mask-cycles", "" },
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -188,6 +196,11 @@ char *event_name(int counter)
return vector_event_symbols[config].symbol;
return "unknown-vector";

+ case PERF_TYPE_HW_INTERRUPT:
+ if (config < PERF_COUNT_HW_INTERRUPT_MAX)
+ return interrupt_event_symbols[config].symbol;
+ return "unknown-interrupt";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -311,6 +324,21 @@ static int check_vector_events(const char *str, unsigned int i)
return 0;
}

+static int check_interrupt_events(const char *str, unsigned int i)
+{
+ int n;
+
+ n = strlen(interrupt_event_symbols[i].symbol);
+ if (!strncmp(str, interrupt_event_symbols[i].symbol, n))
+ return n;
+
+ n = strlen(interrupt_event_symbols[i].alias);
+ if (n)
+ if (!strncmp(str, interrupt_event_symbols[i].alias, n))
+ return n;
+ return 0;
+}
+
static int
parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
{
@@ -338,6 +366,16 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
}
}

+ for (i = 0; i < ARRAY_SIZE(interrupt_event_symbols); i++) {
+ n = check_interrupt_events(str, i);
+ if (n > 0) {
+ attr->type = interrupt_event_symbols[i].type;
+ attr->config = interrupt_event_symbols[i].config;
+ *strp = str + n;
+ return 1;
+ }
+ }
+
return 0;
}

@@ -463,6 +501,7 @@ static const char * const event_type_descriptors[] = {
"Tracepoint event",
"Hardware cache event",
"Hardware vector event",
+ "Hardware interrupt event",
};

/*
@@ -523,6 +562,18 @@ void print_events(void)
}

fprintf(stderr, "\n");
+ syms = interrupt_event_symbols;
+ type = syms->type;
+ for (i = 0; i < ARRAY_SIZE(interrupt_event_symbols); i++, syms++) {
+ if (strlen(syms->alias))
+ sprintf(name, "%s OR %s", syms->symbol, syms->alias);
+ else
+ strcpy(name, syms->symbol);
+ fprintf(stderr, " %-40s [%s]\n", name,
+ event_type_descriptors[type]);
+ }
+
+ fprintf(stderr, "\n");
fprintf(stderr, " %-40s [raw hardware event descriptor]\n",
"rNNN");
fprintf(stderr, "\n");
--
1.6.0.6


2009-07-03 07:39:35

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

Hello Ingo,

On Thu, 2009-07-02 at 15:14 +0530, Jaswinder Singh Rajput wrote:
> This output is from AMD box:
>
> $ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- ls -lR /usr/include/ > /dev/null
>
> Performance counter stats for 'ls -lR /usr/include/':
>
> 4218 vec-adds (scaled from 66.60%)
> 7426 vec-muls (scaled from 66.67%)
> 5441 vec-divs (scaled from 66.29%)
> 821982187 vec-idle-cycles (scaled from 66.45%)
> 2681 vec-stall-cycles (scaled from 67.11%)
> 7887 vec-ops (scaled from 66.88%)
>
> 0.417614573 seconds time elapsed
>
> $ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
>
> Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
>
> 17552264 vec-adds (scaled from 66.28%)
> 19715258 vec-muls (scaled from 66.63%)
> 15862733 vec-divs (scaled from 66.82%)
> 23735187095 vec-idle-cycles (scaled from 66.89%)
> 11353159 vec-stall-cycles (scaled from 66.90%)
> 36628571 vec-ops (scaled from 66.48%)
>
> 298.350012843 seconds time elapsed
>
> $ ./perf stat -e add -e multiply -e divide -e vec-idle-cycles -e vec-stall-cycles -e vec-ops -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
>
> Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
>
> 20177177044 vec-adds (scaled from 66.63%)
> 34101687027 vec-muls (scaled from 66.64%)
> 3984060862 vec-divs (scaled from 66.71%)
> 26349684710 vec-idle-cycles (scaled from 66.65%)
> 9052001905 vec-stall-cycles (scaled from 66.66%)
> 76440734242 vec-ops (scaled from 66.71%)
>
> 272.523058097 seconds time elapsed
>
> $ ./perf list shows vector events like :
>
> vec-adds OR add [Hardware vector event]
> vec-muls OR multiply [Hardware vector event]
> vec-divs OR divide [Hardware vector event]
> vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
> vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
> vec-ops OR vec-operations [Hardware vector event]
>
> Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_counter.c | 45 +++++++++++++++++++++++++++++
> include/linux/perf_counter.h | 15 ++++++++++
> kernel/perf_counter.c | 1 +
> tools/perf/util/parse-events.c | 55 ++++++++++++++++++++++++++++++++++++
> 4 files changed, 116 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> index 36c3dc7..48f28b7 100644
> --- a/arch/x86/kernel/cpu/perf_counter.c
> +++ b/arch/x86/kernel/cpu/perf_counter.c
> @@ -372,6 +372,22 @@ static const u64 atom_hw_cache_event_ids
> },
> };
>
> +/*
> + * Generalized hw vectored co-processor event table
> + */
> +
> +static u64 __read_mostly hw_vector_event_ids[PERF_COUNT_HW_VECTOR_MAX];
> +
> +static const u64 nehalem_hw_vector_event_ids[] =
> +{
> + [PERF_COUNT_HW_VECTOR_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
> + [PERF_COUNT_HW_VECTOR_MULTIPLY] = 0x0214, /* ARITH.MUL */
> + [PERF_COUNT_HW_VECTOR_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
> + [PERF_COUNT_HW_VECTOR_IDLE_CYCLES] = 0x0,
> + [PERF_COUNT_HW_VECTOR_STALL_CYCLES] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
> + [PERF_COUNT_HW_VECTOR_OPS] = 0x0710, /* FP_COMP_OPS_EXE.X87|MMX|SSE_FP*/
> +};
> +

Have you tested this patch on Intel Corei7/Nehalem.

Thanks,
--
JSR

http://userweb.kernel.org/~jaswinder/


2009-07-03 08:18:23

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 2/6 -tip] perf stat: treat same behaviour for all CYCLES and CLOCKS

Ingo Molnar writes:

> Other 'compound' events might be possible too: for example a new
> cache-hits field could be is cache-refs minus cache-misses.

Hmmm, on the MPC7450 family there are events for cache-hits and
cache-misses, so there it would be nice to be able to ask for
cache-refs and have it compute cache-hits plus cache-misses.

> I.e. the simplest model for 'compound' events would be:
>
> X = A / B
> X = A - B
> X = A + B
>
> We could list them in the event table, with a flag that specifies
> which arithmetic operation connects two 'atomic' counters.
>
> Then the adding of a new compound event would only be the matter of
> adding one more line to the event table.

Sounds nice. If we do this we should ensure that the two events
get put into one group if possible.

Paul.

2009-07-03 08:28:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/6 -tip] perf stat: treat same behaviour for all CYCLES and CLOCKS


* Paul Mackerras <[email protected]> wrote:

> Ingo Molnar writes:
>
> > Other 'compound' events might be possible too: for example a new
> > cache-hits field could be is cache-refs minus cache-misses.
>
> Hmmm, on the MPC7450 family there are events for cache-hits and
> cache-misses, so there it would be nice to be able to ask for
> cache-refs and have it compute cache-hits plus cache-misses.

Yes. I think the API is structured enough so that user-space knows
enough about the meaning of the events here. We can certainly
stipulate this rule:

refs == hits + misses

And if the kernel returns -ENODEV for a particular component
user-space can fall back using the other two events.

I.e. this would allow transparent support for all 3 permutations:

hw has refs and hits
hw has refs and misses
hw has hits and misses

For sampling it's a tiny bit tricky but still doable: a compound
counter could still sample because we handle weighted samples
throughout the tools and negative weights can be subtracted.

Intuitive annotation output would have to be thought out for this as
entries/function could go negative statistically.

> > I.e. the simplest model for 'compound' events would be:
> >
> > X = A / B
> > X = A - B
> > X = A + B
> >
> > We could list them in the event table, with a flag that
> > specifies which arithmetic operation connects two 'atomic'
> > counters.
> >
> > Then the adding of a new compound event would only be the matter
> > of adding one more line to the event table.
>
> Sounds nice. If we do this we should ensure that the two events
> get put into one group if possible.

Correct. Are you interested in adding this, so that it fits the
MPC7450 family perfectly?

Ingo

2009-07-03 09:31:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem


* Jaswinder Singh Rajput <[email protected]> wrote:

> > $ ./perf list shows vector events like :
> >
> > vec-adds OR add [Hardware vector event]
> > vec-muls OR multiply [Hardware vector event]
> > vec-divs OR divide [Hardware vector event]
> > vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
> > vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
> > vec-ops OR vec-operations [Hardware vector event]

btw., why does this printout SHOUT the 'or'? It's certainly not an
important piece of information. Something like:

vec-adds | add [Hardware vector event]
vec-muls | multiply [Hardware vector event]
vec-divs | divide [Hardware vector event]
vec-idle-cycles | vec-empty-cycles [Hardware vector event]
vec-stall-cycles | vec-busy-cycles [Hardware vector event]
vec-ops | vec-operations [Hardware vector event]

looks better on all levels.

Ingo

2009-07-03 10:12:03

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Fri, 2009-07-03 at 11:30 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > > $ ./perf list shows vector events like :
> > >
> > > vec-adds OR add [Hardware vector event]
> > > vec-muls OR multiply [Hardware vector event]
> > > vec-divs OR divide [Hardware vector event]
> > > vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
> > > vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
> > > vec-ops OR vec-operations [Hardware vector event]
>
> btw., why does this printout SHOUT the 'or'? It's certainly not an
> important piece of information. Something like:
>
> vec-adds | add [Hardware vector event]
> vec-muls | multiply [Hardware vector event]
> vec-divs | divide [Hardware vector event]
> vec-idle-cycles | vec-empty-cycles [Hardware vector event]
> vec-stall-cycles | vec-busy-cycles [Hardware vector event]
> vec-ops | vec-operations [Hardware vector event]
>
> looks better on all levels.
>

'OR' is also used for other events.
If this is the only issue, I request you to accept these 2 patches.

I will send incremental patch which will fix these 'OR's and also avoid
duplicating of these functions.

Thanks,
--
JSR

http://userweb.kernel.org/~jaswinder/

2009-07-03 10:30:26

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem


* Jaswinder Singh Rajput <[email protected]> wrote:

> Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
>
> 17552264 vec-adds (scaled from 66.28%)
> 19715258 vec-muls (scaled from 66.63%)
> 15862733 vec-divs (scaled from 66.82%)
> 23735187095 vec-idle-cycles (scaled from 66.89%)
> 11353159 vec-stall-cycles (scaled from 66.90%)
> 36628571 vec-ops (scaled from 66.48%)

Is stall-cycles equivalent to busy-cycles? I.e. do we have this
general relationship to the cycle event:

cycles = vec-stall-cycles + vec-idle-cycles

?

Ingo

2009-07-03 10:33:39

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 2/2 -tip] perf_counter: Add generalized hardware interrupt support for AMD and Intel Corei7/Nehalem, Core2 and Atom


* Jaswinder Singh Rajput <[email protected]> wrote:

> $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
>
> Performance counter stats for 'ls -lR /usr/include/':
>
> 377 interrupts
> 53429936 int-mask-cycles
> 1119 int-pending-mask-cycles

What's your take on my review feedback:

> We could and should probably add a software counter for hardirqs
> as well. That would allow the vector/irqnr information to be
> passed in, and it would allow architectures without irq metrics in
> the PMU to have this counter too.
>
> This way we could profile based on a specific interrupt source
> only - say based on the networking card.

Why did you resend the patch while there was still unanswered review
feedback?

Ingo

2009-07-03 11:56:00

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> >
> > 17552264 vec-adds (scaled from 66.28%)
> > 19715258 vec-muls (scaled from 66.63%)
> > 15862733 vec-divs (scaled from 66.82%)
> > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > 11353159 vec-stall-cycles (scaled from 66.90%)
> > 36628571 vec-ops (scaled from 66.48%)
>
> Is stall-cycles equivalent to busy-cycles?


hmm, normally we can use these terms interchangeably. But they can be
different sometimes.

busy means it is already executing some instructions so it will not take
another instruction.

stall can be busy (executing) or non-executing; maybe it is waiting for
some operands due to a cache miss.


> I.e. do we have this
> general relationship to the cycle event:
>
> cycles = vec-stall-cycles + vec-idle-cycles
>
> ?

This patch is already big enough, having 206 lines. Do you want
everything in this patch ;-)

Or we can do these things later on.

Thanks,
--
JSR

http://userweb.kernel.org/~jaswinder/

2009-07-03 12:03:00

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD

On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> >
> > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> >
> > Performance counter stats for 'ls -lR /usr/include/':
> >
> > 377 interrupts
> > 53429936 int-mask-cycles
> > 1119 int-pending-mask-cycles
> >
> > 0.371457539 seconds time elapsed
>
> Agreed, this is another useful generalization - and the 'cycles
> pending' metrics are not retrievable via any software means.
>
> > We could and should probably add a software counter for hardirqs as
> > well. That would allow the vector/irqnr information to be passed in,
> and it would allow architectures without irq metrics in the PMU to
> have this counter too.
>

Please let me know that addition of software counter will be in this
patch or we can do it incrementally after this patch.

Thanks,
--
JSR

http://userweb.kernel.org/~jaswinder/

2009-07-03 12:50:37

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Fri, 2009-07-03 at 17:25 +0530, Jaswinder Singh Rajput wrote:
> On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > >
> > > 17552264 vec-adds (scaled from 66.28%)
> > > 19715258 vec-muls (scaled from 66.63%)
> > > 15862733 vec-divs (scaled from 66.82%)
> > > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > > 11353159 vec-stall-cycles (scaled from 66.90%)
> > > 36628571 vec-ops (scaled from 66.48%)
> >
> > Is stall-cycles equivalent to busy-cycles?
>
>
> hmm, normally we can use these terms interchangeably. But they can be
> different some times.
>
> busy means it is already executing some instructions so it will not take
> another instruction.
>
> stall can be busy(executing) or non-executing may be it is waiting for
> some operands due to cache miss.
>
>
> > I.e. do we have this
> > general relationship to the cycle event:
> >
> > cycles = vec-stall-cycles + vec-idle-cycles
> >
> > ?

Like on AMD :

13390918485 vec-adds (scaled from 57.07%)
22465091289 vec-muls (scaled from 57.22%)
2643789384 vec-divs (scaled from 57.21%)
17922784596 vec-idle-cycles (scaled from 57.23%)
6402888606 vec-stall-cycles (scaled from 57.17%)
55823491597 cycles (scaled from 57.05%)
51035264218 vec-ops (scaled from 57.05%)

187.494664172 seconds time elapsed

vec-idle-cycles + vec-stall-cycles = 24325673202

so cycles = 2.29 * (vec-idle-cycles + vec-stall-cycles)

On AMD I used : EventSelect 0D7h Dispatch Stall for FPU Full
The number of processor cycles the decoder is stalled because the
scheduler for the Floating Point Unit is full. This condition can be
caused by a lack of parallelism in FP-intensive code, or by cache misses
on FP operand loads (which could also show up as EventSelect 0D8h
instead, depending on the nature of the instruction sequences). May
occur simultaneously with certain other stall conditions; see
EventSelect 0D1h

So stall is due to lack of parallelism and cache misses.
If we keep on increasing the size of FP units and cache, maybe at some
point we can get vec-stall-cycles = zero.

Thanks,
--
JSR

http://userweb.kernel.org/~jaswinder/

2009-07-03 13:26:37

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Fri, 2009-07-03 at 18:19 +0530, Jaswinder Singh Rajput wrote:
> On Fri, 2009-07-03 at 17:25 +0530, Jaswinder Singh Rajput wrote:
> > On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > >
> > > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > > >
> > > > 17552264 vec-adds (scaled from 66.28%)
> > > > 19715258 vec-muls (scaled from 66.63%)
> > > > 15862733 vec-divs (scaled from 66.82%)
> > > > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > > > 11353159 vec-stall-cycles (scaled from 66.90%)
> > > > 36628571 vec-ops (scaled from 66.48%)
> > >
> > > Is stall-cycles equivalent to busy-cycles?
> >
> >
> > hmm, normally we can use these terms interchangeably. But they can be
> > different some times.
> >
> > busy means it is already executing some instructions so it will not take
> > another instruction.
> >
> > stall can be busy(executing) or non-executing may be it is waiting for
> > some operands due to cache miss.
> >
> >
> > > I.e. do we have this
> > > general relationship to the cycle event:
> > >
> > > cycles = vec-stall-cycles + vec-idle-cycles
> > >
> > > ?
>
> Like on AMD :
>
> 13390918485 vec-adds (scaled from 57.07%)
> 22465091289 vec-muls (scaled from 57.22%)
> 2643789384 vec-divs (scaled from 57.21%)
> 17922784596 vec-idle-cycles (scaled from 57.23%)
> 6402888606 vec-stall-cycles (scaled from 57.17%)
> 55823491597 cycles (scaled from 57.05%)
> 51035264218 vec-ops (scaled from 57.05%)
>
> 187.494664172 seconds time elapsed
>
> vec-idle-cycles + vec-stall-cycles = 24325673202
>
> so cycles = 2.29 * (vec-idle-cycles + vec-stall-cycles)
>
> On AMD I used : EventSelect 0D7h Dispatch Stall for FPU Full
> The number of processor cycles the decoder is stalled because the
> scheduler for the Floating Point Unit is full. This condition can be
> caused by a lack of parallelism in FP-intensive code, or by cache misses
> on FP operand loads (which could also show up as EventSelect 0D8h
> instead, depending on the nature of the instruction sequences). May
> occur simultaneously with certain other stall conditions; see
> EventSelect 0D1h
>
> So stall is due to lack of parallelism and cache misses.
> If we keep on increasing the size of FP units and cache may at some
> point be we can get vec-stall-cycles = zero.
>

I mean, So stall is majorly due to lack of parallelism and cache misses.
If we keep on increasing the size of FP units and cache then stall time
will keep on decreasing (of course it will never be zero ;)

And same thing will be happen for Intel.

So stall is not equal to busy.

Please let me know what is next, should I remove busy term from alias.

Thanks,
--
JSR

2009-07-04 09:50:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > >
> > > 17552264 vec-adds (scaled from 66.28%)
> > > 19715258 vec-muls (scaled from 66.63%)
> > > 15862733 vec-divs (scaled from 66.82%)
> > > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > > 11353159 vec-stall-cycles (scaled from 66.90%)
> > > 36628571 vec-ops (scaled from 66.48%)
> >
> > Is stall-cycles equivalent to busy-cycles?
>
>
> hmm, normally we can use these terms interchangeably. But they can
> be different some times.
>
> busy means it is already executing some instructions so it will
> not take another instruction.
>
> stall can be busy(executing) or non-executing may be it is waiting
> for some operands due to cache miss.
>
>
> > I.e. do we have this
> > general relationship to the cycle event:
> >
> > cycles = vec-stall-cycles + vec-idle-cycles
> >
> > ?
>
> This patch is already big enough, having 206 lines. Do you want
> everything in this patch ;-)

The question i asked is whether the above relationship is true. You
can test this by displaying the 'cycles' metric too in your test,
alongside vec-stall-cycles and vec-idle-cycles. Do the numbers add
up?

Ingo

2009-07-04 10:04:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Fri, 2009-07-03 at 18:19 +0530, Jaswinder Singh Rajput wrote:
> > On Fri, 2009-07-03 at 17:25 +0530, Jaswinder Singh Rajput wrote:
> > > On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> > > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > > >
> > > > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > > > >
> > > > > 17552264 vec-adds (scaled from 66.28%)
> > > > > 19715258 vec-muls (scaled from 66.63%)
> > > > > 15862733 vec-divs (scaled from 66.82%)
> > > > > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > > > > 11353159 vec-stall-cycles (scaled from 66.90%)
> > > > > 36628571 vec-ops (scaled from 66.48%)
> > > >
> > > > Is stall-cycles equivalent to busy-cycles?
> > >
> > >
> > > hmm, normally we can use these terms interchangeably. But they can be
> > > different some times.
> > >
> > > busy means it is already executing some instructions so it will not take
> > > another instruction.
> > >
> > > stall can be busy(executing) or non-executing may be it is waiting for
> > > some operands due to cache miss.
> > >
> > >
> > > > I.e. do we have this
> > > > general relationship to the cycle event:
> > > >
> > > > cycles = vec-stall-cycles + vec-idle-cycles
> > > >
> > > > ?
> >
> > Like on AMD :
> >
> > 13390918485 vec-adds (scaled from 57.07%)
> > 22465091289 vec-muls (scaled from 57.22%)
> > 2643789384 vec-divs (scaled from 57.21%)
> > 17922784596 vec-idle-cycles (scaled from 57.23%)
> > 6402888606 vec-stall-cycles (scaled from 57.17%)
> > 55823491597 cycles (scaled from 57.05%)
> > 51035264218 vec-ops (scaled from 57.05%)
> >
> > 187.494664172 seconds time elapsed
> >
> > vec-idle-cycles + vec-stall-cycles = 24325673202
> >
> > so cycles = 2.29 * (vec-idle-cycles + vec-stall-cycles)

that equation is entirely bogus.

> >
> > On AMD I used : EventSelect 0D7h Dispatch Stall for FPU Full The
> > number of processor cycles the decoder is stalled because the
> > scheduler for the Floating Point Unit is full. This condition
> > can be caused by a lack of parallelism in FP-intensive code, or
> > by cache misses on FP operand loads (which could also show up as
> > EventSelect 0D8h instead, depending on the nature of the
> > instruction sequences). May occur simultaneously with certain
> > other stall conditions; see EventSelect 0D1h
> >
> > So stall is due to lack of parallelism and cache misses. If we
> > keep on increasing the size of FP units and cache may at some
> > point be we can get vec-stall-cycles = zero.
> >
>
> I mean, So stall is majorly due to lack of parallelism and cache
> misses. If we keep on increasing the size of FP units and cache
> then stall time will keep on decreasing (ofcourse it will be never
> Zero ;)
>
> And same thing will be happen for Intel.
>
> So stall is not equal to busy.
>
> Please let me know what is next, should I remove busy term from
> alias.

What is needed is for you to understand these events and provide a
generalization around them that makes sense. Or to declare it
honestly when you dont.

The numbers simply dont add up:

> > 13390918485 vec-adds (scaled from 57.07%)
> > 22465091289 vec-muls (scaled from 57.22%)
> > 2643789384 vec-divs (scaled from 57.21%)
> > 17922784596 vec-idle-cycles (scaled from 57.23%)
> > 6402888606 vec-stall-cycles (scaled from 57.17%)
> > 55823491597 cycles (scaled from 57.05%)
> > 51035264218 vec-ops (scaled from 57.05%)

vec-idle-cycles + vec-stall-cycles does not add up to cycles -
because a stall is not an 'interchangeable' term with 'busy' as you
claimed before, but a special state of the pipeline, a subset of
busy.

I prefer to apply patches from people who understand what they are
doing - and more importantly, who express and declare their own
limits properly when they _dont_ understand something and are
guessing.

Frankly, your patches dont give me this impression and you are also
babbling way too much about things you clearly dont understand, and
thus you hinder the discussions with noise.

It's not bad at all to not understand something (we all are at
various stages of a big and constantly refreshing learning curves),
but it's very bad to pretend you understand something while you
clearly dont. What we need in lkml discussions is an honest laying
down of facts, opinions and doubts.

Why the heck didnt you say:

" I dont know much about PMUs or vector units yet, but I have found
these blurbs in the Intel and AMD docs and what do you think
about structuring these events the following way. Someone who
knows this stuff should review this first, it is quite likely
incomplete. "

Ingo

2009-07-04 10:23:25

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > >
> > > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> > >
> > > Performance counter stats for 'ls -lR /usr/include/':
> > >
> > > 377 interrupts
> > > 53429936 int-mask-cycles
> > > 1119 int-pending-mask-cycles
> > >
> > > 0.371457539 seconds time elapsed
> >
> > Agreed, this is another useful generalization - and the 'cycles
> > pending' metrics are not retrievable via any software means.
> >
> > We could and should probably add a software counter for hardirqs
> > as well. That would allow the vector/irqnr information to be
> > passed in, and it would allow architectures without irq metrics
> > in the PMU to have this counter too.
> >
>
> Please let me know that addition of software counter will be in
> this patch or we can do it incrementally after this patch.

It should be in this series. That way we can cross-check whether the
soft counts and the hard counts match up and find potential bugs
that way, etc.

Ingo

2009-07-04 13:55:28

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Sat, 2009-07-04 at 11:49 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > On Fri, 2009-07-03 at 12:29 +0200, Ingo Molnar wrote:
> > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > >
> > > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > > >
> > > > 17552264 vec-adds (scaled from 66.28%)
> > > > 19715258 vec-muls (scaled from 66.63%)
> > > > 15862733 vec-divs (scaled from 66.82%)
> > > > 23735187095 vec-idle-cycles (scaled from 66.89%)
> > > > 11353159 vec-stall-cycles (scaled from 66.90%)
> > > > 36628571 vec-ops (scaled from 66.48%)
> > >
> > > Is stall-cycles equivalent to busy-cycles?
> >
> >
> > hmm, normally we can use these terms interchangeably. But they can
> > be different some times.
> >
> > busy means it is already executing some instructions so it will
> > not take another instruction.
> >
> > stall can be busy (executing) or non-executing — maybe it is waiting
> > for some operands due to a cache miss.
> >
> >
> > > I.e. do we have this
> > > general relationship to the cycle event:
> > >
> > > cycles = vec-stall-cycles + vec-idle-cycles
> > >
> > > ?
> >
> > This patch is already big enough, having 206 lines. Do you want
> > everything in this patch ;-)
>
> The question i asked is whether the above relationship is true. You
> can test this by displaying the 'cycles' metric too in your test,
> alongside vec-stall-cycles and vec-idle-cycles. Do the numbers add
> up?
>

But I do not understand why you asked this relationship from me, you can
also do this on your side.

What is the point of blocking the patch and going off on another tangent?

I am totally confused: on one side you are saying this patch is useful
and on the other side you are not applying it.

Please let me know what the problem in this patch is so that I can fix
it, so that you can apply this patch and other people can start
benefiting from this feature.

Thanks,
--
JSR

2009-07-04 14:05:46

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem

On Sat, 2009-07-04 at 12:03 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > > > > I.e. do we have this
> > > > > general relationship to the cycle event:
> > > > >
> > > > > cycles = vec-stall-cycles + vec-idle-cycles
> > > > >
> > > > > ?
> > >
> > > Like on AMD :
> > >
> > > 13390918485 vec-adds (scaled from 57.07%)
> > > 22465091289 vec-muls (scaled from 57.22%)
> > > 2643789384 vec-divs (scaled from 57.21%)
> > > 17922784596 vec-idle-cycles (scaled from 57.23%)
> > > 6402888606 vec-stall-cycles (scaled from 57.17%)
> > > 55823491597 cycles (scaled from 57.05%)
> > > 51035264218 vec-ops (scaled from 57.05%)
> > >
> > > 187.494664172 seconds time elapsed
> > >
> > > vec-idle-cycles + vec-stall-cycles = 24325673202
> > >
> > > so cycles = 2.29 * (vec-idle-cycles + vec-stall-cycles)
>
> that equation is entirely bogus.
>

What is bogus? In this case this equation is true, and it depends on
each application.

> > >
> > > On AMD I used : EventSelect 0D7h Dispatch Stall for FPU Full The
> > > number of processor cycles the decoder is stalled because the
> > > scheduler for the Floating Point Unit is full. This condition
> > > can be caused by a lack of parallelism in FP-intensive code, or
> > > by cache misses on FP operand loads (which could also show up as
> > > EventSelect 0D8h instead, depending on the nature of the
> > > instruction sequences). May occur simultaneously with certain
> > > other stall conditions; see EventSelect 0D1h
> > >
> > > So stall is due to lack of parallelism and cache misses. If we
> > > keep on increasing the size of FP units and cache may at some
> > > point be we can get vec-stall-cycles = zero.
> > >
> >
> > I mean, So stall is majorly due to lack of parallelism and cache
> > misses. If we keep on increasing the size of FP units and cache
> > then stall time will keep on decreasing (ofcourse it will be never
> > Zero ;)
> >
> > And the same thing will happen for Intel.
> >
> > So stall is not equal to busy.
> >
> > Please let me know what is next, should I remove busy term from
> > alias.
>
> What is needed is for you to understand these events and provide a
> generalization around them that makes sense. Or to declare it
> honestly when you dont.
>

what ??

Tell me where the problem is — is there any problem in the patch?

> The numbers simply dont add up:
>
> > > 13390918485 vec-adds (scaled from 57.07%)
> > > 22465091289 vec-muls (scaled from 57.22%)
> > > 2643789384 vec-divs (scaled from 57.21%)
> > > 17922784596 vec-idle-cycles (scaled from 57.23%)
> > > 6402888606 vec-stall-cycles (scaled from 57.17%)
> > > 55823491597 cycles (scaled from 57.05%)
> > > 51035264218 vec-ops (scaled from 57.05%)
>
> vec-idle-cycles + vec-stall-cycles does not add up to cycles -
> because a stall is not an 'interchangeable' term with 'busy' as you
> claimed before, but a special state of the pipeline, a subset of
> busy.
>
> I prefer to apply patches from people who understand what they are
> doing - and more importantly, who express and declare their own
> limits properly when they _dont_ understand something and are
> guessing.
>

What is the problem in understanding? You raised the question, so you
were confused, not me. And you got the clear picture from my points and
you are still blaming me?


> Frankly, your patches dont give me this impression and you are also
> babbling way too much about things you clearly dont understand, and
> thus you hinder the discussions with noise.
>
> It's not bad at all to not understand something (we all are at
> various stages of a big and constantly refreshing learning curves),
> but it's very bad to pretend you understand something while you
> clearly dont. What we need in lkml discussions is an honest laying
> down of facts, opinions and doubts.
>
> Why the heck didnt you say:
>
> " I dont know much about PMUs or vector units yet, but I have found
> these blurbs in the Intel and AMD docs and what do you think
> about structuring these events the following way. Someone who
> knows this stuff should review this first, it is quite likely
> incomplete. "


Why should I say this? It's you who needs to say this.

I clearly understand why I came up with this patch.

Thanks,
--
JSR

2009-07-04 14:18:27

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD

On Sat, 2009-07-04 at 12:22 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > >
> > > >
> > > > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> > > >
> > > > Performance counter stats for 'ls -lR /usr/include/':
> > > >
> > > > 377 interrupts
> > > > 53429936 int-mask-cycles
> > > > 1119 int-pending-mask-cycles
> > > >
> > > > 0.371457539 seconds time elapsed
> > >
> > > Agreed, this is another useful generalization - and the 'cycles
> > > pending' metrics are not retrievable via any software means.
> > >
> > > We could and should probably add a software counter for hardirqs
> > > as well. That would allow the vector/irqnr information to be
> > > passed in, and it would allow architectures without irq metrics
> > > in the PMU to have this counter too.
> > >
> >
> > Please let me know that addition of software counter will be in
> > this patch or we can do it incrementally after this patch.
>
> It should be in this series. That way we can cross-check whether the
> soft counts and the hard counts match up and find potential bugs
> that way, etc.
>

You want to cross check performance counter events ?

Why did you choose interrupt events? Why did you not raise this point
when cache events were added?

I do not understand why you keep going off on tangents.

If you want to cross-check then it should be in a different patch; there
should be no requirement to have this in this series and no point in
blocking this patch on an irrelevant argument.

Only thing I can do is to fix the patch, if you point any problem in
this.

Thanks,
--
JSR

2009-07-05 01:12:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Sat, 2009-07-04 at 12:22 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> > > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > > >
> > > > >
> > > > > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> > > > >
> > > > > Performance counter stats for 'ls -lR /usr/include/':
> > > > >
> > > > > 377 interrupts
> > > > > 53429936 int-mask-cycles
> > > > > 1119 int-pending-mask-cycles
> > > > >
> > > > > 0.371457539 seconds time elapsed
> > > >
> > > > Agreed, this is another useful generalization - and the 'cycles
> > > > pending' metrics are not retrievable via any software means.
> > > >
> > > > We could and should probably add a software counter for hardirqs
> > > > as wel. That would allow the vector/irqnr information to be
> > > > passed in, and it would allow architectures without irq metrics
> > > > in the PMU to have this counter too.
> > > >
> > >
> > > Please let me know that addition of software counter will be
> > > in this patch or we can do it incrementally after this patch.
> >
> > It should be in this series. That way we can cross-check whether
> > the soft counts and the hard counts match up and find potential
> > bugs that way, etc.
> >
>
> You want to cross check performance counter events ?

Yes. The events are also more complete if we add per IRQ source
counts as well, not just summary counts.

Ingo

2009-07-05 04:30:27

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD

On Sun, 2009-07-05 at 03:11 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > On Sat, 2009-07-04 at 12:22 +0200, Ingo Molnar wrote:
> > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > >
> > > > On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> > > > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > > > >
> > > > > >
> > > > > > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> > > > > >
> > > > > > Performance counter stats for 'ls -lR /usr/include/':
> > > > > >
> > > > > > 377 interrupts
> > > > > > 53429936 int-mask-cycles
> > > > > > 1119 int-pending-mask-cycles
> > > > > >
> > > > > > 0.371457539 seconds time elapsed
> > > > >
> > > > > Agreed, this is another useful generalization - and the 'cycles
> > > > > pending' metrics are not retrievable via any software means.
> > > > >
> > > > > We could and should probably add a software counter for hardirqs
> > > > > as well. That would allow the vector/irqnr information to be
> > > > > passed in, and it would allow architectures without irq metrics
> > > > > in the PMU to have this counter too.
> > > > >
> > > >
> > > > Please let me know that addition of software counter will be
> > > > in this patch or we can do it incrementally after this patch.
> > >
> > > It should be in this series. That way we can cross-check whether
> > > the soft counts and the hard counts match up and find potential
> > > bugs that way, etc.
> > >
> >
> > You want to cross check performance counter events ?
>
> Yes. The events are also more complete if we add per IRQ source
> counts as well, not just summary counts.
>

If you ask me about 'complete', I will say :
"No-one is 'complete' except God".

Let me know what you mean by 'complete' and 'more complete'.

This is a hardware performance interrupt event patch.
If you want to add IRQ source, of course you can add it in another
patch, it is a never ending task.

I do not understand why you behave like this :

1. Is today the last day of creation?
2. Or will you not collect any further patches?

Of course the answer is "no", so what is the problem with you?

Stop this completeness madness. You will never be complete, at least in
this life, no matter what you do.

Thanks,
--
JSR

2009-07-05 08:06:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Sun, 2009-07-05 at 03:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > On Sat, 2009-07-04 at 12:22 +0200, Ingo Molnar wrote:
> > > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > > >
> > > > > On Wed, 2009-07-01 at 13:24 +0200, Ingo Molnar wrote:
> > > > > > * Jaswinder Singh Rajput <[email protected]> wrote:
> > > > > >
> > > > > > >
> > > > > > > $ ./perf stat -e interrupts -e masked -e int-pending-mask-cycles -- ls -lR /usr/include/ > /dev/null
> > > > > > >
> > > > > > > Performance counter stats for 'ls -lR /usr/include/':
> > > > > > >
> > > > > > > 377 interrupts
> > > > > > > 53429936 int-mask-cycles
> > > > > > > 1119 int-pending-mask-cycles
> > > > > > >
> > > > > > > 0.371457539 seconds time elapsed
> > > > > >
> > > > > > Agreed, this is another useful generalization - and the 'cycles
> > > > > > pending' metrics are not retrievable via any software means.
> > > > > >
> > > > > > We could and should probably add a software counter for hardirqs
> > > > > > as well. That would allow the vector/irqnr information to be
> > > > > > passed in, and it would allow architectures without irq metrics
> > > > > > in the PMU to have this counter too.
> > > > > >
> > > > >
> > > > > Please let me know that addition of software counter will be
> > > > > in this patch or we can do it incrementally after this patch.
> > > >
> > > > It should be in this series. That way we can cross-check whether
> > > > the soft counts and the hard counts match up and find potential
> > > > bugs that way, etc.
> > > >
> > >
> > > You want to cross check performance counter events ?
> >
> > Yes. The events are also more complete if we add per IRQ source
> > counts as well, not just summary counts.
>
> If you ask me about 'complete', I will say : "No-one is 'complete'
> except God".
>
> Let me know what you mean by 'complete' and 'more complete'.
>
> This is a hardware performance interrupt event patch. If you want
> to add IRQ source, of course you can add it in another patch, it
> is a never ending task.
>
> I do not understand why you behave like this :
>
> 1. Is today the last day of the creation.
> 2. Or you will not collect any further patches.
>
> Of course answer is "no" then what is the problem with you.
>
> Stop this complete-ness madness. You will never complete atleast
> in this life no matter what you will do.

I'm simply not going to apply patches from you for what i consider a
half-done feature.

Ingo

2009-07-05 09:02:50

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD

On Sun, 2009-07-05 at 10:04 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>

> I'm simply not going to apply patches from you for what i consider a
> half-done feature.
>

This is not half-done. There are only 3 hardware interrupt performance
counter events in Intel and AMD. And I supported all of them.

I also supported all relevant Intel models and all AMD models.

You are requesting for software counter for hardirqs, I have no problem
to support it, I have also plan to add exceptions through software
counters, but again it will be different patch. And there is no point of
blocking this patch, as this will never change even if you add software
counters.

And you are not even telling me the problem in this patch, but you want
to add more stuff, which is independent of this.

So it is time to reconsider your consideration.

Thanks,
--
JSR

2009-07-05 09:56:54

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH 4/6 -tip] perf_counter: Add Generalized Hardware interrupt support for AMD

On Sun, 2009-07-05 at 10:04 +0200, Ingo Molnar wrote:

> I'm simply not going to apply patches from you for what i consider a
> half-done feature.
>

OK, can you suggest how the output should look so that I can start
preparing the hardirq patch?

Thanks,
--
JSR

2009-07-04 09:51:02

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 3/3 -tip] perf list: avoid replicating functions


* Jaswinder Singh Rajput <[email protected]> wrote:

> On Fri, 2009-07-03 at 11:30 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > > $ ./perf list shows vector events like :
> > > >
> > > > vec-adds OR add [Hardware vector event]
> > > > vec-muls OR multiply [Hardware vector event]
> > > > vec-divs OR divide [Hardware vector event]
> > > > vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
> > > > vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
> > > > vec-ops OR vec-operations [Hardware vector event]
> >
> > btw., why does this printout SHOUT the 'or'? It's certainly not an
> > important piece of information. Something like:
> >
> > vec-adds | add [Hardware vector event]
> > vec-muls | multiply [Hardware vector event]
> > vec-divs | divide [Hardware vector event]
> > vec-idle-cycles | vec-empty-cycles [Hardware vector event]
> > vec-stall-cycles | vec-busy-cycles [Hardware vector event]
> > vec-ops | vec-operations [Hardware vector event]
> >
> > looks better on all levels.
> >
>
> I prepared this patch incrementally on :
> [PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem
> [PATCH 2/2 -tip] perf_counter: Add generalized hardware interrupt support for AMD and Intel Corei7/Nehalem, Core2 and Atom
>
> [PATCH 3/3] perf list: avoid replicating functions
>
> vector and interrupt can use same function made for hardware and
> software generic events.
>
> Also replaced 'OR' with '|'

Please submit a clean series of patches instead of a mixture of
patches plus fixes to patches.

Ingo

2009-07-03 12:18:08

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH 3/3 -tip] perf list: avoid replicating functions

On Fri, 2009-07-03 at 11:30 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > > $ ./perf list shows vector events like :
> > >
> > > vec-adds OR add [Hardware vector event]
> > > vec-muls OR multiply [Hardware vector event]
> > > vec-divs OR divide [Hardware vector event]
> > > vec-idle-cycles OR vec-empty-cycles [Hardware vector event]
> > > vec-stall-cycles OR vec-busy-cycles [Hardware vector event]
> > > vec-ops OR vec-operations [Hardware vector event]
>
> btw., why does this printout SHOUT the 'or'? It's certainly not an
> important piece of information. Something like:
>
> vec-adds | add [Hardware vector event]
> vec-muls | multiply [Hardware vector event]
> vec-divs | divide [Hardware vector event]
> vec-idle-cycles | vec-empty-cycles [Hardware vector event]
> vec-stall-cycles | vec-busy-cycles [Hardware vector event]
> vec-ops | vec-operations [Hardware vector event]
>
> looks better on all levels.
>

I prepared this patch incrementally on :
[PATCH 1/2 -tip] perf_counter: Add generalized hardware vectored co-processor support for AMD and Intel Corei7/Nehalem
[PATCH 2/2 -tip] perf_counter: Add generalized hardware interrupt support for AMD and Intel Corei7/Nehalem, Core2 and Atom

[PATCH 3/3] perf list: avoid replicating functions

vector and interrupt can use same function made for hardware and software generic events.

Also replaced 'OR' with '|'

$ ./perf list

List of pre-defined events (to be used in -e):

cpu-cycles | cycles [Hardware event]
instructions [Hardware event]
cache-references [Hardware event]
cache-misses [Hardware event]
branch-instructions | branches [Hardware event]
branch-misses [Hardware event]
bus-cycles [Hardware event]

cpu-clock [Software event]
task-clock [Software event]
page-faults | faults [Software event]
minor-faults [Software event]
major-faults [Software event]
context-switches | cs [Software event]
cpu-migrations | migrations [Software event]

L1-d$-loads [Hardware cache event]
L1-d$-load-misses [Hardware cache event]
L1-d$-stores [Hardware cache event]
L1-d$-store-misses [Hardware cache event]
L1-d$-prefetches [Hardware cache event]
L1-d$-prefetch-misses [Hardware cache event]
L1-i$-loads [Hardware cache event]
L1-i$-load-misses [Hardware cache event]
L1-i$-prefetches [Hardware cache event]
L1-i$-prefetch-misses [Hardware cache event]
LLC-loads [Hardware cache event]
LLC-load-misses [Hardware cache event]
LLC-stores [Hardware cache event]
LLC-store-misses [Hardware cache event]
LLC-prefetches [Hardware cache event]
LLC-prefetch-misses [Hardware cache event]
dTLB-loads [Hardware cache event]
dTLB-load-misses [Hardware cache event]
dTLB-stores [Hardware cache event]
dTLB-store-misses [Hardware cache event]
dTLB-prefetches [Hardware cache event]
dTLB-prefetch-misses [Hardware cache event]
iTLB-loads [Hardware cache event]
iTLB-load-misses [Hardware cache event]
branch-loads [Hardware cache event]
branch-load-misses [Hardware cache event]

vec-adds | add [Hardware vector event]
vec-muls | multiply [Hardware vector event]
vec-divs | divide [Hardware vector event]
vec-idle-cycles | vec-empty-cycles [Hardware vector event]
vec-stall-cycles | vec-busy-cycles [Hardware vector event]
vec-ops | vec-operations [Hardware vector event]

interrupts | interrupt [Hardware interrupt event]
int-mask-cycles | masked [Hardware interrupt event]
int-pending-mask-cycles [Hardware interrupt event]

rNNN [Hardware raw event]

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
tools/perf/util/parse-events.c | 83 +++++++++++++++++-----------------------
1 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d085b8f..c2a7dc2 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -494,46 +494,51 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
return 0;
}

-static const char * const event_type_descriptors[] = {
- "",
+static const char *event_types[PERF_TYPE_MAX] = {
"Hardware event",
"Software event",
"Tracepoint event",
"Hardware cache event",
+ "Hardware raw event",
"Hardware vector event",
"Hardware interrupt event",
};

-/*
- * Print the help text for the event symbols:
- */
-void print_events(void)
+static void print_desc(struct event_symbol *syms, unsigned int size)
{
- struct event_symbol *syms = event_symbols;
- unsigned int i, type, op, prev_type = -1;
+ unsigned int i, type, prev_type = -1;
char name[40];

- fprintf(stderr, "\n");
- fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
-
- for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
- type = syms->type + 1;
- if (type > ARRAY_SIZE(event_type_descriptors))
- type = 0;
-
- if (type != prev_type)
+ for (i = 0; i < size; i++, syms++) {
+ type = syms->type;
+ if (type != prev_type) {
+ prev_type = type;
fprintf(stderr, "\n");
+ }

if (strlen(syms->alias))
- sprintf(name, "%s OR %s", syms->symbol, syms->alias);
+ sprintf(name, "%s | %s", syms->symbol, syms->alias);
else
strcpy(name, syms->symbol);
- fprintf(stderr, " %-40s [%s]\n", name,
- event_type_descriptors[type]);

- prev_type = type;
+ fprintf(stderr, " %-40s [%s]\n", name, event_types[type]);
}
+}
+
+/*
+ * Print the help text for the event symbols:
+ */
+void print_events(void)
+{
+ unsigned int type, op, r;
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "List of pre-defined events (to be used in -e):\n");

+ /* List hardware and software event descriptors */
+ print_desc(event_symbols, ARRAY_SIZE(event_symbols));
+
+ /* List hardware cache event descriptors */
fprintf(stderr, "\n");
for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
@@ -541,41 +546,23 @@ void print_events(void)
if (!is_cache_op_valid(type, op))
continue;

- for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
+ for (r = 0; r < PERF_COUNT_HW_CACHE_RESULT_MAX; r++) {
fprintf(stderr, " %-40s [%s]\n",
- event_cache_name(type, op, i),
- event_type_descriptors[4]);
+ event_cache_name(type, op, r),
+ event_types[PERF_TYPE_HW_CACHE]);
}
}
}

- fprintf(stderr, "\n");
- syms = vector_event_symbols;
- type = syms->type;
- for (i = 0; i < ARRAY_SIZE(vector_event_symbols); i++, syms++) {
- if (strlen(syms->alias))
- sprintf(name, "%s OR %s", syms->symbol, syms->alias);
- else
- strcpy(name, syms->symbol);
- fprintf(stderr, " %-40s [%s]\n", name,
- event_type_descriptors[type]);
- }
+ /* List hardware vectored co-processor event descriptors */
+ print_desc(vector_event_symbols, ARRAY_SIZE(vector_event_symbols));

- fprintf(stderr, "\n");
- syms = interrupt_event_symbols;
- type = syms->type;
- for (i = 0; i < ARRAY_SIZE(interrupt_event_symbols); i++, syms++) {
- if (strlen(syms->alias))
- sprintf(name, "%s OR %s", syms->symbol, syms->alias);
- else
- strcpy(name, syms->symbol);
- fprintf(stderr, " %-40s [%s]\n", name,
- event_type_descriptors[type]);
- }
+ /* List hardware interrupt event descriptors */
+ print_desc(interrupt_event_symbols, ARRAY_SIZE(interrupt_event_symbols));

+ /* List hardware raw event descriptors */
fprintf(stderr, "\n");
- fprintf(stderr, " %-40s [raw hardware event descriptor]\n",
- "rNNN");
+ fprintf(stderr, " %-40s [%s]\n", "rNNN", event_types[PERF_TYPE_RAW]);
fprintf(stderr, "\n");

exit(129);
--
1.6.2.5