Added more events; now it looks like this on an AMD box:
./perf stat -- ls -lR > /dev/null
Performance counter stats for 'ls -lR':
2507744774 cycles # 2085.473 M/sec (scaled from 13.28%)
1515534968 instructions # 0.604 IPC (scaled from 13.28%)
783181797 cache-references # 651.304 M/sec (scaled from 36.36%)
18089523 cache-misses # 15.043 M/sec (scaled from 36.37%)
195550613 branches # 162.622 M/sec (scaled from 36.29%)
14623394 branch-misses # 12.161 M/sec (scaled from 36.29%)
<not counted> bus-cycles
1203.182949 cpu-clock-msecs
1202.482671 task-clock-msecs # 0.990 CPUs
454 page-faults # 0.000 M/sec
454 minor-faults # 0.000 M/sec
0 major-faults # 0.000 M/sec
133 context-switches # 0.000 M/sec
1 CPU-migrations # 0.000 M/sec
744421154 L1-data-Cache-Load-Referencees # 619.070 M/sec (scaled from 13.20%)
5220656 L1-data-Cache-Load-Misses # 4.342 M/sec (scaled from 13.28%)
438576 L1-data-Cache-Store-Referencees # 0.365 M/sec (scaled from 13.36%)
<not counted> L1-data-Cache-Store-Misses
1976596 L1-data-Cache-Prefetch-Referencees # 1.644 M/sec (scaled from 13.44%)
1644021 L1-data-Cache-Prefetch-Misses # 1.367 M/sec (scaled from 13.52%)
764273224 L1-instruction-Cache-Load-Referencees # 635.579 M/sec (scaled from 13.53%)
17242789 L1-instruction-Cache-Load-Misses # 14.339 M/sec (scaled from 13.53%)
<not counted> L1-instruction-Cache-Store-Referencees
<not counted> L1-instruction-Cache-Store-Misses
372621 L1-instruction-Cache-Prefetch-Referencees # 0.310 M/sec (scaled from 13.53%)
<not counted> L1-instruction-Cache-Prefetch-Misses
22844109 L2-Cache-Load-Referencees # 18.997 M/sec (scaled from 13.53%)
2235733 L2-Cache-Load-Misses # 1.859 M/sec (scaled from 13.53%)
23949920 L2-Cache-Store-Referencees # 19.917 M/sec (scaled from 13.46%)
<not counted> L2-Cache-Store-Misses
<not counted> L2-Cache-Prefetch-Referencees
<not counted> L2-Cache-Prefetch-Misses
732364670 Data-TLB-Cache-Load-Referencees # 609.044 M/sec (scaled from 13.45%)
16516548 Data-TLB-Cache-Load-Misses # 13.735 M/sec (scaled from 13.42%)
<not counted> Data-TLB-Cache-Store-Referencees
<not counted> Data-TLB-Cache-Store-Misses
<not counted> Data-TLB-Cache-Prefetch-Referencees
<not counted> Data-TLB-Cache-Prefetch-Misses
766865920 Instruction-TLB-Cache-Load-Referencees # 637.736 M/sec (scaled from 13.42%)
19981 Instruction-TLB-Cache-Load-Misses # 0.017 M/sec (scaled from 13.40%)
<not counted> Instruction-TLB-Cache-Store-Referencees
<not counted> Instruction-TLB-Cache-Store-Misses
<not counted> Instruction-TLB-Cache-Prefetch-Referencees
<not counted> Instruction-TLB-Cache-Prefetch-Misses
308272002 Branch-Cache-Load-Referencees # 256.363 M/sec (scaled from 13.33%)
19226358 Branch-Cache-Load-Misses # 15.989 M/sec (scaled from 13.28%)
<not counted> Branch-Cache-Store-Referencees
<not counted> Branch-Cache-Store-Misses
<not counted> Branch-Cache-Prefetch-Referencees
<not counted> Branch-Cache-Prefetch-Misses
1.214877275 seconds time elapsed.
Fix alignment and style problems, and remove dead code.
Increase the field width used to display event_name() (from 20 to 43 characters).
Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
tools/perf/builtin-stat.c | 111 ++++++++++++++++++++++++++++++++++-----------
1 files changed, 84 insertions(+), 27 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6d3eeac..a8b31f8 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -32,6 +32,7 @@
* Wu Fengguang <[email protected]>
* Mike Galbraith <[email protected]>
* Paul Mackerras <[email protected]>
+ * Jaswinder Singh <[email protected]>
*
* Released under the GPL v2. (and only v2, not any later version)
*/
@@ -45,32 +46,94 @@
#include <sys/prctl.h>
#include <math.h>
-static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
-
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
-
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+#define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
+#define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x
+#define CHCACHE(x, y, z) \
+.type = PERF_TYPE_HW_CACHE, \
+.config = (PERF_COUNT_HW_CACHE_##x | (PERF_COUNT_HW_CACHE_OP_##y << 8) |\
+ (PERF_COUNT_HW_CACHE_RESULT_##z << 16))
+
+static struct perf_counter_attr default_attrs[] = {
+
+/* Generalized Hardware events */
+ { CHW(CPU_CYCLES) },
+ { CHW(INSTRUCTIONS) },
+ { CHW(CACHE_REFERENCES) },
+ { CHW(CACHE_MISSES) },
+ { CHW(BRANCH_INSTRUCTIONS) },
+ { CHW(BRANCH_MISSES) },
+ { CHW(BUS_CYCLES) },
+
+/* Generalized Software events */
+ { CSW(CPU_CLOCK) },
+ { CSW(TASK_CLOCK) },
+ { CSW(PAGE_FAULTS) },
+ { CSW(PAGE_FAULTS_MIN) },
+ { CSW(PAGE_FAULTS_MAJ) },
+ { CSW(CONTEXT_SWITCHES) },
+ { CSW(CPU_MIGRATIONS) },
+
+/* Generalized Hardware cache counters events */
+ { CHCACHE(L1D, READ, ACCESS) },
+ { CHCACHE(L1D, READ, MISS) },
+ { CHCACHE(L1D, WRITE, ACCESS) },
+ { CHCACHE(L1D, WRITE, MISS) },
+ { CHCACHE(L1D, PREFETCH, ACCESS) },
+ { CHCACHE(L1D, PREFETCH, MISS) },
+
+ { CHCACHE(L1I, READ, ACCESS) },
+ { CHCACHE(L1I, READ, MISS) },
+ { CHCACHE(L1I, WRITE, ACCESS) },
+ { CHCACHE(L1I, WRITE, MISS) },
+ { CHCACHE(L1I, PREFETCH, ACCESS) },
+ { CHCACHE(L1I, PREFETCH, MISS) },
+
+ { CHCACHE(LL, READ, ACCESS) },
+ { CHCACHE(LL, READ, MISS) },
+ { CHCACHE(LL, WRITE, ACCESS) },
+ { CHCACHE(LL, WRITE, MISS) },
+ { CHCACHE(LL, PREFETCH, ACCESS) },
+ { CHCACHE(LL, PREFETCH, MISS) },
+
+ { CHCACHE(DTLB, READ, ACCESS) },
+ { CHCACHE(DTLB, READ, MISS) },
+ { CHCACHE(DTLB, WRITE, ACCESS) },
+ { CHCACHE(DTLB, WRITE, MISS) },
+ { CHCACHE(DTLB, PREFETCH, ACCESS) },
+ { CHCACHE(DTLB, PREFETCH, MISS) },
+
+ { CHCACHE(ITLB, READ, ACCESS) },
+ { CHCACHE(ITLB, READ, MISS) },
+ { CHCACHE(ITLB, WRITE, ACCESS) },
+ { CHCACHE(ITLB, WRITE, MISS) },
+ { CHCACHE(ITLB, PREFETCH, ACCESS) },
+ { CHCACHE(ITLB, PREFETCH, MISS) },
+
+ { CHCACHE(BPU, READ, ACCESS) },
+ { CHCACHE(BPU, READ, MISS) },
+ { CHCACHE(BPU, WRITE, ACCESS) },
+ { CHCACHE(BPU, WRITE, MISS) },
+ { CHCACHE(BPU, PREFETCH, ACCESS) },
+ { CHCACHE(BPU, PREFETCH, MISS) },
};
-static int system_wide = 0;
-static int inherit = 1;
-static int verbose = 0;
+#define MAX_RUN 100
static int fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int target_pid = -1;
+static int system_wide = 0;
static int nr_cpus = 0;
-static unsigned int page_size;
+static int verbose = 0;
+static int run_idx = 0;
+static int run_count = 1;
+static int target_pid = -1;
+static int inherit = 1;
static int scale = 1;
+static unsigned int page_size;
+
static const unsigned int default_count[] = {
1000000,
1000000,
@@ -80,17 +143,11 @@ static const unsigned int default_count[] = {
10000,
};
-#define MAX_RUN 100
-static int run_count = 1;
-static int run_idx = 0;
static u64 event_res[MAX_RUN][MAX_COUNTERS][3];
static u64 event_scaled[MAX_RUN][MAX_COUNTERS];
-//static u64 event_hist[MAX_RUN][MAX_COUNTERS][3];
-
-
static u64 runtime_nsecs[MAX_RUN];
static u64 walltime_nsecs[MAX_RUN];
static u64 runtime_cycles[MAX_RUN];
@@ -119,7 +176,7 @@ static void create_perf_stat_counter(int counter)
if (system_wide) {
int cpu;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
if (fd[cpu][counter] < 0 && verbose) {
printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
@@ -168,7 +225,7 @@ static void read_counter(int counter)
count[0] = count[1] = count[2] = 0;
nv = scale ? 3 : 1;
- for (cpu = 0; cpu < nr_cpus; cpu ++) {
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
if (fd[cpu][counter] < 0)
continue;
@@ -262,7 +319,7 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)
{
double msecs = (double)count[0] / 1000000;
- fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter));
+ fprintf(stderr, " %14.6f %-43s", msecs, event_name(counter));
if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
@@ -276,7 +333,7 @@ static void nsec_printout(int counter, u64 *count, u64 *noise)
static void abs_printout(int counter, u64 *count, u64 *noise)
{
- fprintf(stderr, " %14Ld %-20s", count[0], event_name(counter));
+ fprintf(stderr, " %14Ld %-43s", count[0], event_name(counter));
if (runtime_cycles_avg &&
attrs[counter].type == PERF_TYPE_HARDWARE &&
@@ -491,7 +548,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
usage_with_options(stat_usage, options);
if (!nr_counters)
- nr_counters = 8;
+ nr_counters = ARRAY_SIZE(default_attrs);
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
assert(nr_cpus <= MAX_NR_CPUS);
--
1.6.0.6
* Jaswinder Singh Rajput <[email protected]> wrote:
> Added more events not it looks like on AMD box :
>
> ./perf stat -- ls -lR > /dev/null
>
> Performance counter stats for 'ls -lR':
>
> 2507744774 cycles # 2085.473 M/sec (scaled from 13.28%)
> 1515534968 instructions # 0.604 IPC (scaled from 13.28%)
> 783181797 cache-references # 651.304 M/sec (scaled from 36.36%)
> 18089523 cache-misses # 15.043 M/sec (scaled from 36.37%)
> 195550613 branches # 162.622 M/sec (scaled from 36.29%)
> 14623394 branch-misses # 12.161 M/sec (scaled from 36.29%)
> <not counted> bus-cycles
> 1203.182949 cpu-clock-msecs
> 1202.482671 task-clock-msecs # 0.990 CPUs
> 454 page-faults # 0.000 M/sec
> 454 minor-faults # 0.000 M/sec
> 0 major-faults # 0.000 M/sec
> 133 context-switches # 0.000 M/sec
> 1 CPU-migrations # 0.000 M/sec
> 744421154 L1-data-Cache-Load-Referencees # 619.070 M/sec (scaled from 13.20%)
> 5220656 L1-data-Cache-Load-Misses # 4.342 M/sec (scaled from 13.28%)
> 438576 L1-data-Cache-Store-Referencees # 0.365 M/sec (scaled from 13.36%)
> <not counted> L1-data-Cache-Store-Misses
> 1976596 L1-data-Cache-Prefetch-Referencees # 1.644 M/sec (scaled from 13.44%)
> 1644021 L1-data-Cache-Prefetch-Misses # 1.367 M/sec (scaled from 13.52%)
> 764273224 L1-instruction-Cache-Load-Referencees # 635.579 M/sec (scaled from 13.53%)
> 17242789 L1-instruction-Cache-Load-Misses # 14.339 M/sec (scaled from 13.53%)
> <not counted> L1-instruction-Cache-Store-Referencees
> <not counted> L1-instruction-Cache-Store-Misses
> 372621 L1-instruction-Cache-Prefetch-Referencees # 0.310 M/sec (scaled from 13.53%)
> <not counted> L1-instruction-Cache-Prefetch-Misses
> 22844109 L2-Cache-Load-Referencees # 18.997 M/sec (scaled from 13.53%)
> 2235733 L2-Cache-Load-Misses # 1.859 M/sec (scaled from 13.53%)
> 23949920 L2-Cache-Store-Referencees # 19.917 M/sec (scaled from 13.46%)
> <not counted> L2-Cache-Store-Misses
> <not counted> L2-Cache-Prefetch-Referencees
> <not counted> L2-Cache-Prefetch-Misses
> 732364670 Data-TLB-Cache-Load-Referencees # 609.044 M/sec (scaled from 13.45%)
> 16516548 Data-TLB-Cache-Load-Misses # 13.735 M/sec (scaled from 13.42%)
> <not counted> Data-TLB-Cache-Store-Referencees
> <not counted> Data-TLB-Cache-Store-Misses
> <not counted> Data-TLB-Cache-Prefetch-Referencees
> <not counted> Data-TLB-Cache-Prefetch-Misses
> 766865920 Instruction-TLB-Cache-Load-Referencees # 637.736 M/sec (scaled from 13.42%)
> 19981 Instruction-TLB-Cache-Load-Misses # 0.017 M/sec (scaled from 13.40%)
> <not counted> Instruction-TLB-Cache-Store-Referencees
> <not counted> Instruction-TLB-Cache-Store-Misses
> <not counted> Instruction-TLB-Cache-Prefetch-Referencees
> <not counted> Instruction-TLB-Cache-Prefetch-Misses
> 308272002 Branch-Cache-Load-Referencees # 256.363 M/sec (scaled from 13.33%)
> 19226358 Branch-Cache-Load-Misses # 15.989 M/sec (scaled from 13.28%)
> <not counted> Branch-Cache-Store-Referencees
> <not counted> Branch-Cache-Store-Misses
> <not counted> Branch-Cache-Prefetch-Referencees
> <not counted> Branch-Cache-Prefetch-Misses
Looks useful - but it would be nice to not touch the default 'perf
stat' output but instead offer a few 'sets' of pre-defined events,
which can be specified in the event list, such as:
perf stat -e cache-events
perf stat -e all-cache-events
perf stat -e sw-events
Perhaps also a:
perf stat -e all
To get output from all counters that we know about.
Regex matching on event specifiers would be useful too - there's
already regex code in perf-report, see the --parent option.
Ingo