$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
Performance counter stats for 'ls -lR /usr/include/':
7335 add ( 2.00x scaled)
8012 multiply ( 1.99x scaled)
5229 fpu-store ( 2.00x scaled)
793097355 fpu-empty ( 2.00x scaled)
182 fpu-busy ( 2.00x scaled)
6 x87 ( 2.01x scaled)
4 mmx-3dnow ( 2.00x scaled)
8933 sse-sse2 ( 2.00x scaled)
0.393548820 seconds time elapsed
$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
19583739 add ( 2.01x scaled)
20856051 multiply ( 2.01x scaled)
18669503 fpu-store ( 2.00x scaled)
25100224054 fpu-empty ( 1.99x scaled)
12540131 fpu-busy ( 1.99x scaled)
207228 x87 ( 1.99x scaled)
1768418 mmx-3dnow ( 2.00x scaled)
42286702 sse-sse2 ( 2.01x scaled)
302.698647617 seconds time elapsed
$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
6572682335 add ( 2.00x scaled)
11131555181 multiply ( 2.00x scaled)
1317520699 fpu-store ( 2.00x scaled)
9089415134 fpu-empty ( 1.99x scaled)
2902772713 fpu-busy ( 2.00x scaled)
26047 x87 ( 2.00x scaled)
24850978532 mmx-3dnow ( 2.00x scaled)
262276117 sse-sse2 ( 2.01x scaled)
96.169312358 seconds time elapsed
Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 17 +++++++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
4 files changed, 92 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b83474b..4417edf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
},
};
+/*
+ * Generalized hw fpu event table
+ */
+
+static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +487,18 @@ static const u64 amd_hw_cache_event_ids
},
};
+static const u64 amd_hw_fpu_event_ids[] =
+{
+ [PERF_COUNT_HW_FPU_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_FPU_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_FPU_STORE] = 0x0400, /* Dispatched FPU Store */
+ [PERF_COUNT_HW_FPU_EMPTY] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_FPU_BUSY] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_FPU_X87_INSTR] = 0x01CB, /* Retired x87 Instructions*/
+ [PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR] = 0x02CB, /* Retired MMX & 3DNow Inst*/
+ [PERF_COUNT_HW_FPU_SSE_SSE2_INSTR] = 0x0CCB, /* Retired SSE & SSE2 Instr*/
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +677,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}
+static inline int
+set_hw_fpu_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_FPU_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_fpu_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +745,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);
+ if (attr->type == PERF_TYPE_HW_FPU)
+ return set_hw_fpu_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1468,6 +1500,8 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_fpu_event_ids, amd_hw_fpu_event_ids,
+ sizeof(hw_fpu_event_ids));
return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3078e23..89b3370 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_FPU = 5,
PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +90,22 @@ enum perf_hw_cache_op_result_id {
};
/*
+ * Generalized hardware FPU counters:
+ */
+enum perf_hw_fpu_id {
+ PERF_COUNT_HW_FPU_ADD = 0,
+ PERF_COUNT_HW_FPU_MULTIPLY = 1,
+ PERF_COUNT_HW_FPU_STORE = 2,
+ PERF_COUNT_HW_FPU_EMPTY = 3,
+ PERF_COUNT_HW_FPU_BUSY = 4,
+ PERF_COUNT_HW_FPU_X87_INSTR = 5,
+ PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR = 6,
+ PERF_COUNT_HW_FPU_SSE_SSE2_INSTR = 7,
+
+ PERF_COUNT_HW_FPU_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 66ab1e9..c40132f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3788,6 +3788,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_FPU:
pmu = hw_perf_counter_init(counter);
break;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4d042f1..4d03061 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,19 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};
+#define CHFPU(x) .type = PERF_TYPE_HW_FPU, .config = PERF_COUNT_HW_FPU_##x
+
+static struct event_symbol fpu_event_symbols[] = {
+ { CHFPU(ADD), "add", "addition" },
+ { CHFPU(MULTIPLY), "multiply", "multiplication"},
+ { CHFPU(STORE), "fpu-store", "" },
+ { CHFPU(EMPTY), "fpu-empty", "" },
+ { CHFPU(BUSY), "fpu-busy", "" },
+ { CHFPU(X87_INSTR), "x87", "" },
+ { CHFPU(MMX_3DNOW_INSTR), "mmx-3dnow", "" },
+ { CHFPU(SSE_SSE2_INSTR), "sse-sse2", "sse" },
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
@@ -172,6 +185,11 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}
+ case PERF_TYPE_HW_FPU:
+ if (config < PERF_COUNT_HW_FPU_MAX)
+ return fpu_event_symbols[config].symbol;
+ return "unknown-fpu";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -250,6 +268,19 @@ static int check_events(const char *str, unsigned int i)
return 0;
}
+static int check_fpu_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, fpu_event_symbols[i].symbol,
+ strlen(fpu_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(fpu_event_symbols[i].alias))
+ if (!strncmp(str, fpu_event_symbols[i].alias,
+ strlen(fpu_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
@@ -297,6 +328,15 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
}
}
+ for (i = 0; i < ARRAY_SIZE(fpu_event_symbols); i++) {
+ if (check_fpu_events(str, i)) {
+ attr->type = fpu_event_symbols[i].type;
+ attr->config = fpu_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
return parse_generic_hw_symbols(str, attr);
}
--
1.6.0.6
* Jaswinder Singh Rajput <[email protected]> wrote:
> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
>
> Performance counter stats for 'ls -lR /usr/include/':
>
> 7335 add ( 2.00x scaled)
> 8012 multiply ( 1.99x scaled)
> 5229 fpu-store ( 2.00x scaled)
> 793097355 fpu-empty ( 2.00x scaled)
> 182 fpu-busy ( 2.00x scaled)
> 6 x87 ( 2.01x scaled)
> 4 mmx-3dnow ( 2.00x scaled)
> 8933 sse-sse2 ( 2.00x scaled)
>
> 0.393548820 seconds time elapsed
>
> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
>
> Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
>
> 19583739 add ( 2.01x scaled)
> 20856051 multiply ( 2.01x scaled)
> 18669503 fpu-store ( 2.00x scaled)
> 25100224054 fpu-empty ( 1.99x scaled)
> 12540131 fpu-busy ( 1.99x scaled)
> 207228 x87 ( 1.99x scaled)
> 1768418 mmx-3dnow ( 2.00x scaled)
> 42286702 sse-sse2 ( 2.01x scaled)
>
> 302.698647617 seconds time elapsed
>
> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
>
> Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
>
> 6572682335 add ( 2.00x scaled)
> 11131555181 multiply ( 2.00x scaled)
> 1317520699 fpu-store ( 2.00x scaled)
> 9089415134 fpu-empty ( 1.99x scaled)
> 2902772713 fpu-busy ( 2.00x scaled)
> 26047 x87 ( 2.00x scaled)
> 24850978532 mmx-3dnow ( 2.00x scaled)
> 262276117 sse-sse2 ( 2.01x scaled)
>
> 96.169312358 seconds time elapsed
>
> Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> include/linux/perf_counter.h | 17 +++++++++++++++
> kernel/perf_counter.c | 1 +
> tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> 4 files changed, 92 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> index b83474b..4417edf 100644
> --- a/arch/x86/kernel/cpu/perf_counter.c
> +++ b/arch/x86/kernel/cpu/perf_counter.c
> @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> },
> };
>
> +/*
> + * Generalized hw fpu event table
> + */
> +
> +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
ok, this looks genuinely useful, but there are some gaps. Where are
the divides? Plus things like mmx-3dnow are AMD specific, sse-sse2
is x86 specific. We definitely want this general table, but the
events should be truly general.
Also, what would this look like on Intel, roughly?
Ingo
On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> >
> > Performance counter stats for 'ls -lR /usr/include/':
> >
> > 7335 add ( 2.00x scaled)
> > 8012 multiply ( 1.99x scaled)
> > 5229 fpu-store ( 2.00x scaled)
> > 793097355 fpu-empty ( 2.00x scaled)
> > 182 fpu-busy ( 2.00x scaled)
> > 6 x87 ( 2.01x scaled)
> > 4 mmx-3dnow ( 2.00x scaled)
> > 8933 sse-sse2 ( 2.00x scaled)
> >
> > 0.393548820 seconds time elapsed
> >
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> >
> > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> >
> > 19583739 add ( 2.01x scaled)
> > 20856051 multiply ( 2.01x scaled)
> > 18669503 fpu-store ( 2.00x scaled)
> > 25100224054 fpu-empty ( 1.99x scaled)
> > 12540131 fpu-busy ( 1.99x scaled)
> > 207228 x87 ( 1.99x scaled)
> > 1768418 mmx-3dnow ( 2.00x scaled)
> > 42286702 sse-sse2 ( 2.01x scaled)
> >
> > 302.698647617 seconds time elapsed
> >
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> >
> > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> >
> > 6572682335 add ( 2.00x scaled)
> > 11131555181 multiply ( 2.00x scaled)
> > 1317520699 fpu-store ( 2.00x scaled)
> > 9089415134 fpu-empty ( 1.99x scaled)
> > 2902772713 fpu-busy ( 2.00x scaled)
> > 26047 x87 ( 2.00x scaled)
> > 24850978532 mmx-3dnow ( 2.00x scaled)
> > 262276117 sse-sse2 ( 2.01x scaled)
> >
> > 96.169312358 seconds time elapsed
> >
> > Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> > ---
> > arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> > include/linux/perf_counter.h | 17 +++++++++++++++
> > kernel/perf_counter.c | 1 +
> > tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> > 4 files changed, 92 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > index b83474b..4417edf 100644
> > --- a/arch/x86/kernel/cpu/perf_counter.c
> > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> > },
> > };
> >
> > +/*
> > + * Generalized hw fpu event table
> > + */
> > +
> > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
>
> ok, this looks genuinely useful, but there are some gaps. Where's
> the divides?
I was also surprised that divide is not available for AMD. That's why I
did not include it. You are right, it should be there.
> Plus things like mmx-3dnow are AMD specific, sse-sse2
> is x86 specific. We definitely want this general table, but the
> events should be truly general.
>
mmx and sse are available for both Intel and AMD. That's why I added both
of them. Is that OK?
> Also, how would this look like on Intel, roughly?
>
Intel has almost all of them + divide.
As you know, I work from home and I do not have any Intel machine which
supports the PMU.
Can you tell me your machine name so that I can prepare the FPU events
list for your machine and you can verify it on your side?
Thanks,
--
JSR
On Tue, 2009-06-30 at 18:50 +0530, Jaswinder Singh Rajput wrote:
> On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> > >
> > > Performance counter stats for 'ls -lR /usr/include/':
> > >
> > > 7335 add ( 2.00x scaled)
> > > 8012 multiply ( 1.99x scaled)
> > > 5229 fpu-store ( 2.00x scaled)
> > > 793097355 fpu-empty ( 2.00x scaled)
> > > 182 fpu-busy ( 2.00x scaled)
> > > 6 x87 ( 2.01x scaled)
> > > 4 mmx-3dnow ( 2.00x scaled)
> > > 8933 sse-sse2 ( 2.00x scaled)
> > >
> > > 0.393548820 seconds time elapsed
> > >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> > >
> > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > >
> > > 19583739 add ( 2.01x scaled)
> > > 20856051 multiply ( 2.01x scaled)
> > > 18669503 fpu-store ( 2.00x scaled)
> > > 25100224054 fpu-empty ( 1.99x scaled)
> > > 12540131 fpu-busy ( 1.99x scaled)
> > > 207228 x87 ( 1.99x scaled)
> > > 1768418 mmx-3dnow ( 2.00x scaled)
> > > 42286702 sse-sse2 ( 2.01x scaled)
> > >
> > > 302.698647617 seconds time elapsed
> > >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> > >
> > > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > >
> > > 6572682335 add ( 2.00x scaled)
> > > 11131555181 multiply ( 2.00x scaled)
> > > 1317520699 fpu-store ( 2.00x scaled)
> > > 9089415134 fpu-empty ( 1.99x scaled)
> > > 2902772713 fpu-busy ( 2.00x scaled)
> > > 26047 x87 ( 2.00x scaled)
> > > 24850978532 mmx-3dnow ( 2.00x scaled)
> > > 262276117 sse-sse2 ( 2.01x scaled)
> > >
> > > 96.169312358 seconds time elapsed
> > >
> > > Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> > > ---
> > > arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> > > include/linux/perf_counter.h | 17 +++++++++++++++
> > > kernel/perf_counter.c | 1 +
> > > tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> > > 4 files changed, 92 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > > index b83474b..4417edf 100644
> > > --- a/arch/x86/kernel/cpu/perf_counter.c
> > > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> > > },
> > > };
> > >
> > > +/*
> > > + * Generalized hw fpu event table
> > > + */
> > > +
> > > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
> >
> > ok, this looks genuinely useful, but there are some gaps. Where's
> > the divides?
>
> I was also surprised divide is not available for AMD. Thats why I did
> not included it. You are right it should be there.
>
On AMD, FPU operations include add, multiply and store.
Can I use store as divide for AMD? The samples I have shown above look
like they are divides.
> > Plus things like mmx-3dnow are AMD specific, sse-sse2
> > is x86 specific. We definitely want this general table, but the
> > events should be truly general.
> >
>
> mmx and sse are available for both Intel and AMD. Thats why I added both
> of them. Is it OK.
>
Does this look OK:
enum perf_hw_fpu_id {
PERF_COUNT_HW_FPU_ADD = 0,
PERF_COUNT_HW_FPU_MULTIPLY = 1,
PERF_COUNT_HW_FPU_DIVIDE = 2,
PERF_COUNT_HW_FPU_EMPTY = 3,
PERF_COUNT_HW_FPU_STALL = 4,
PERF_COUNT_HW_FPU_X87 = 5,
PERF_COUNT_HW_FPU_MMX = 6,
PERF_COUNT_HW_FPU_SSE = 7,
PERF_COUNT_HW_FPU_MAX, /* non-ABI */
> > Also, how would this look like on Intel, roughly?
> >
>
> Intel have almost all of them + divide.
>
> As you know I work from home and I do not have any Intel machine which
> supports PMU.
>
> Can you suggest your machine name so that I can prepare the FPU events
> list for your machine and you can verify it on your side.
>
For Nehalem it will look like :
static const u64 nehalem_hw_fpu_event_ids[] =
{
[PERF_COUNT_HW_FPU_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
[PERF_COUNT_HW_FPU_MULTIPLY] = 0x0214, /* ARITH.MUL */
[PERF_COUNT_HW_FPU_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
[PERF_COUNT_HW_FPU_EMPTY] = 0x0,
[PERF_COUNT_HW_FPU_STALL] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
[PERF_COUNT_HW_FPU_X87] = 0x0110, /* FP_COMP_OPS_EXE.X87 */
[PERF_COUNT_HW_FPU_MMX] = 0x0210, /* FP_COMP_OPS_EXE.MMX */
[PERF_COUNT_HW_FPU_SSE] = 0x0410, /* FP_COMP_OPS_EXE.SSE_FP */
};
Do these look OK to you? Can I resend the patch based on these?
Thanks,
--
JSR
* Jaswinder Singh Rajput <[email protected]> wrote:
> > Plus things like mmx-3dnow are AMD specific, sse-sse2 is x86
> > specific. We definitely want this general table, but the events
> > should be truly general.
>
> mmx and sse are available for both Intel and AMD. Thats why I
> added both of them. Is it OK.
'3dnow' is an AMD marketing term. (and is long obsolete)
Nor did you answer (or understand) my sentence above: 'sse' is an
x86 specific term.
I think a naming and enumeration scheme around the general concept
of 'vectored co-processor' would be far less x86 specific.
Mockup:
19583739 vec-adds ( 2.01x scaled)
20856051 vec-muls ( 2.01x scaled)
20856051 vec-divs ( 2.01x scaled)
25100224054 vec-idle-cycles ( 1.99x scaled)
12540131 vec-busy-cycles ( 1.99x scaled)
42286702 vec-ops ( 2.01x scaled)
Paulus: would this categorization fit PowerPC too?
Ingo
> '3dnow' is an AMD marketing term. (and is long obsolete)
And an instruction set extension as well, so different from SSE
(also be careful of MMX, as there is MMX and MMX-EXT (Cyrix/AMD)).
Ingo Molnar writes:
> 19583739 vec-adds ( 2.01x scaled)
> 20856051 vec-muls ( 2.01x scaled)
> 20856051 vec-divs ( 2.01x scaled)
> 25100224054 vec-idle-cycles ( 1.99x scaled)
> 12540131 vec-busy-cycles ( 1.99x scaled)
> 42286702 vec-ops ( 2.01x scaled)
>
> Paulus: would this categorization fit PowerPC too?
Conceptually that looks nice, but unfortunately we don't have events
that correspond to that categorization on any PowerPC with vector
hardware (VMX/Altivec). POWER6 seems to have the most vector events,
and they are mostly divided up along the lines of simple / complex /
permute / load / store operations, and whether they are integer or
floating-point operations.
Here are the vector-related events we have on POWER6:
MRK_VMX0_LD_WRBACK Marked VMX0 load writeback valid
MRK_VMX1_LD_WRBACK Marked VMX1 load writeback valid
MRK_VMX_COMPLEX_ISSUED Marked VMX instruction issued to complex
MRK_VMX_FLOAT_ISSUED Marked VMX instruction issued to float
MRK_VMX_PERMUTE_ISSUED Marked VMX instruction issued to permute
MRK_VMX_SIMPLE_ISSUED Marked VMX instruction issued to simple
MRK_VMX_ST_ISSUED Marked VMX store issued
VMX0_INST_ISSUED VMX0 instruction issued
VMX0_LD_ISSUED VMX0 load issued
VMX0_LD_WRBACK VMX0 load writeback valid
VMX0_STALL VMX0 stall
VMX1_INST_ISSUED VMX1 instruction issued
VMX1_LD_ISSUED VMX1 load issued
VMX1_LD_WRBACK VMX1 load writeback valid
VMX1_STALL VMX1 stall
VMX_COMPLEX_ISSUED VMX instruction issued to complex
VMX_FLOAT_ISSUED VMX instruction issued to float
VMX_FLOAT_MULTICYCLE VMX multi-cycle floating point instruction issued
VMX_PERMUTE_ISSUED VMX instruction issued to permute
VMX_RESULT_SAT_0_1 VMX valid result with sat bit is set (0->1)
VMX_RESULT_SAT_1 VMX valid result with sat=1
VMX_SIMPLE_ISSUED VMX instruction issued to simple
VMX_ST_ISSUED VMX store issued
I'm not sure what the exact distinction is between VMX0 and VMX1.
I'll find out.
The MPC7450 (G4, 32-bit) cpu also has quite a few VMX/Altivec events,
such as counts of cycles that individual vector units are waiting for
operands, but not counts of how many vector add or vector multiply
operations are done.
Paul.
* Paul Mackerras <[email protected]> wrote:
> Ingo Molnar writes:
>
> > 19583739 vec-adds ( 2.01x scaled)
> > 20856051 vec-muls ( 2.01x scaled)
> > 20856051 vec-divs ( 2.01x scaled)
> > 25100224054 vec-idle-cycles ( 1.99x scaled)
> > 12540131 vec-busy-cycles ( 1.99x scaled)
> > 42286702 vec-ops ( 2.01x scaled)
> >
> > Paulus: would this categorization fit PowerPC too?
>
> Conceptually that looks nice, but unfortunately we don't have
> events that correspond to that categorization on any PowerPC with
> vector hardware (VMX/Altivec). POWER6 seems to have the most
> vector events, and they are mostly divided up along the lines of
> simple / complex / permute / load / store operations, and whether
> they are integer or floating-point operations.
Here's what we have on x86:
20177177044 vec-adds (scaled from 66.63%)
34101687027 vec-muls (scaled from 66.64%)
3984060862 vec-divs (scaled from 66.71%)
26349684710 vec-idle-cycles (scaled from 66.65%)
9052001905 vec-stall-cycles (scaled from 66.66%)
76440734242 vec-ops (scaled from 66.71%)
Could at least the idle/busy/stall/total generic stats be filled in
on powerpc, with a reasonable enough approximation? Those
utilization metrics are the most important ones when one tries to
figure out how well utilized the vector units are.
Ingo
On Wed, 2009-07-01 at 22:33 +1000, Paul Mackerras wrote:
> Ingo Molnar writes:
>
> > 19583739 vec-adds ( 2.01x scaled)
> > 20856051 vec-muls ( 2.01x scaled)
> > 20856051 vec-divs ( 2.01x scaled)
> > 25100224054 vec-idle-cycles ( 1.99x scaled)
> > 12540131 vec-busy-cycles ( 1.99x scaled)
> > 42286702 vec-ops ( 2.01x scaled)
> >
> > Paulus: would this categorization fit PowerPC too?
>
> Conceptually that looks nice, but unfortunately we don't have events
> that correspond to that categorization on any PowerPC with vector
> hardware (VMX/Altivec). POWER6 seems to have the most vector events,
> and they are mostly divided up along the lines of simple / complex /
> permute / load / store operations, and whether they are integer or
> floating-point operations.
>
> Here are the vector-related events we have on POWER6:
>
> MRK_VMX0_LD_WRBACK Marked VMX0 load writeback valid
> MRK_VMX1_LD_WRBACK Marked VMX1 load writeback valid
> MRK_VMX_COMPLEX_ISSUED Marked VMX instruction issued to complex
> MRK_VMX_FLOAT_ISSUED Marked VMX instruction issued to float
> MRK_VMX_PERMUTE_ISSUED Marked VMX instruction issued to permute
> MRK_VMX_SIMPLE_ISSUED Marked VMX instruction issued to simple
> MRK_VMX_ST_ISSUED Marked VMX store issued
> VMX0_INST_ISSUED VMX0 instruction issued
> VMX0_LD_ISSUED VMX0 load issued
> VMX0_LD_WRBACK VMX0 load writeback valid
> VMX0_STALL VMX0 stall
> VMX1_INST_ISSUED VMX1 instruction issued
> VMX1_LD_ISSUED VMX1 load issued
> VMX1_LD_WRBACK VMX1 load writeback valid
> VMX1_STALL VMX1 stall
> VMX_COMPLEX_ISSUED VMX instruction issued to complex
> VMX_FLOAT_ISSUED VMX instruction issued to float
> VMX_FLOAT_MULTICYCLE VMX multi-cycle floating point instruction issued
> VMX_PERMUTE_ISSUED VMX instruction issued to permute
> VMX_RESULT_SAT_0_1 VMX valid result with sat bit is set (0->1)
> VMX_RESULT_SAT_1 VMX valid result with sat=1
> VMX_SIMPLE_ISSUED VMX instruction issued to simple
> VMX_ST_ISSUED VMX store issued
>
> I'm not sure what the exact distinction is between VMX0 and VMX1.
> I'll find out.
>
I am just guessing for powerpc: normally different units serve
different purposes, e.g. some do addition/multiplication and others do
division.
For example, on Intel Core i7/Nehalem:
UOPS_EXECUTED.PORT0: Counts number of Uops executed
that were issued on port 0. Port 0
handles integer arithmetic, SIMD
and FP add Uops.
UOPS_EXECUTED.PORT1: Counts number of Uops executed
that were issued on port 1. Port 1
handles integer arithmetic, SIMD,
integer shift, FP multiply and FP
divide Uops.
Can you provide me with a link to the hardware manual so that I can check
it out?
Thanks,
--
JSR