2009-06-29 09:34:23

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD


$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null

Performance counter stats for 'ls -lR /usr/include/':

7335 add ( 2.00x scaled)
8012 multiply ( 1.99x scaled)
5229 fpu-store ( 2.00x scaled)
793097355 fpu-empty ( 2.00x scaled)
182 fpu-busy ( 2.00x scaled)
6 x87 ( 2.01x scaled)
4 mmx-3dnow ( 2.00x scaled)
8933 sse-sse2 ( 2.00x scaled)

0.393548820 seconds time elapsed

$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3

Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':

19583739 add ( 2.01x scaled)
20856051 multiply ( 2.01x scaled)
18669503 fpu-store ( 2.00x scaled)
25100224054 fpu-empty ( 1.99x scaled)
12540131 fpu-busy ( 1.99x scaled)
207228 x87 ( 1.99x scaled)
1768418 mmx-3dnow ( 2.00x scaled)
42286702 sse-sse2 ( 2.01x scaled)

302.698647617 seconds time elapsed

$./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv

Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':

6572682335 add ( 2.00x scaled)
11131555181 multiply ( 2.00x scaled)
1317520699 fpu-store ( 2.00x scaled)
9089415134 fpu-empty ( 1.99x scaled)
2902772713 fpu-busy ( 2.00x scaled)
26047 x87 ( 2.00x scaled)
24850978532 mmx-3dnow ( 2.00x scaled)
262276117 sse-sse2 ( 2.01x scaled)

96.169312358 seconds time elapsed

Signed-off-by: Jaswinder Singh Rajput <[email protected]>
---
arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
include/linux/perf_counter.h | 17 +++++++++++++++
kernel/perf_counter.c | 1 +
tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
4 files changed, 92 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b83474b..4417edf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
},
};

+/*
+ * Generalized hw fpu event table
+ */
+
+static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
+
static u64 intel_pmu_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
@@ -481,6 +487,18 @@ static const u64 amd_hw_cache_event_ids
},
};

+static const u64 amd_hw_fpu_event_ids[] =
+{
+ [PERF_COUNT_HW_FPU_ADD] = 0x0100, /* Dispatched FPU Add */
+ [PERF_COUNT_HW_FPU_MULTIPLY] = 0x0200, /* Dispatched FPU Multiply */
+ [PERF_COUNT_HW_FPU_STORE] = 0x0400, /* Dispatched FPU Store */
+ [PERF_COUNT_HW_FPU_EMPTY] = 0x0001, /* FPU Empty cycles */
+ [PERF_COUNT_HW_FPU_BUSY] = 0x00D7, /* Dispatch stall for FPU */
+ [PERF_COUNT_HW_FPU_X87_INSTR] = 0x01CB, /* Retired x87 Instructions*/
+ [PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR] = 0x02CB, /* Retired MMX & 3DNow Inst*/
+ [PERF_COUNT_HW_FPU_SSE_SSE2_INSTR] = 0x0CCB, /* Retired SSE & SSE2 Instr*/
+};
+
/*
* AMD Performance Monitor K7 and later.
*/
@@ -659,6 +677,17 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}

+static inline int
+set_hw_fpu_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ if (attr->config >= PERF_COUNT_HW_FPU_MAX)
+ return -EINVAL;
+
+ hwc->config |= hw_fpu_event_ids[attr->config];
+
+ return 0;
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -716,6 +745,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);

+ if (attr->type == PERF_TYPE_HW_FPU)
+ return set_hw_fpu_attr(hwc, attr);
+
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
/*
@@ -1468,6 +1500,8 @@ static int amd_pmu_init(void)
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_fpu_event_ids, amd_hw_fpu_event_ids,
+ sizeof(hw_fpu_event_ids));

return 0;
}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3078e23..89b3370 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -31,6 +31,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
+ PERF_TYPE_HW_FPU = 5,

PERF_TYPE_MAX, /* non-ABI */
};
@@ -89,6 +90,22 @@ enum perf_hw_cache_op_result_id {
};

/*
+ * Generalized hardware FPU counters:
+ */
+enum perf_hw_fpu_id {
+ PERF_COUNT_HW_FPU_ADD = 0,
+ PERF_COUNT_HW_FPU_MULTIPLY = 1,
+ PERF_COUNT_HW_FPU_STORE = 2,
+ PERF_COUNT_HW_FPU_EMPTY = 3,
+ PERF_COUNT_HW_FPU_BUSY = 4,
+ PERF_COUNT_HW_FPU_X87_INSTR = 5,
+ PERF_COUNT_HW_FPU_MMX_3DNOW_INSTR = 6,
+ PERF_COUNT_HW_FPU_SSE_SSE2_INSTR = 7,
+
+ PERF_COUNT_HW_FPU_MAX, /* non-ABI */
+};
+
+/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 66ab1e9..c40132f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3788,6 +3788,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
+ case PERF_TYPE_HW_FPU:
pmu = hw_perf_counter_init(counter);
break;

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4d042f1..4d03061 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,19 @@ static struct event_symbol event_symbols[] = {
{ CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" },
};

+#define CHFPU(x) .type = PERF_TYPE_HW_FPU, .config = PERF_COUNT_HW_FPU_##x
+
+static struct event_symbol fpu_event_symbols[] = {
+ { CHFPU(ADD), "add", "addition" },
+ { CHFPU(MULTIPLY), "multiply", "multiplication"},
+ { CHFPU(STORE), "fpu-store", "" },
+ { CHFPU(EMPTY), "fpu-empty", "" },
+ { CHFPU(BUSY), "fpu-busy", "" },
+ { CHFPU(X87_INSTR), "x87", "" },
+ { CHFPU(MMX_3DNOW_INSTR), "mmx-3dnow", "" },
+ { CHFPU(SSE_SSE2_INSTR), "sse-sse2", "sse" },
+};
+
#define __PERF_COUNTER_FIELD(config, name) \
((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

@@ -172,6 +185,11 @@ char *event_name(int counter)
return event_cache_name(cache_type, cache_op, cache_result);
}

+ case PERF_TYPE_HW_FPU:
+ if (config < PERF_COUNT_HW_FPU_MAX)
+ return fpu_event_symbols[config].symbol;
+ return "unknown-fpu";
+
case PERF_TYPE_SOFTWARE:
if (config < PERF_COUNT_SW_MAX)
return sw_event_names[config];
@@ -250,6 +268,19 @@ static int check_events(const char *str, unsigned int i)
return 0;
}

+static int check_fpu_events(const char *str, unsigned int i)
+{
+ if (!strncmp(str, fpu_event_symbols[i].symbol,
+ strlen(fpu_event_symbols[i].symbol)))
+ return 1;
+
+ if (strlen(fpu_event_symbols[i].alias))
+ if (!strncmp(str, fpu_event_symbols[i].alias,
+ strlen(fpu_event_symbols[i].alias)))
+ return 1;
+ return 0;
+}
+
/*
* Each event can have multiple symbolic names.
* Symbolic names are (almost) exactly matched.
@@ -297,6 +328,15 @@ static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
}
}

+ for (i = 0; i < ARRAY_SIZE(fpu_event_symbols); i++) {
+ if (check_fpu_events(str, i)) {
+ attr->type = fpu_event_symbols[i].type;
+ attr->config = fpu_event_symbols[i].config;
+
+ return 0;
+ }
+ }
+
return parse_generic_hw_symbols(str, attr);
}

--
1.6.0.6



2009-06-30 10:11:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
>
> Performance counter stats for 'ls -lR /usr/include/':
>
> 7335 add ( 2.00x scaled)
> 8012 multiply ( 1.99x scaled)
> 5229 fpu-store ( 2.00x scaled)
> 793097355 fpu-empty ( 2.00x scaled)
> 182 fpu-busy ( 2.00x scaled)
> 6 x87 ( 2.01x scaled)
> 4 mmx-3dnow ( 2.00x scaled)
> 8933 sse-sse2 ( 2.00x scaled)
>
> 0.393548820 seconds time elapsed
>
> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
>
> Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
>
> 19583739 add ( 2.01x scaled)
> 20856051 multiply ( 2.01x scaled)
> 18669503 fpu-store ( 2.00x scaled)
> 25100224054 fpu-empty ( 1.99x scaled)
> 12540131 fpu-busy ( 1.99x scaled)
> 207228 x87 ( 1.99x scaled)
> 1768418 mmx-3dnow ( 2.00x scaled)
> 42286702 sse-sse2 ( 2.01x scaled)
>
> 302.698647617 seconds time elapsed
>
> $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
>
> Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
>
> 6572682335 add ( 2.00x scaled)
> 11131555181 multiply ( 2.00x scaled)
> 1317520699 fpu-store ( 2.00x scaled)
> 9089415134 fpu-empty ( 1.99x scaled)
> 2902772713 fpu-busy ( 2.00x scaled)
> 26047 x87 ( 2.00x scaled)
> 24850978532 mmx-3dnow ( 2.00x scaled)
> 262276117 sse-sse2 ( 2.01x scaled)
>
> 96.169312358 seconds time elapsed
>
> Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> ---
> arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> include/linux/perf_counter.h | 17 +++++++++++++++
> kernel/perf_counter.c | 1 +
> tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> 4 files changed, 92 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> index b83474b..4417edf 100644
> --- a/arch/x86/kernel/cpu/perf_counter.c
> +++ b/arch/x86/kernel/cpu/perf_counter.c
> @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> },
> };
>
> +/*
> + * Generalized hw fpu event table
> + */
> +
> +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];

OK, this looks genuinely useful, but there are some gaps. Where are
the divides? Plus, things like mmx-3dnow are AMD-specific, and sse-sse2
is x86-specific. We definitely want this general table, but the
events should be truly general.

Also, what would this look like on Intel, roughly?

Ingo

2009-06-30 13:21:35

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD

On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> * Jaswinder Singh Rajput <[email protected]> wrote:
>
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> >
> > Performance counter stats for 'ls -lR /usr/include/':
> >
> > 7335 add ( 2.00x scaled)
> > 8012 multiply ( 1.99x scaled)
> > 5229 fpu-store ( 2.00x scaled)
> > 793097355 fpu-empty ( 2.00x scaled)
> > 182 fpu-busy ( 2.00x scaled)
> > 6 x87 ( 2.01x scaled)
> > 4 mmx-3dnow ( 2.00x scaled)
> > 8933 sse-sse2 ( 2.00x scaled)
> >
> > 0.393548820 seconds time elapsed
> >
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> >
> > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> >
> > 19583739 add ( 2.01x scaled)
> > 20856051 multiply ( 2.01x scaled)
> > 18669503 fpu-store ( 2.00x scaled)
> > 25100224054 fpu-empty ( 1.99x scaled)
> > 12540131 fpu-busy ( 1.99x scaled)
> > 207228 x87 ( 1.99x scaled)
> > 1768418 mmx-3dnow ( 2.00x scaled)
> > 42286702 sse-sse2 ( 2.01x scaled)
> >
> > 302.698647617 seconds time elapsed
> >
> > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> >
> > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> >
> > 6572682335 add ( 2.00x scaled)
> > 11131555181 multiply ( 2.00x scaled)
> > 1317520699 fpu-store ( 2.00x scaled)
> > 9089415134 fpu-empty ( 1.99x scaled)
> > 2902772713 fpu-busy ( 2.00x scaled)
> > 26047 x87 ( 2.00x scaled)
> > 24850978532 mmx-3dnow ( 2.00x scaled)
> > 262276117 sse-sse2 ( 2.01x scaled)
> >
> > 96.169312358 seconds time elapsed
> >
> > Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> > ---
> > arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> > include/linux/perf_counter.h | 17 +++++++++++++++
> > kernel/perf_counter.c | 1 +
> > tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> > 4 files changed, 92 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > index b83474b..4417edf 100644
> > --- a/arch/x86/kernel/cpu/perf_counter.c
> > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> > },
> > };
> >
> > +/*
> > + * Generalized hw fpu event table
> > + */
> > +
> > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
>
> ok, this looks genuinely useful, but there are some gaps. Where's
> the divides?

I was also surprised that divide is not available on AMD. That's why I did
not include it. You are right, it should be there.

> Plus things like mmx-3dnow are AMD specific, sse-sse2
> is x86 specific. We definitely want this general table, but the
> events should be truly general.
>

mmx and sse are available on both Intel and AMD. That's why I added both
of them. Is that OK?

> Also, how would this look like on Intel, roughly?
>

Intel has almost all of them, plus divide.

As you know, I work from home and I do not have any Intel machine that
supports the PMU.

Can you tell me your machine model so that I can prepare the FPU events
list for your machine and you can verify it on your side?

Thanks,
--
JSR

2009-06-30 14:57:45

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD

On Tue, 2009-06-30 at 18:50 +0530, Jaswinder Singh Rajput wrote:
> On Tue, 2009-06-30 at 12:11 +0200, Ingo Molnar wrote:
> > * Jaswinder Singh Rajput <[email protected]> wrote:
> >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- ls -lR /usr/include/ > /dev/null
> > >
> > > Performance counter stats for 'ls -lR /usr/include/':
> > >
> > > 7335 add ( 2.00x scaled)
> > > 8012 multiply ( 1.99x scaled)
> > > 5229 fpu-store ( 2.00x scaled)
> > > 793097355 fpu-empty ( 2.00x scaled)
> > > 182 fpu-busy ( 2.00x scaled)
> > > 6 x87 ( 2.01x scaled)
> > > 4 mmx-3dnow ( 2.00x scaled)
> > > 8933 sse-sse2 ( 2.00x scaled)
> > >
> > > 0.393548820 seconds time elapsed
> > >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/rhythmbox ~jaswinder/Music/singhiskinng.mp3
> > >
> > > Performance counter stats for '/usr/bin/rhythmbox /home/jaswinder/Music/singhiskinng.mp3':
> > >
> > > 19583739 add ( 2.01x scaled)
> > > 20856051 multiply ( 2.01x scaled)
> > > 18669503 fpu-store ( 2.00x scaled)
> > > 25100224054 fpu-empty ( 1.99x scaled)
> > > 12540131 fpu-busy ( 1.99x scaled)
> > > 207228 x87 ( 1.99x scaled)
> > > 1768418 mmx-3dnow ( 2.00x scaled)
> > > 42286702 sse-sse2 ( 2.01x scaled)
> > >
> > > 302.698647617 seconds time elapsed
> > >
> > > $./perf stat -e add -e multiply -e fpu-store -e fpu-empty -e fpu-busy -e x87 -e mmx-3dnow -e sse-sse2 -- /usr/bin/vlc ~jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv
> > >
> > > Performance counter stats for '/usr/bin/vlc /home/jaswinder/Videos/Linus_Torvalds_interview_with_Charlie_Rose_Part_1.flv':
> > >
> > > 6572682335 add ( 2.00x scaled)
> > > 11131555181 multiply ( 2.00x scaled)
> > > 1317520699 fpu-store ( 2.00x scaled)
> > > 9089415134 fpu-empty ( 1.99x scaled)
> > > 2902772713 fpu-busy ( 2.00x scaled)
> > > 26047 x87 ( 2.00x scaled)
> > > 24850978532 mmx-3dnow ( 2.00x scaled)
> > > 262276117 sse-sse2 ( 2.01x scaled)
> > >
> > > 96.169312358 seconds time elapsed
> > >
> > > Signed-off-by: Jaswinder Singh Rajput <[email protected]>
> > > ---
> > > arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++
> > > include/linux/perf_counter.h | 17 +++++++++++++++
> > > kernel/perf_counter.c | 1 +
> > > tools/perf/util/parse-events.c | 40 ++++++++++++++++++++++++++++++++++++
> > > 4 files changed, 92 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
> > > index b83474b..4417edf 100644
> > > --- a/arch/x86/kernel/cpu/perf_counter.c
> > > +++ b/arch/x86/kernel/cpu/perf_counter.c
> > > @@ -372,6 +372,12 @@ static const u64 atom_hw_cache_event_ids
> > > },
> > > };
> > >
> > > +/*
> > > + * Generalized hw fpu event table
> > > + */
> > > +
> > > +static u64 __read_mostly hw_fpu_event_ids[PERF_COUNT_HW_FPU_MAX];
> >
> > ok, this looks genuinely useful, but there are some gaps. Where's
> > the divides?
>
> I was also surprised divide is not available for AMD. Thats why I did
> not included it. You are right it should be there.
>

On AMD, the FPU operations include add, multiply and store.
Can I use store as divide for AMD? The samples I showed above look like
they are divides.

> > Plus things like mmx-3dnow are AMD specific, sse-sse2
> > is x86 specific. We definitely want this general table, but the
> > events should be truly general.
> >
>
> mmx and sse are available for both Intel and AMD. Thats why I added both
> of them. Is it OK.
>

Does this look OK:

enum perf_hw_fpu_id {
PERF_COUNT_HW_FPU_ADD = 0,
PERF_COUNT_HW_FPU_MULTIPLY = 1,
PERF_COUNT_HW_FPU_DIVIDE = 2,
PERF_COUNT_HW_FPU_EMPTY = 3,
PERF_COUNT_HW_FPU_STALL = 4,
PERF_COUNT_HW_FPU_X87 = 5,
PERF_COUNT_HW_FPU_MMX = 6,
PERF_COUNT_HW_FPU_SSE = 7,

PERF_COUNT_HW_FPU_MAX, /* non-ABI */


> > Also, how would this look like on Intel, roughly?
> >
>
> Intel have almost all of them + divide.
>
> As you know I work from home and I do not have any Intel machine which
> supports PMU.
>
> Can you suggest your machine name so that I can prepare the FPU events
> list for your machine and you can verify it on your side.
>

For Nehalem it will look like this:

static const u64 nehalem_hw_fpu_event_ids[] =
{
[PERF_COUNT_HW_FPU_ADD] = 0x01B1, /* UOPS_EXECUTED.PORT0 */
[PERF_COUNT_HW_FPU_MULTIPLY] = 0x0214, /* ARITH.MUL */
[PERF_COUNT_HW_FPU_DIVIDE] = 0x0114, /* ARITH.CYCLES_DIV_BUSY */
[PERF_COUNT_HW_FPU_EMPTY] = 0x0,
[PERF_COUNT_HW_FPU_STALL] = 0x60A2, /* RESOURCE_STALLS.FPCW|MXCSR*/
[PERF_COUNT_HW_FPU_X87] = 0x0110, /* FP_COMP_OPS_EXE.X87 */
[PERF_COUNT_HW_FPU_MMX] = 0x0210, /* FP_COMP_OPS_EXE.MMX */
[PERF_COUNT_HW_FPU_SSE] = 0x0410, /* FP_COMP_OPS_EXE.SSE_FP */
};

Do these look OK to you? Can I resend the patch based on these?

Thanks,
--
JSR

2009-06-30 22:42:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD


* Jaswinder Singh Rajput <[email protected]> wrote:

> > Plus things like mmx-3dnow are AMD specific, sse-sse2 is x86
> > specific. We definitely want this general table, but the events
> > should be truly general.
>
> mmx and sse are available for both Intel and AMD. Thats why I
> added both of them. Is it OK.

'3dnow' is an AMD marketing term. (and is long obsolete)

Nor did you answer (or understand) my sentence above: 'sse' is an
x86 specific term.

I think a naming and enumeration scheme around the general concept
of 'vectored co-processor' would be far less x86 specific.

Mockup:

19583739 vec-adds ( 2.01x scaled)
20856051 vec-muls ( 2.01x scaled)
20856051 vec-divs ( 2.01x scaled)
25100224054 vec-idle-cycles ( 1.99x scaled)
12540131 vec-busy-cycles ( 1.99x scaled)
42286702 vec-ops ( 2.01x scaled)

Paulus: would this categorization fit PowerPC too?

Ingo

2009-06-30 23:19:57

by Alan

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD

> '3dnow' is an AMD marketing term. (and is long obsolete)

And an instruction set extension as well, so it is different from SSE.

(Also be careful with MMX, as there is MMX and MMX-EXT (Cyrix/AMD).)

2009-07-01 12:33:17

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD

Ingo Molnar writes:

> 19583739 vec-adds ( 2.01x scaled)
> 20856051 vec-muls ( 2.01x scaled)
> 20856051 vec-divs ( 2.01x scaled)
> 25100224054 vec-idle-cycles ( 1.99x scaled)
> 12540131 vec-busy-cycles ( 1.99x scaled)
> 42286702 vec-ops ( 2.01x scaled)
>
> Paulus: would this categorization fit PowerPC too?

Conceptually that looks nice, but unfortunately we don't have events
that correspond to that categorization on any PowerPC with vector
hardware (VMX/Altivec). POWER6 seems to have the most vector events,
and they are mostly divided up along the lines of simple / complex /
permute / load / store operations, and whether they are integer or
floating-point operations.

Here are the vector-related events we have on POWER6:

MRK_VMX0_LD_WRBACK Marked VMX0 load writeback valid
MRK_VMX1_LD_WRBACK Marked VMX1 load writeback valid
MRK_VMX_COMPLEX_ISSUED Marked VMX instruction issued to complex
MRK_VMX_FLOAT_ISSUED Marked VMX instruction issued to float
MRK_VMX_PERMUTE_ISSUED Marked VMX instruction issued to permute
MRK_VMX_SIMPLE_ISSUED Marked VMX instruction issued to simple
MRK_VMX_ST_ISSUED Marked VMX store issued
VMX0_INST_ISSUED VMX0 instruction issued
VMX0_LD_ISSUED VMX0 load issued
VMX0_LD_WRBACK VMX0 load writeback valid
VMX0_STALL VMX0 stall
VMX1_INST_ISSUED VMX1 instruction issued
VMX1_LD_ISSUED VMX1 load issued
VMX1_LD_WRBACK VMX1 load writeback valid
VMX1_STALL VMX1 stall
VMX_COMPLEX_ISSUED VMX instruction issued to complex
VMX_FLOAT_ISSUED VMX instruction issued to float
VMX_FLOAT_MULTICYCLE VMX multi-cycle floating point instruction issued
VMX_PERMUTE_ISSUED VMX instruction issued to permute
VMX_RESULT_SAT_0_1 VMX valid result with sat bit is set (0->1)
VMX_RESULT_SAT_1 VMX valid result with sat=1
VMX_SIMPLE_ISSUED VMX instruction issued to simple
VMX_ST_ISSUED VMX store issued

I'm not sure what the exact distinction is between VMX0 and VMX1.
I'll find out.

The MPC7450 (G4, 32-bit) cpu also has quite a few VMX/Altivec events,
such as counts of cycles that individual vector units are waiting for
operands, but not counts of how many vector add or vector multiply
operations are done.

Paul.

2009-07-01 13:12:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD


* Paul Mackerras <[email protected]> wrote:

> Ingo Molnar writes:
>
> > 19583739 vec-adds ( 2.01x scaled)
> > 20856051 vec-muls ( 2.01x scaled)
> > 20856051 vec-divs ( 2.01x scaled)
> > 25100224054 vec-idle-cycles ( 1.99x scaled)
> > 12540131 vec-busy-cycles ( 1.99x scaled)
> > 42286702 vec-ops ( 2.01x scaled)
> >
> > Paulus: would this categorization fit PowerPC too?
>
> Conceptually that looks nice, but unfortunately we don't have
> events that correspond to that categorization on any PowerPC with
> vector hardware (VMX/Altivec). POWER6 seems to have the most
> vector events, and they are mostly divided up along the lines of
> simple / complex / permute / load / store operations, and whether
> they are integer or floating-point operations.

Here's what we have on x86:

20177177044 vec-adds (scaled from 66.63%)
34101687027 vec-muls (scaled from 66.64%)
3984060862 vec-divs (scaled from 66.71%)
26349684710 vec-idle-cycles (scaled from 66.65%)
9052001905 vec-stall-cycles (scaled from 66.66%)
76440734242 vec-ops (scaled from 66.71%)

Could at least the idle/busy/stall/total generic stats be filled in
on powerpc, with a reasonable enough approximation? Those
utilization metrics are the most important ones when one tries to
figure out how well utilized the vector units are.

Ingo

2009-07-01 13:26:31

by Jaswinder Singh Rajput

[permalink] [raw]
Subject: Re: [PATCH -tip] perf_counter: Add Generalized Hardware FPU support for AMD

On Wed, 2009-07-01 at 22:33 +1000, Paul Mackerras wrote:
> Ingo Molnar writes:
>
> > 19583739 vec-adds ( 2.01x scaled)
> > 20856051 vec-muls ( 2.01x scaled)
> > 20856051 vec-divs ( 2.01x scaled)
> > 25100224054 vec-idle-cycles ( 1.99x scaled)
> > 12540131 vec-busy-cycles ( 1.99x scaled)
> > 42286702 vec-ops ( 2.01x scaled)
> >
> > Paulus: would this categorization fit PowerPC too?
>
> Conceptually that looks nice, but unfortunately we don't have events
> that correspond to that categorization on any PowerPC with vector
> hardware (VMX/Altivec). POWER6 seems to have the most vector events,
> and they are mostly divided up along the lines of simple / complex /
> permute / load / store operations, and whether they are integer or
> floating-point operations.
>
> Here are the vector-related events we have on POWER6:
>
> MRK_VMX0_LD_WRBACK Marked VMX0 load writeback valid
> MRK_VMX1_LD_WRBACK Marked VMX1 load writeback valid
> MRK_VMX_COMPLEX_ISSUED Marked VMX instruction issued to complex
> MRK_VMX_FLOAT_ISSUED Marked VMX instruction issued to float
> MRK_VMX_PERMUTE_ISSUED Marked VMX instruction issued to permute
> MRK_VMX_SIMPLE_ISSUED Marked VMX instruction issued to simple
> MRK_VMX_ST_ISSUED Marked VMX store issued
> VMX0_INST_ISSUED VMX0 instruction issued
> VMX0_LD_ISSUED VMX0 load issued
> VMX0_LD_WRBACK VMX0 load writeback valid
> VMX0_STALL VMX0 stall
> VMX1_INST_ISSUED VMX1 instruction issued
> VMX1_LD_ISSUED VMX1 load issued
> VMX1_LD_WRBACK VMX1 load writeback valid
> VMX1_STALL VMX1 stall
> VMX_COMPLEX_ISSUED VMX instruction issued to complex
> VMX_FLOAT_ISSUED VMX instruction issued to float
> VMX_FLOAT_MULTICYCLE VMX multi-cycle floating point instruction issued
> VMX_PERMUTE_ISSUED VMX instruction issued to permute
> VMX_RESULT_SAT_0_1 VMX valid result with sat bit is set (0->1)
> VMX_RESULT_SAT_1 VMX valid result with sat=1
> VMX_SIMPLE_ISSUED VMX instruction issued to simple
> VMX_ST_ISSUED VMX store issued
>
> I'm not sure what the exact distinction is between VMX0 and VMX1.
> I'll find out.
>

I am just guessing for PowerPC: normally, different units serve
different purposes, e.g. some do addition/multiplication and others do
division.

For example, on Intel Core i7/Nehalem:

UOPS_EXECUTED.PORT0: Counts number of Uops executed
that were issued on port 0. Port 0
handles integer arithmetic, SIMD
and FP add Uops.
UOPS_EXECUTED.PORT1: Counts number of Uops executed
that were issued on port 1. Port 1
handles integer arithmetic, SIMD,
integer shift, FP multiply and FP
divide Uops.

Can you provide me a link to the hardware manual so that I can check
it out?

Thanks,
--
JSR