From: German Gomez <[email protected]>
Add 'simd' sort field to visualize SIMD ops in perf-report.
Rows are labeled with the SIMD ISA and the type of predicate (if any):
- [p] partial predicate
- [e] empty predicate (no elements of the vector are used)
Example with Arm SPE and SVE (Scalable Vector Extension):
#include <arm_sve.h>
double src[1025], dst[1025];
int main(void) {
svfloat64_t vc = svdup_f64(1);
for(;;)
for(int i = 0; i < 1025; i += svcntd())
{
svbool_t pg = svwhilelt_b64(i, 1025);
svfloat64_t vsrc = svld1(pg, &src[i]);
svfloat64_t vdst = svadd_x(pg, vsrc, vc);
svst1(pg, &dst[i], vdst);
}
return 0;
}
... compiled using "gcc-11 -march=armv8-a+sve -O3"
Profiling on a platform that implements FEAT_SVE and FEAT_SPEv1p1:
$ perf record -e arm_spe_0// -- ./a.out
$ perf report --itrace=i1i -s overhead,pid,simd,sym
Overhead Pid:Command Simd Symbol
........ ................ ....... ......................
53.76% 10758:program [.] main
46.14% 10758:program [.] SVE [.] main
0.09% 10758:program [p] SVE [.] main
The report shows that 0.09% of the samples are SVE operations that use
partial predicates, because the lengths of the src and dst arrays (1025
elements) are not a multiple of the vector register length.
Signed-off-by: German Gomez <[email protected]>
Signed-off-by: James Clark <[email protected]>
---
tools/perf/Documentation/perf-report.txt | 1 +
tools/perf/util/hist.c | 1 +
tools/perf/util/hist.h | 1 +
tools/perf/util/sort.c | 47 ++++++++++++++++++++++++
tools/perf/util/sort.h | 2 +
5 files changed, 52 insertions(+)
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index c242e8da6b1a..cfd502f7e6da 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -117,6 +117,7 @@ OPTIONS
- addr: (Full) virtual address of the sampled instruction
- retire_lat: On X86, this reports pipeline stall of this instruction compared
to the previous instruction in cycles. And currently supported only on X86
+ - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate
By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 3670136a0074..0c11f50abfec 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -745,6 +745,7 @@ __hists__add_entry(struct hists *hists,
.weight = sample->weight,
.ins_lat = sample->ins_lat,
.p_stage_cyc = sample->p_stage_cyc,
+ .simd_flags = sample->simd_flags,
}, *he = hists__findnew_entry(hists, &entry, al, sample_self);
if (!hists->has_callchains && he && he->callchain_size != 0)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 86a677954279..afc9f1c7f4dc 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -81,6 +81,7 @@ enum hist_column {
HISTC_ADDR_FROM,
HISTC_ADDR_TO,
HISTC_ADDR,
+ HISTC_SIMD,
HISTC_NR_COLS, /* Last entry */
};
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 093a0c8b2e3d..e11e68ecf0a2 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -139,6 +139,52 @@ struct sort_entry sort_thread = {
.se_width_idx = HISTC_THREAD,
};
+/* --sort simd */
+
+static int64_t
+sort__simd_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ if (left->simd_flags.arch != right->simd_flags.arch)
+ return (int64_t) left->simd_flags.arch - right->simd_flags.arch;
+
+ return (int64_t) left->simd_flags.pred - right->simd_flags.pred;
+}
+
+static const char *hist_entry__get_simd_name(struct simd_flags *simd_flags)
+{
+ u64 arch = simd_flags->arch;
+
+ if (arch & SIMD_OP_FLAGS_ARCH_SVE)
+ return "SVE";
+ else
+ return "n/a";
+}
+
+static int hist_entry__simd_snprintf(struct hist_entry *he, char *bf,
+ size_t size, unsigned int width __maybe_unused)
+{
+ const char *name;
+
+ if (!he->simd_flags.arch)
+ return repsep_snprintf(bf, size, "");
+
+ name = hist_entry__get_simd_name(&he->simd_flags);
+
+ if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_EMPTY)
+ return repsep_snprintf(bf, size, "[e] %s", name);
+ else if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_PARTIAL)
+ return repsep_snprintf(bf, size, "[p] %s", name);
+
+ return repsep_snprintf(bf, size, "[.] %s", name);
+}
+
+struct sort_entry sort_simd = {
+ .se_header = "Simd ",
+ .se_cmp = sort__simd_cmp,
+ .se_snprintf = hist_entry__simd_snprintf,
+ .se_width_idx = HISTC_SIMD,
+};
+
/* --sort comm */
/*
@@ -2142,6 +2188,7 @@ static struct sort_dimension common_sort_dimensions[] = {
DIM(SORT_ADDR, "addr", sort_addr),
DIM(SORT_LOCAL_RETIRE_LAT, "local_retire_lat", sort_local_p_stage_cyc),
DIM(SORT_GLOBAL_RETIRE_LAT, "retire_lat", sort_global_p_stage_cyc),
+ DIM(SORT_SIMD, "simd", sort_simd)
};
#undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 22f437c3476f..ecfb7f1359d5 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -111,6 +111,7 @@ struct hist_entry {
u64 p_stage_cyc;
u8 cpumode;
u8 depth;
+ struct simd_flags simd_flags;
/* We are added by hists__add_dummy_entry. */
bool dummy;
@@ -241,6 +242,7 @@ enum sort_type {
SORT_ADDR,
SORT_LOCAL_RETIRE_LAT,
SORT_GLOBAL_RETIRE_LAT,
+ SORT_SIMD,
/* branch stack specific sort keys */
__SORT_BRANCH_STACK,
--
2.34.1
On Mon, Mar 20, 2023 at 8:15 AM James Clark <[email protected]> wrote:
>
> From: German Gomez <[email protected]>
>
> Add 'simd' sort field to visualize SIMD ops in perf-report.
>
> Rows are labeled with the SIMD isa, and the type of predicate (if any):
>
> - [p] partial predicate
> - [e] empty predicate (no elements in the vector being used)
>
> Example with Arm SPE and SVE (Scalable Vector Extension):
>
> #include <arm_sve.h>
>
> double src[1025], dst[1025];
>
> int main(void) {
> svfloat64_t vc = svdup_f64(1);
> for(;;)
> for(int i = 0; i < 1025; i += svcntd())
> {
> svbool_t pg = svwhilelt_b64(i, 1025);
> svfloat64_t vsrc = svld1(pg, &src[i]);
> svfloat64_t vdst = svadd_x(pg, vsrc, vc);
> svst1(pg, &dst[i], vdst);
> }
> return 0;
> }
>
> ... compiled using "gcc-11 -march=armv8-a+sve -O3"
>
> Profiling on a platform that implements FEAT_SVE and FEAT_SPEv1p1:
>
> $ perf record -e arm_spe_0// -- ./a.out
> $ perf report --itrace=i1i -s overhead,pid,simd,sym
>
> Overhead Pid:Command Simd Symbol
> ........ ................ ....... ......................
>
> 53.76% 10758:program [.] main
> 46.14% 10758:program [.] SVE [.] main
> 0.09% 10758:program [p] SVE [.] main
>
> The report shows 0.09% of the sampled SVE operations use partial
> predicates due to src and dst arrays not being multiples of the vector
> register lengths.
>
> Signed-off-by: German Gomez <[email protected]>
> Signed-off-by: James Clark <[email protected]>
> ---
> tools/perf/Documentation/perf-report.txt | 1 +
> tools/perf/util/hist.c | 1 +
> tools/perf/util/hist.h | 1 +
> tools/perf/util/sort.c | 47 ++++++++++++++++++++++++
> tools/perf/util/sort.h | 2 +
> 5 files changed, 52 insertions(+)
>
> diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
> index c242e8da6b1a..cfd502f7e6da 100644
> --- a/tools/perf/Documentation/perf-report.txt
> +++ b/tools/perf/Documentation/perf-report.txt
> @@ -117,6 +117,7 @@ OPTIONS
> - addr: (Full) virtual address of the sampled instruction
> - retire_lat: On X86, this reports pipeline stall of this instruction compared
> to the previous instruction in cycles. And currently supported only on X86
> + - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate
nit: the line wrap looks off here.
Thanks,
Ian
>
> By default, comm, dso and symbol keys are used.
> (i.e. --sort comm,dso,symbol)
> diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
> index 3670136a0074..0c11f50abfec 100644
> --- a/tools/perf/util/hist.c
> +++ b/tools/perf/util/hist.c
> @@ -745,6 +745,7 @@ __hists__add_entry(struct hists *hists,
> .weight = sample->weight,
> .ins_lat = sample->ins_lat,
> .p_stage_cyc = sample->p_stage_cyc,
> + .simd_flags = sample->simd_flags,
> }, *he = hists__findnew_entry(hists, &entry, al, sample_self);
>
> if (!hists->has_callchains && he && he->callchain_size != 0)
> diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
> index 86a677954279..afc9f1c7f4dc 100644
> --- a/tools/perf/util/hist.h
> +++ b/tools/perf/util/hist.h
> @@ -81,6 +81,7 @@ enum hist_column {
> HISTC_ADDR_FROM,
> HISTC_ADDR_TO,
> HISTC_ADDR,
> + HISTC_SIMD,
> HISTC_NR_COLS, /* Last entry */
> };
>
> diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
> index 093a0c8b2e3d..e11e68ecf0a2 100644
> --- a/tools/perf/util/sort.c
> +++ b/tools/perf/util/sort.c
> @@ -139,6 +139,52 @@ struct sort_entry sort_thread = {
> .se_width_idx = HISTC_THREAD,
> };
>
> +/* --sort simd */
> +
> +static int64_t
> +sort__simd_cmp(struct hist_entry *left, struct hist_entry *right)
> +{
> + if (left->simd_flags.arch != right->simd_flags.arch)
> + return (int64_t) left->simd_flags.arch - right->simd_flags.arch;
> +
> + return (int64_t) left->simd_flags.pred - right->simd_flags.pred;
> +}
> +
> +static const char *hist_entry__get_simd_name(struct simd_flags *simd_flags)
> +{
> + u64 arch = simd_flags->arch;
> +
> + if (arch & SIMD_OP_FLAGS_ARCH_SVE)
> + return "SVE";
> + else
> + return "n/a";
> +}
> +
> +static int hist_entry__simd_snprintf(struct hist_entry *he, char *bf,
> + size_t size, unsigned int width __maybe_unused)
> +{
> + const char *name;
> +
> + if (!he->simd_flags.arch)
> + return repsep_snprintf(bf, size, "");
> +
> + name = hist_entry__get_simd_name(&he->simd_flags);
> +
> + if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_EMPTY)
> + return repsep_snprintf(bf, size, "[e] %s", name);
> + else if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_PARTIAL)
> + return repsep_snprintf(bf, size, "[p] %s", name);
> +
> + return repsep_snprintf(bf, size, "[.] %s", name);
> +}
> +
> +struct sort_entry sort_simd = {
> + .se_header = "Simd ",
> + .se_cmp = sort__simd_cmp,
> + .se_snprintf = hist_entry__simd_snprintf,
> + .se_width_idx = HISTC_SIMD,
> +};
> +
> /* --sort comm */
>
> /*
> @@ -2142,6 +2188,7 @@ static struct sort_dimension common_sort_dimensions[] = {
> DIM(SORT_ADDR, "addr", sort_addr),
> DIM(SORT_LOCAL_RETIRE_LAT, "local_retire_lat", sort_local_p_stage_cyc),
> DIM(SORT_GLOBAL_RETIRE_LAT, "retire_lat", sort_global_p_stage_cyc),
> + DIM(SORT_SIMD, "simd", sort_simd)
> };
>
> #undef DIM
> diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
> index 22f437c3476f..ecfb7f1359d5 100644
> --- a/tools/perf/util/sort.h
> +++ b/tools/perf/util/sort.h
> @@ -111,6 +111,7 @@ struct hist_entry {
> u64 p_stage_cyc;
> u8 cpumode;
> u8 depth;
> + struct simd_flags simd_flags;
>
> /* We are added by hists__add_dummy_entry. */
> bool dummy;
> @@ -241,6 +242,7 @@ enum sort_type {
> SORT_ADDR,
> SORT_LOCAL_RETIRE_LAT,
> SORT_GLOBAL_RETIRE_LAT,
> + SORT_SIMD,
>
> /* branch stack specific sort keys */
> __SORT_BRANCH_STACK,
> --
> 2.34.1
>