2022-03-19 00:17:32

by Ali Saidi

[permalink] [raw]
Subject: [PATCH v3 1/3] perf arm-spe: Use SPE data source for neoverse cores

When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
the same encoding. I can't find encoding information for any other SPE
implementations to unify their choices with Arm's, so that is left for
future work.

This change populates the mem_lvl_num for Neoverse cores instead of the
deprecated mem_lvl namespace.

Signed-off-by: Ali Saidi <[email protected]>
---
.../util/arm-spe-decoder/arm-spe-decoder.c | 1 +
.../util/arm-spe-decoder/arm-spe-decoder.h | 12 ++
tools/perf/util/arm-spe.c | 109 +++++++++++++++---
3 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 5e390a1a79ab..091987dd3966 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)

break;
case ARM_SPE_DATA_SOURCE:
+ decoder->record.source = payload;
break;
case ARM_SPE_BAD:
break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 69b31084d6be..c81bf90c0996 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -29,6 +29,17 @@ enum arm_spe_op_type {
ARM_SPE_ST = 1 << 1,
};

+enum arm_spe_neoverse_data_source {
+ ARM_SPE_NV_L1D = 0x0,
+ ARM_SPE_NV_L2 = 0x8,
+ ARM_SPE_NV_PEER_CORE = 0x9,
+ ARM_SPE_NV_LCL_CLSTR = 0xa,
+ ARM_SPE_NV_SYS_CACHE = 0xb,
+ ARM_SPE_NV_PEER_CLSTR = 0xc,
+ ARM_SPE_NV_REMOTE = 0xd,
+ ARM_SPE_NV_DRAM = 0xe,
+};
+
struct arm_spe_record {
enum arm_spe_sample_type type;
int err;
@@ -40,6 +51,7 @@ struct arm_spe_record {
u64 virt_addr;
u64 phys_addr;
u64 context_id;
+ u16 source;
};

struct arm_spe_insn;
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d2b64e3f588b..a45d638d2f06 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"

+#include <../../../arch/arm64/include/asm/cputype.h>
#define MAX_TIMESTAMP (~0ULL)

struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
struct perf_session *session;
struct machine *machine;
u32 pmu_type;
+ u64 midr;

struct perf_tsc_conversion tc;

@@ -399,33 +401,109 @@ static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
return false;
}

-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+ {},
+};
+
+
+static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
{
- union perf_mem_data_src data_src = { 0 };
+ /*
+ * Even though four levels of cache hierarchy are possible, no known
+ * production Neoverse systems currently include more than three levels
+ * so for the time being we assume three exist. If a production system
+ * is built with four then this function would have to be changed to
+ * detect the number of levels for reporting.
+ */

- if (record->op == ARM_SPE_LD)
- data_src.mem_op = PERF_MEM_OP_LOAD;
- else
- data_src.mem_op = PERF_MEM_OP_STORE;
+ switch (record->source) {
+ case ARM_SPE_NV_L1D:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+ break;
+ case ARM_SPE_NV_L2:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ break;
+ case ARM_SPE_NV_PEER_CORE:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+ break;
+ /*
+ * We don't know if this is L1 or L2, but we do know it was a cache-2-cache
+ * transfer, so set SNOOP_HITM
+ */
+ case ARM_SPE_NV_LCL_CLSTR:
+ case ARM_SPE_NV_PEER_CLSTR:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+ break;
+ /*
+ * System cache is assumed to be L3
+ */
+ case ARM_SPE_NV_SYS_CACHE:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ break;
+ /*
+ * We don't know what level it hit in, except it came from the other
+ * socket
+ */
+ case ARM_SPE_NV_REMOTE:
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ break;
+ case ARM_SPE_NV_DRAM:
+ data_src->mem_lvl = PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+ break;
+ default:
+ break;
+ }
+}

+static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
+{
if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L3;
+ data_src->mem_lvl = PERF_MEM_LVL_L3;

if (record->type & ARM_SPE_LLC_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ data_src->mem_lvl |= PERF_MEM_LVL_MISS;
else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ data_src->mem_lvl |= PERF_MEM_LVL_HIT;
} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
- data_src.mem_lvl = PERF_MEM_LVL_L1;
+ data_src->mem_lvl = PERF_MEM_LVL_L1;

if (record->type & ARM_SPE_L1D_MISS)
- data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+ data_src->mem_lvl |= PERF_MEM_LVL_MISS;
else
- data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+ data_src->mem_lvl |= PERF_MEM_LVL_HIT;
}

if (record->type & ARM_SPE_REMOTE_ACCESS)
- data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+ data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+}
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+{
+ union perf_mem_data_src data_src = { 0 };
+ bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+
+ if (record->op & ARM_SPE_LD)
+ data_src.mem_op = PERF_MEM_OP_LOAD;
+ else
+ data_src.mem_op = PERF_MEM_OP_STORE;
+
+ if (is_neoverse)
+ arm_spe__synth_data_source_neoverse(record, &data_src);
+ else
+ arm_spe__synth_data_source_generic(record, &data_src);

if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -446,7 +524,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
u64 data_src;
int err;

- data_src = arm_spe__synth_data_source(record);
+ data_src = arm_spe__synth_data_source(record, spe->midr);

if (spe->sample_flc) {
if (record->type & ARM_SPE_L1D_MISS) {
@@ -1183,6 +1261,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
struct perf_record_time_conv *tc = &session->time_conv;
+ const char *cpuid = perf_env__cpuid(session->evlist->env);
+ u64 midr = strtol(cpuid, NULL, 16);
struct arm_spe *spe;
int err;

@@ -1202,6 +1282,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
spe->machine = &session->machines.host; /* No kvm support */
spe->auxtrace_type = auxtrace_info->type;
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+ spe->midr = midr;

spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);

--
2.32.0


2022-03-25 00:53:18

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] perf arm-spe: Use SPE data source for neoverse cores

Em Fri, Mar 18, 2022 at 07:59:11PM +0000, Ali Saidi escreveu:
> When synthesizing data from SPE, augment the type with source information
> for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
> the same encoding. I can't find encoding information for any other SPE
> implementations to unify their choices with Arm's, so that is left for
> future work.
>
> This change populates the mem_lvl_num for Neoverse cores instead of the
> deprecated mem_lvl namespace.
>
> Signed-off-by: Ali Saidi <[email protected]>
> ---
> .../util/arm-spe-decoder/arm-spe-decoder.c | 1 +
> .../util/arm-spe-decoder/arm-spe-decoder.h | 12 ++
> tools/perf/util/arm-spe.c | 109 +++++++++++++++---
> 3 files changed, 108 insertions(+), 14 deletions(-)
>
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> index 5e390a1a79ab..091987dd3966 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> @@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
>
> break;
> case ARM_SPE_DATA_SOURCE:
> + decoder->record.source = payload;
> break;
> case ARM_SPE_BAD:
> break;
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> index 69b31084d6be..c81bf90c0996 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> @@ -29,6 +29,17 @@ enum arm_spe_op_type {
> ARM_SPE_ST = 1 << 1,
> };
>
> +enum arm_spe_neoverse_data_source {
> + ARM_SPE_NV_L1D = 0x0,
> + ARM_SPE_NV_L2 = 0x8,
> + ARM_SPE_NV_PEER_CORE = 0x9,
> + ARM_SPE_NV_LCL_CLSTR = 0xa,
> + ARM_SPE_NV_SYS_CACHE = 0xb,
> + ARM_SPE_NV_PEER_CLSTR = 0xc,
> + ARM_SPE_NV_REMOTE = 0xd,
> + ARM_SPE_NV_DRAM = 0xe,
> +};
> +
> struct arm_spe_record {
> enum arm_spe_sample_type type;
> int err;
> @@ -40,6 +51,7 @@ struct arm_spe_record {
> u64 virt_addr;
> u64 phys_addr;
> u64 context_id;
> + u16 source;
> };
>
> struct arm_spe_insn;
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index d2b64e3f588b..a45d638d2f06 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -34,6 +34,7 @@
> #include "arm-spe-decoder/arm-spe-decoder.h"
> #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
>
> +#include <../../../arch/arm64/include/asm/cputype.h>

This isn't working for me:

⬢[acme@toolbox perf]$ make BUILD_BPF_SKEL=1 CORESIGHT=1 PYTHON=python3 O=/tmp/build/perf -C tools/perf install-bin
<SNIP>
CC /tmp/build/perf/util/parse-events-flex.o
CC /tmp/build/perf/util/expr-flex.o
CC /tmp/build/perf/util/expr.o
LD /tmp/build/perf/util/intel-pt-decoder/perf-in.o
In file included from util/arm-spe.c:37:
/var/home/acme/git/perf/tools/include/uapi/../../../arch/arm64/include/asm/cputype.h:173:10: fatal error: asm/sysreg.h: No such file or directory
173 | #include <asm/sysreg.h>
| ^~~~~~~~~~~~~~
compilation terminated.
make[4]: *** [/var/home/acme/git/perf/tools/build/Makefile.build:96: /tmp/build/perf/util/arm-spe.o] Error 1
make[4]: *** Waiting for unfinished jobs....
make[3]: *** [/var/home/acme/git/perf/tools/build/Makefile.build:139: util] Error 2
make[2]: *** [Makefile.perf:665: /tmp/build/perf/perf-in.o] Error 2
make[1]: *** [Makefile.perf:240: sub-make] Error 2
make: *** [Makefile:113: install-bin] Error 2
make: Leaving directory '/var/home/acme/git/perf/tools/perf'


Can you please take a look?

What I have is in my tmp.perf/core branch, pending further tests to move
to perf/core at git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git

I see:

You're getting out of tools/

⬢[acme@toolbox perf]$ realpath /var/home/acme/git/perf/tools/include/uapi/../../../arch/arm64/include/asm/cputype.h
/var/home/acme/git/perf/arch/arm64/include/asm/cputype.h
⬢[acme@toolbox perf]$

You can't include things from outside tools/ into tools code. See this:

-rw-r--r--. 1 acme acme 44506 Mar 14 17:55 tools/arch/arm64/include/asm/sysreg.h
⬢[acme@toolbox perf]$ git log tools/arch/arm64/include/asm/sysreg.h
commit 272a067df3c89f6f2176a350f88661625a2c8b3b
Author: Raghavendra Rao Ananta <[email protected]>
Date: Thu Oct 7 23:34:26 2021 +0000

tools: arm64: Import sysreg.h

Bring-in the kernel's arch/arm64/include/asm/sysreg.h
into tools/ for arm64 to make use of all the standard
register definitions in consistence with the kernel.

Make use of the register read/write definitions from
sysreg.h, instead of the existing definitions. A syntax
correction is needed for the files that use write_sysreg()
to make it compliant with the new (kernel's) syntax.

Reviewed-by: Andrew Jones <[email protected]>
Reviewed-by: Oliver Upton <[email protected]>
Signed-off-by: Raghavendra Rao Ananta <[email protected]>
[maz: squashed two commits in order to keep the series bisectable]
Signed-off-by: Marc Zyngier <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
⬢[acme@toolbox perf]$

So we need to bring asm/cputype.h into tools/ as well.

Please try adding that import as a prep patch before these three, then
test it and resubmit.

Thanks,

- Arnaldo

> #define MAX_TIMESTAMP (~0ULL)
>
> struct arm_spe {
> @@ -45,6 +46,7 @@ struct arm_spe {
> struct perf_session *session;
> struct machine *machine;
> u32 pmu_type;
> + u64 midr;
>
> struct perf_tsc_conversion tc;
>
> @@ -399,33 +401,109 @@ static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
> return false;
> }
>
> -static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
> +static const struct midr_range neoverse_spe[] = {
> + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
> + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
> + {},
> +};
> +
> +
> +static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
> + union perf_mem_data_src *data_src)
> {
> - union perf_mem_data_src data_src = { 0 };
> + /*
> + * Even though four levels of cache hierarchy are possible, no known
> + * production Neoverse systems currently include more than three levels
> + * so for the time being we assume three exist. If a production system
> + * is built with four then this function would have to be changed to
> + * detect the number of levels for reporting.
> + */
>
> - if (record->op == ARM_SPE_LD)
> - data_src.mem_op = PERF_MEM_OP_LOAD;
> - else
> - data_src.mem_op = PERF_MEM_OP_STORE;
> + switch (record->source) {
> + case ARM_SPE_NV_L1D:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
> + break;
> + case ARM_SPE_NV_L2:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
> + break;
> + case ARM_SPE_NV_PEER_CORE:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> + break;
> + /*
> + * We don't know if this is L1 or L2, but we do know it was a cache-2-cache
> + * transfer, so set SNOOP_HITM
> + */
> + case ARM_SPE_NV_LCL_CLSTR:
> + case ARM_SPE_NV_PEER_CLSTR:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
> + break;
> + /*
> + * System cache is assumed to be L3
> + */
> + case ARM_SPE_NV_SYS_CACHE:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
> + break;
> + /*
> + * We don't know what level it hit in, except it came from the other
> + * socket
> + */
> + case ARM_SPE_NV_REMOTE:
> + data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
> + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> + break;
> + case ARM_SPE_NV_DRAM:
> + data_src->mem_lvl = PERF_MEM_LVL_HIT;
> + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
> + break;
> + default:
> + break;
> + }
> +}
>
> +static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
> + union perf_mem_data_src *data_src)
> +{
> if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> - data_src.mem_lvl = PERF_MEM_LVL_L3;
> + data_src->mem_lvl = PERF_MEM_LVL_L3;
>
> if (record->type & ARM_SPE_LLC_MISS)
> - data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> + data_src->mem_lvl |= PERF_MEM_LVL_MISS;
> else
> - data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> + data_src->mem_lvl |= PERF_MEM_LVL_HIT;
> } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
> - data_src.mem_lvl = PERF_MEM_LVL_L1;
> + data_src->mem_lvl = PERF_MEM_LVL_L1;
>
> if (record->type & ARM_SPE_L1D_MISS)
> - data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> + data_src->mem_lvl |= PERF_MEM_LVL_MISS;
> else
> - data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> + data_src->mem_lvl |= PERF_MEM_LVL_HIT;
> }
>
> if (record->type & ARM_SPE_REMOTE_ACCESS)
> - data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> + data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
> +}
> +
> +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
> +{
> + union perf_mem_data_src data_src = { 0 };
> + bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
> +
> + if (record->op & ARM_SPE_LD)
> + data_src.mem_op = PERF_MEM_OP_LOAD;
> + else
> + data_src.mem_op = PERF_MEM_OP_STORE;
> +
> + if (is_neoverse)
> + arm_spe__synth_data_source_neoverse(record, &data_src);
> + else
> + arm_spe__synth_data_source_generic(record, &data_src);
>
> if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
> data_src.mem_dtlb = PERF_MEM_TLB_WK;
> @@ -446,7 +524,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
> u64 data_src;
> int err;
>
> - data_src = arm_spe__synth_data_source(record);
> + data_src = arm_spe__synth_data_source(record, spe->midr);
>
> if (spe->sample_flc) {
> if (record->type & ARM_SPE_L1D_MISS) {
> @@ -1183,6 +1261,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
> struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
> size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
> struct perf_record_time_conv *tc = &session->time_conv;
> + const char *cpuid = perf_env__cpuid(session->evlist->env);
> + u64 midr = strtol(cpuid, NULL, 16);
> struct arm_spe *spe;
> int err;
>
> @@ -1202,6 +1282,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
> spe->machine = &session->machines.host; /* No kvm support */
> spe->auxtrace_type = auxtrace_info->type;
> spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
> + spe->midr = midr;
>
> spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
>
> --
> 2.32.0

--

- Arnaldo