When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores so we can detect situations like cache line
contention and transfers on Arm platforms.
This change enables future changes to c2c on a system with SPE where lines that
are shared among multiple cores show up in perf c2c output.
Changes in v9:
* Change reporting of remote socket data which should make Leo's upcoming
patch set for c2c make sense on multi-socket platforms
Changes in v8:
* Report NA for both mem_lvl and mem_lvl_num for stores where we have no
information
Changes in v7:
* Minor change requested by Leo Yan
Changes in v6:
* Drop changes to c2c command which will come from Leo Yan
Changes in v5:
* Add a new snooping type to disambiguate cache-to-cache transfers where
we don't know if the data is clean or dirty.
* Set snoop flags on all the data-source cases
* Special case stores as we have no information on them
Changes in v4:
* Bring-in the kernel's arch/arm64/include/asm/cputype.h into tools/
* Add neoverse-v1 to the neoverse cores list
Ali Saidi (4):
tools: arm64: Import cputype.h
perf arm-spe: Use SPE data source for neoverse cores
perf mem: Support mem_lvl_num in c2c command
perf mem: Support HITM for when mem_lvl_num is any
tools/arch/arm64/include/asm/cputype.h | 258 ++++++++++++++++++
.../util/arm-spe-decoder/arm-spe-decoder.c | 1 +
.../util/arm-spe-decoder/arm-spe-decoder.h | 12 +
tools/perf/util/arm-spe.c | 110 +++++++-
tools/perf/util/mem-events.c | 20 +-
5 files changed, 383 insertions(+), 18 deletions(-)
create mode 100644 tools/arch/arm64/include/asm/cputype.h
--
2.32.0
From: Leo Yan <[email protected]>
Besides memory load and store operations, Arm SPE records can also
support other operation types, but when setting the data source field the
current code assumes a record is either a load operation or a store
operation; this leads to wrongly synthesized memory samples.
This patch strictly checks the record operation type: it sets the data
source only for the operation types ARM_SPE_LD and ARM_SPE_ST, and
otherwise returns zero for the data source. Therefore, we can synthesize
memory samples only when the data source is a non-zero value; the function
arm_spe__is_memory_event() is no longer needed and is removed.
Fixes: e55ed3423c1b ("perf arm-spe: Synthesize memory event")
Signed-off-by: Leo Yan <[email protected]>
Reviewed-by: Ali Saidi <[email protected]>
Tested-by: Ali Saidi <[email protected]>
Reviewed-by: German Gomez <[email protected]>
---
tools/perf/util/arm-spe.c | 22 ++++++++--------------
1 file changed, 8 insertions(+), 14 deletions(-)
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d2b64e3f588b..e032efc03274 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -387,26 +387,16 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
}
-#define SPE_MEM_TYPE (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
- ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
- ARM_SPE_REMOTE_ACCESS)
-
-static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
-{
- if (type & SPE_MEM_TYPE)
- return true;
-
- return false;
-}
-
static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
{
union perf_mem_data_src data_src = { 0 };
if (record->op == ARM_SPE_LD)
data_src.mem_op = PERF_MEM_OP_LOAD;
- else
+ else if (record->op == ARM_SPE_ST)
data_src.mem_op = PERF_MEM_OP_STORE;
+ else
+ return 0;
if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
data_src.mem_lvl = PERF_MEM_LVL_L3;
@@ -510,7 +500,11 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
return err;
}
- if (spe->sample_memory && arm_spe__is_memory_event(record->type)) {
+ /*
+ * When data_src is zero it means the record is not a memory operation,
+ * skip to synthesize memory sample for this case.
+ */
+ if (spe->sample_memory && data_src) {
err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
if (err)
return err;
--
2.32.0
Add a flag to the perf mem data struct to signal that a request caused a
cache-to-cache transfer of a line from a peer of the requestor and
wasn't sourced from a lower cache level. The line being moved from one
peer cache to another has latency and performance implications. On Arm64
Neoverse systems the data source can indicate a cache-to-cache transfer
but not if the line is dirty or clean, so instead of overloading HITM
define a new flag that indicates this type of transfer.
Signed-off-by: Ali Saidi <[email protected]>
Reviewed-by: Leo Yan <[email protected]>
Reviewed-by: Kajol Jain <[email protected]>
---
include/uapi/linux/perf_event.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d37629dbad72..7b88bfd097dc 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1310,7 +1310,7 @@ union perf_mem_data_src {
#define PERF_MEM_SNOOP_SHIFT 19
#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
-/* 1 free */
+#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */
#define PERF_MEM_SNOOPX_SHIFT 38
/* locked instruction */
--
2.32.0
Em Tue, May 17, 2022 at 02:03:21AM +0000, Ali Saidi escreveu:
> When synthesizing data from SPE, augment the type with source information
> for Arm Neoverse cores so we can detect situtions like cache line
> contention and transfers on Arm platforms.
>
> This changes enables future changes to c2c on a system with SPE where lines that
> are shared among multiple cores show up in perf c2c output.
>
> Changes is v9:
> * Change reporting of remote socket data which should make Leo's upcomping
> patch set for c2c make sense on multi-socket platforms
Hey,
Joe Mario, who is one of 'perf c2c' authors asked me about some
git tree he could clone from for both building the kernel and
tools/perf/ so that he could do tests, can you please provide that?
thanks!
- Arnaldo
> Changes in v8:
> * Report NA for both mem_lvl and mem_lvl_num for stores where we have no
> information
>
> Changes in v7:
> * Minor change requested by Leo Yan
>
> Changes in v6:
> * Drop changes to c2c command which will come from Leo Yan
>
> Changes in v5:
> * Add a new snooping type to disambiguate cache-to-cache transfers where
> we don't know if the data is clean or dirty.
> * Set snoop flags on all the data-source cases
> * Special case stores as we have no information on them
>
> Changes in v4:
> * Bring-in the kernel's arch/arm64/include/asm/cputype.h into tools/
> * Add neoverse-v1 to the neoverse cores list
>
> Ali Saidi (4):
> tools: arm64: Import cputype.h
> perf arm-spe: Use SPE data source for neoverse cores
> perf mem: Support mem_lvl_num in c2c command
> perf mem: Support HITM for when mem_lvl_num is any
>
> tools/arch/arm64/include/asm/cputype.h | 258 ++++++++++++++++++
> .../util/arm-spe-decoder/arm-spe-decoder.c | 1 +
> .../util/arm-spe-decoder/arm-spe-decoder.h | 12 +
> tools/perf/util/arm-spe.c | 110 +++++++-
> tools/perf/util/mem-events.c | 20 +-
> 5 files changed, 383 insertions(+), 18 deletions(-)
> create mode 100644 tools/arch/arm64/include/asm/cputype.h
>
> --
> 2.32.0
--
- Arnaldo
From: Leo Yan <[email protected]>
Since PERF_MEM_SNOOPX_PEER flag is a new snoop type, print this flag if
it is set.
Before:
memstress 3603 [020] 122.463754: 1 l1d-miss: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 l1d-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 llc-miss: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 llc-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 tlb-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 memory: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP N/A|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
After:
memstress 3603 [020] 122.463754: 1 l1d-miss: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 l1d-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 llc-miss: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 llc-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 tlb-access: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
memstress 3603 [020] 122.463754: 1 memory: 8688000842 |OP LOAD|LVL L3 or L3 hit|SNP Peer|TLB Walker hit|LCK No|BLK N/A aaaac17c3e88 [unknown] (/home/ubuntu/memstress)
Signed-off-by: Leo Yan <[email protected]>
Reviewed-by: Ali Saidi <[email protected]>
Tested-by: Ali Saidi <[email protected]>
Reviewed-by: Kajol Jain <[email protected]>
---
tools/perf/util/mem-events.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index efaf263464b9..db5225caaabe 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -410,6 +410,11 @@ static const char * const snoop_access[] = {
"HitM",
};
+static const char * const snoopx_access[] = {
+ "Fwd",
+ "Peer",
+};
+
int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
{
size_t i, l = 0;
@@ -430,13 +435,20 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
}
l += scnprintf(out + l, sz - l, snoop_access[i]);
}
- if (mem_info &&
- (mem_info->data_src.mem_snoopx & PERF_MEM_SNOOPX_FWD)) {
+
+ m = 0;
+ if (mem_info)
+ m = mem_info->data_src.mem_snoopx;
+
+ for (i = 0; m && i < ARRAY_SIZE(snoopx_access); i++, m >>= 1) {
+ if (!(m & 0x1))
+ continue;
+
if (l) {
strcat(out, " or ");
l += 4;
}
- l += scnprintf(out + l, sz - l, "Fwd");
+ l += scnprintf(out + l, sz - l, snoopx_access[i]);
}
if (*out == '\0')
--
2.32.0
Hi Arnaldo,
On Tue, May 17, 2022 at 06:20:03PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, May 17, 2022 at 02:03:21AM +0000, Ali Saidi escreveu:
> > When synthesizing data from SPE, augment the type with source information
> > for Arm Neoverse cores so we can detect situtions like cache line
> > contention and transfers on Arm platforms.
> >
> > This changes enables future changes to c2c on a system with SPE where lines that
> > are shared among multiple cores show up in perf c2c output.
> >
> > Changes is v9:
> > * Change reporting of remote socket data which should make Leo's upcomping
> > patch set for c2c make sense on multi-socket platforms
>
> Hey,
>
> Joe Mario, who is one of 'perf c2c' authors asked me about some
> git tree he could clone from for both building the kernel and
> tools/perf/ so that he could do tests, can you please provide that?
Sure, I will prepare a git tree for testing and share with Joe.
> thanks!
Thanks also for the reminder.
Leo
Hi Joe,
On Tue, May 17, 2022 at 06:20:03PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, May 17, 2022 at 02:03:21AM +0000, Ali Saidi escreveu:
> > When synthesizing data from SPE, augment the type with source information
> > for Arm Neoverse cores so we can detect situtions like cache line
> > contention and transfers on Arm platforms.
> >
> > This changes enables future changes to c2c on a system with SPE where lines that
> > are shared among multiple cores show up in perf c2c output.
> >
> > Changes is v9:
> > * Change reporting of remote socket data which should make Leo's upcomping
> > patch set for c2c make sense on multi-socket platforms
>
> Hey,
>
> Joe Mario, who is one of 'perf c2c' authors asked me about some
> git tree he could clone from for both building the kernel and
> tools/perf/ so that he could do tests, can you please provide that?
I have uploaded the latest patches for enabling 'perf c2c' on Arm SPE
to the repo:
https://git.linaro.org/people/leo.yan/linux-spe.git branch: perf_c2c_arm_spe_peer_v3
Below are quick notes for building the kernel with Arm SPE enabled:
$ git clone -b perf_c2c_arm_spe_peer_v3 https://git.linaro.org/people/leo.yan/linux-spe.git
Or
$ git clone -b perf_c2c_arm_spe_peer_v3 ssh://[email protected]/people/leo.yan/linux-spe.git
$ cd linux-spe
# Build kernel
$ make defconfig
$ ./scripts/config -e CONFIG_PID_IN_CONTEXTIDR
$ ./scripts/config -e CONFIG_ARM_SPE_PMU
$ make Image
# Build perf
$ cd tools/perf
$ make VF=1 DEBUG=1
When booting the kernel, please add the option "kpti=off" to the kernel command
line; you might need to update the grub menu for this.
Please feel free to let us know if anything is not clear to you.
Thank you,
Leo
Hi Joe,
On Thu, May 19, 2022 at 11:16:53AM -0400, Joe Mario wrote:
[...]
> Hi Leo:
> Thanks for getting this working on ARM. I do have a few comments.
>
> I built and ran this on a ARM Neoverse-N1 system with 2 numa nodes.
>
> Comment 1:
> When I run "perf c2c report", the "Node" field is marked "N/A". It's supposed to show the numa node where the data address for the cacheline resides. That's important both to see what node hot data resides on and if that data is getting lots of cross-numa accesses.
Good catch. Will fix it.
> Comment 2:
> I'm assuming you're identifying the contended cachelines using the "peer" load response, which indicates the load was resolved from a "peer" cpu's cacheline. Please confirm.
Yeah, "peer" is ambiguous. AFAIK, a "peer" load can come from:
- The local node, from a peer CPU's cache (can be the same cluster or a peer cluster);
- A remote node, from a CPU's cache line, or even from *remote DRAM*.
> If that's true, is it possible to identify if that "peer" response was on the local or remote numa node?
Good point. Yes, we can do this. So far, the remote accesses are
accounted in the metric "rmt_hit", which should be the same as the
remote peer loads; but so far we have no metric to account for local
peer loads. It would not be hard to add a metric "lcl_peer".
> I ask because being able to identify both local and remote HitM's on Intel X86_64 has been quite valuable. That's because remote HitM's are costly and because it helps the viewer see if they need to optimize their cpu affinity or what node their hot data resides on.
Thanks a lot for the info. This means at least I should refine the shared
cache line distribution pareto for remote peer access; I will do some
experiments for the enhancement.
> Last Comment:
> There's a row in the Pareto table that has incorrect column alignment.
> Look at row 80 below in the truncated snipit of output. It has an extra field inserted in it at the beginning.
> I also show what the corrected output should look like.
>
> Incorrect row 80:
> 71 =================================================
> 72 Shared Cache Line Distribution Pareto
> 73 =================================================
> 74 #
> 75 # ----- HITM ----- Snoop ------- Store Refs ------ ------- CL --------
> 76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
> 77 # ....... ....... ....... ....... ....... ....... ..... .... ...... ..................
> 78 #
> 79 -------------------------------------------------------------------------------
> 80 0 0 0 4648 0 0 11572 0x422140
> 81 -------------------------------------------------------------------------------
> 82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
> 83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
> 84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
> 85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
>
>
> Corrected row 80:
> 71 =================================================
> 72 Shared Cache Line Distribution Pareto
> 73 =================================================
> 74 #
> 75 # ----- HITM ----- Snoop ------- Store Refs ----- ------- CL --------
> 76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
> 77 # ....... ....... ....... ....... ....... ...... ..... .... ...... ..................
> 78 #
> 79 -------------------------------------------------------------------------------
> 80 0 0 4648 0 0 11572 0x422140
> 81 -------------------------------------------------------------------------------
> 82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
> 83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
> 84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
> 85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
Hmm... On my side, I used the command below to output the pareto view, but I
cannot see the column "CL"; the column "CL" is only shown in TUI
mode but not in "--stdio" mode. Could you share the steps
to reproduce this issue?
$ ./perf c2c report -i perf.data.v3 -N
=================================================
Shared Cache Line Distribution Pareto
=================================================
#
# ----- HITM ----- Snoop ------- Store Refs ------ --------- Data address --------- --------------- cycles --------------- Total cpu Shared
# Num RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Offset Node PA cnt Code address rmt hitm lcl hitm load peer records cnt Symbol Object Source:Line Node{cpus %peers %stores}
# ..... ....... ....... ....... ....... ....... ....... .................. .... ...... .................. ........ ........ ........ ........ ....... ........ ...................... ................. ...................... ....
#
-------------------------------------------------------------------------------
0 0 0 56183 0 0 26534 0x420180
-------------------------------------------------------------------------------
0.00% 0.00% 99.85% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400bd0 0 0 1587 4034 188785 2 [.] 0x0000000000000bd0 false_sharing.exe false_sharing.exe[bd0] 0{ 1 87.4% n/a} 1{ 1 12.6% n/a}
0.00% 0.00% 0.00% 0.00% 0.00% 54.56% 0x0 N/A 0 0x400bd4 0 0 0 0 14476 2 [.] 0x0000000000000bd4 false_sharing.exe false_sharing.exe[bd4] 0{ 1 n/a 0.2%} 1{ 1 n/a 99.8%}
0.00% 0.00% 0.00% 0.00% 0.00% 45.44% 0x0 N/A 0 0x400bf8 0 0 0 0 12058 2 [.] 0x0000000000000bf8 false_sharing.exe false_sharing.exe[bf8] 0{ 1 n/a 70.3%} 1{ 1 n/a 29.7%}
0.00% 0.00% 0.15% 0.00% 0.00% 0.00% 0x20 N/A 0 0x400c64 0 0 2462 2451 4835 2 [.] 0x0000000000000c64 false_sharing.exe false_sharing.exe[c64] 0{ 1 11.9% n/a} 1{ 1 88.1% n/a}
-------------------------------------------------------------------------------
1 0 0 2571 0 0 69861 0x420100
-------------------------------------------------------------------------------
0.00% 0.00% 0.00% 0.00% 0.00% 100.00% 0x8 N/A 0 0x400c08 0 0 0 0 69861 2 [.] 0x0000000000000c08 false_sharing.exe false_sharing.exe[c08] 0{ 1 n/a 62.1%} 1{ 1 n/a 37.9%}
0.00% 0.00% 100.00% 0.00% 0.00% 0.00% 0x20 N/A 0 0x400c74 0 0 834 641 6576 2 [.] 0x0000000000000c74 false_sharing.exe false_sharing.exe[c74] 0{ 1 93.2% n/a} 1{ 1 6.8% n/a}
I very much appreciate your testing and suggestions!
Leo
On 5/18/22 12:16 AM, Leo Yan wrote:
> Hi Joe,
>
> On Tue, May 17, 2022 at 06:20:03PM -0300, Arnaldo Carvalho de Melo wrote:
>> Em Tue, May 17, 2022 at 02:03:21AM +0000, Ali Saidi escreveu:
>>> When synthesizing data from SPE, augment the type with source information
>>> for Arm Neoverse cores so we can detect situtions like cache line
>>> contention and transfers on Arm platforms.
>>>
>>> This changes enables future changes to c2c on a system with SPE where lines that
>>> are shared among multiple cores show up in perf c2c output.
>>>
>>> Changes is v9:
>>> * Change reporting of remote socket data which should make Leo's upcomping
>>> patch set for c2c make sense on multi-socket platforms
>>
>> Hey,
>>
>> Joe Mario, who is one of 'perf c2c' authors asked me about some
>> git tree he could clone from for both building the kernel and
>> tools/perf/ so that he could do tests, can you please provide that?
>
> I have uploaded the latest patches for enabling 'perf c2c' on Arm SPE
> on the repo:
>
> https://git.linaro.org/people/leo.yan/linux-spe.git branch: perf_c2c_arm_spe_peer_v3
>
> Below are the quick notes for build the kernel with enabling Arm SPE:
>
> $ git clone -b perf_c2c_arm_spe_peer_v3 https://git.linaro.org/people/leo.yan/linux-spe.git
>
> Or
>
> $ git clone -b perf_c2c_arm_spe_peer_v3 ssh://[email protected]/people/leo.yan/linux-spe.git
>
> $ cd linux-spe
>
> # Build kernel
> $ make defconfig
> $ ./scripts/config -e CONFIG_PID_IN_CONTEXTIDR
> $ ./scripts/config -e CONFIG_ARM_SPE_PMU
> $ make Image
>
> # Build perf
> $ cd tools/perf
> $ make VF=1 DEBUG=1
>
> When boot the kernel, please add option "kpti=off" in kernel command
> line, you might need to update grub menu for this.
>
> Please feel free let us know if anything is not clear for you.
>
> Thank you,
> Leo
>
Hi Leo:
Thanks for getting this working on ARM. I do have a few comments.
I built and ran this on an ARM Neoverse-N1 system with 2 numa nodes.
Comment 1:
When I run "perf c2c report", the "Node" field is marked "N/A". It's supposed to show the numa node where the data address for the cacheline resides. That's important both to see what node hot data resides on and if that data is getting lots of cross-numa accesses.
Comment 2:
I'm assuming you're identifying the contended cachelines using the "peer" load response, which indicates the load was resolved from a "peer" cpu's cacheline. Please confirm.
If that's true, is it possible to identify if that "peer" response was on the local or remote numa node?
I ask because being able to identify both local and remote HitM's on Intel X86_64 has been quite valuable. That's because remote HitM's are costly and because it helps the viewer see if they need to optimize their cpu affinity or what node their hot data resides on.
Last Comment:
There's a row in the Pareto table that has incorrect column alignment.
Look at row 80 below in the truncated snippet of output. It has an extra field inserted in it at the beginning.
I also show what the corrected output should look like.
Incorrect row 80:
71 =================================================
72 Shared Cache Line Distribution Pareto
73 =================================================
74 #
75 # ----- HITM ----- Snoop ------- Store Refs ------ ------- CL --------
76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
77 # ....... ....... ....... ....... ....... ....... ..... .... ...... ..................
78 #
79 -------------------------------------------------------------------------------
80 0 0 0 4648 0 0 11572 0x422140
81 -------------------------------------------------------------------------------
82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
Corrected row 80:
71 =================================================
72 Shared Cache Line Distribution Pareto
73 =================================================
74 #
75 # ----- HITM ----- Snoop ------- Store Refs ----- ------- CL --------
76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
77 # ....... ....... ....... ....... ....... ...... ..... .... ...... ..................
78 #
79 -------------------------------------------------------------------------------
80 0 0 4648 0 0 11572 0x422140
81 -------------------------------------------------------------------------------
82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
Thanks again for doing this.
Joe
On 5/22/22 2:15 AM, Leo Yan wrote:
> Hi Joe,
>
> On Thu, May 19, 2022 at 11:16:53AM -0400, Joe Mario wrote:
>
> [SNIP]
>
>> Last Comment:
>> There's a row in the Pareto table that has incorrect column alignment.
>> Look at row 80 below in the truncated snipit of output. It has an extra field inserted in it at the beginning.
>> I also show what the corrected output should look like.
>>
>> Incorrect row 80:
>> 71 =================================================
>> 72 Shared Cache Line Distribution Pareto
>> 73 =================================================
>> 74 #
>> 75 # ----- HITM ----- Snoop ------- Store Refs ------ ------- CL --------
>> 76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
>> 77 # ....... ....... ....... ....... ....... ....... ..... .... ...... ..................
>> 78 #
>> 79 -------------------------------------------------------------------------------
>> 80 0 0 0 4648 0 0 11572 0x422140
>> 81 -------------------------------------------------------------------------------
>> 82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
>> 83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
>> 84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
>> 85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
>>
>>
>> Corrected row 80:
>> 71 =================================================
>> 72 Shared Cache Line Distribution Pareto
>> 73 =================================================
>> 74 #
>> 75 # ----- HITM ----- Snoop ------- Store Refs ----- ------- CL --------
>> 76 # RmtHitm LclHitm Peer L1 Hit L1 Miss N/A Off Node PA cnt Code address
>> 77 # ....... ....... ....... ....... ....... ...... ..... .... ...... ..................
>> 78 #
>> 79 -------------------------------------------------------------------------------
>> 80 0 0 4648 0 0 11572 0x422140
>> 81 -------------------------------------------------------------------------------
>> 82 0.00% 0.00% 0.00% 0.00% 0.00% 44.47% 0x0 N/A 0 0x400ce8
>> 83 0.00% 0.00% 10.26% 0.00% 0.00% 0.00% 0x0 N/A 0 0x400e48
>> 84 0.00% 0.00% 0.00% 0.00% 0.00% 55.53% 0x0 N/A 0 0x400e54
>> 85 0.00% 0.00% 89.74% 0.00% 0.00% 0.00% 0x8 N/A 0 0x401038
>
> Hmm‥. At my side, I used below command to output pareto view, but I
> cannot see the conlumn "CL", the conlumn "CL" is only shown for TUI
> mode but not for the mode "--stdio". Could you share the method for
> how to reproduce this issue?
Hi Leo:
I figured out why my output was different than yours.
I did not have the slang-devel rpm installed on the host system.
In my original perf build, I missed this output in the build log:
> slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev
Once I installed slang-devel, rebuilt perf, and then reran my test, the pareto output looked fine.
When the TUI support is disabled, it shouldn't corrupt the resulting stdio output. I don't believe this has anything to do with your commits.
Last, it looks like you should update the help text for the display flag options to reflect your new peer option.
Currently it says:
-d, --display <Switch HITM output type>
lcl,rmt
But since you added the "peer" display, shouldn't the output for that help text state:
-d, --display <Switch HITM output type>
lcl,rmt,peer
Joe
Hi Joe,
On Mon, May 23, 2022 at 01:24:32PM -0400, Joe Mario wrote:
[...]
> Hi Leo:
> I figured out why my output was different than yours.
>
> I did not have the slang-devel rpm installed on the host system.
>
> In my original perf build, I missed the this output in the build log:
> > slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev
>
> Once I installed slang-devel, rebuilt perf, and then reran my test, the pareto output looked fine.
>
> When the TUI support is disabled, it shouldn't corrupt the resulting stdio output. I don't believe this has anything to do with your commits.
Thanks for taking the time to hunt down this issue. I checked the code and
sent out a patch to fix the stdio interface when the slang lib is not
installed. Please see the patch:
https://lore.kernel.org/lkml/[email protected]/T/#u
> Last, it looks like you should update the help text for the display flag options to reflect your new peer option.
> Currently it says:
> -d, --display <Switch HITM output type>
> lcl,rmt
>
> But since you added the "peer" display, shouldn't the output for that help text state:
> -d, --display <Switch HITM output type>
> lcl,rmt,peer
Yeah, will fix.
I very much appreciate your detailed testing and suggestions.
Leo
Em Tue, May 17, 2022 at 02:03:25AM +0000, Ali Saidi escreveu:
> From: Leo Yan <[email protected]>
>
> Except memory load and store operations, Arm SPE records also can
> support other operation types, bug when set the data source field the
> current code assumes a record is a either load operation or store
> operation, this leads to wrongly synthesize memory samples.
>
> This patch strictly checks the record operation type, it only sets data
> source only for the operation types ARM_SPE_LD and ARM_SPE_ST,
> otherwise, returns zero for data source. Therefore, we can synthesize
> memory samples only when data source is a non-zero value, the function
> arm_spe__is_memory_event() is useless and removed.
Thanks, applied.
- Arnaldo
> Fixes: e55ed3423c1b ("perf arm-spe: Synthesize memory event")
> Signed-off-by: Leo Yan <[email protected]>
> Reviewed-by: Ali Saidi <[email protected]>
> Tested-by: Ali Saidi <[email protected]>
> Reviewed-by: German Gomez <[email protected]>
> ---
> tools/perf/util/arm-spe.c | 22 ++++++++--------------
> 1 file changed, 8 insertions(+), 14 deletions(-)
>
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index d2b64e3f588b..e032efc03274 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -387,26 +387,16 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
> return arm_spe_deliver_synth_event(spe, speq, event, &sample);
> }
>
> -#define SPE_MEM_TYPE (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
> - ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
> - ARM_SPE_REMOTE_ACCESS)
> -
> -static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
> -{
> - if (type & SPE_MEM_TYPE)
> - return true;
> -
> - return false;
> -}
> -
> static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
> {
> union perf_mem_data_src data_src = { 0 };
>
> if (record->op == ARM_SPE_LD)
> data_src.mem_op = PERF_MEM_OP_LOAD;
> - else
> + else if (record->op == ARM_SPE_ST)
> data_src.mem_op = PERF_MEM_OP_STORE;
> + else
> + return 0;
>
> if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> data_src.mem_lvl = PERF_MEM_LVL_L3;
> @@ -510,7 +500,11 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
> return err;
> }
>
> - if (spe->sample_memory && arm_spe__is_memory_event(record->type)) {
> + /*
> + * When data_src is zero it means the record is not a memory operation,
> + * skip to synthesize memory sample for this case.
> + */
> + if (spe->sample_memory && data_src) {
> err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
> if (err)
> return err;
> --
> 2.32.0
--
- Arnaldo