2017-06-07 23:23:00

by Andi Kleen

[permalink] [raw]
Subject: Fix Skylake PEBS data source for perf

Fix data source reporting for Skylake and Skylake Server.
The encodings have changed to express support for L4 and persistent
memory.

The first patch is an (independent) cleanup.

The second is for the kernel and the third for perf/tools.
The kernel part and perf tools will compile independently.

v1:
Initial post
v2:
Merged some patches.
Change encoding to use special bit for each combination instead
of modifiers.



2017-06-07 23:23:03

by Andi Kleen

[permalink] [raw]
Subject: [PATCH v2 3/4] perf, tools: Add support for printing new mem_info encodings

From: Andi Kleen <[email protected]>

Add decoding for the new lvlx and snoopx mem_info fields
added earlier to the kernel so that "perf mem report" and
other tools can print them properly.

v2: Merge with persistent memory patch.
Switch to new bit encoding for each combination.
Signed-off-by: Andi Kleen <[email protected]>
---
tools/include/uapi/linux/perf_event.h | 22 +++++++++++++++++--
tools/perf/util/mem-events.c | 40 ++++++++++++++++++++++++++++++++---
2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..95daade294d7 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -931,14 +931,18 @@ union perf_mem_data_src {
mem_snoop:5, /* snoop mode */
mem_lock:2, /* lock instr */
mem_dtlb:7, /* tlb access */
- mem_rsvd:31;
+ mem_lvlx:8, /* memory hierarchy level, ext */
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_rsvd:21;
};
};
#elif defined(__BIG_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
- __u64 mem_rsvd:31,
+ __u64 mem_rsvd:21,
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_lvlx:8, /* memory hierarchy level, ext */
mem_dtlb:7, /* tlb access */
mem_lock:2, /* lock instr */
mem_snoop:5, /* snoop mode */
@@ -975,6 +979,16 @@ union perf_mem_data_src {
#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
#define PERF_MEM_LVL_SHIFT 5

+#define PERF_MEM_LVLX_L4 0x01 /* L4 */
+#define PERF_MEM_LVLX_REM_L4 0x02 /* Remote L4 */
+#define PERF_MEM_LVLX_REM_RAM 0x04 /* Remote Ram, unknown hops */
+#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
+#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
+#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */
+/* 2 free */
+
+#define PERF_MEM_LVLX_SHIFT 33
+
/* snoop mode */
#define PERF_MEM_SNOOP_NA 0x01 /* not available */
#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
@@ -983,6 +997,10 @@ union perf_mem_data_src {
#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
#define PERF_MEM_SNOOP_SHIFT 19

+#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT 41
+
/* locked instruction */
#define PERF_MEM_LOCK_NA 0x01 /* not available */
#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 06f5a3a4295c..28968e54cab4 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -166,11 +166,21 @@ static const char * const mem_lvl[] = {
"Uncached",
};

+static const char * const mem_lvlx[] = {
+ "L4",
+ "Remote L4",
+ "Remote RAM",
+ "PMEM",
+ "Remote PMEM",
+ "Remote N/A",
+};
+
int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
{
size_t i, l = 0;
u64 m = PERF_MEM_LVL_NA;
u64 hit, miss;
+ int printed;

if (mem_info)
m = mem_info->data_src.mem_lvl;
@@ -184,17 +194,33 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
/* already taken care of */
m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);

+ printed = 0;
for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
if (!(m & 0x1))
continue;
- if (l) {
+ if (printed++) {
strcat(out, " or ");
l += 4;
}
l += scnprintf(out + l, sz - l, mem_lvl[i]);
}
- if (*out == '\0')
- l += scnprintf(out, sz - l, "N/A");
+
+ m = 0;
+ if (mem_info)
+ m = mem_info->data_src.mem_lvlx;
+
+ for (i = 0; m && i < ARRAY_SIZE(mem_lvlx); i++, m >>= 1) {
+ if (!(m & 0x1))
+ continue;
+ if (printed++) {
+ strcat(out, " or ");
+ l += 4;
+ }
+ l += scnprintf(out + l, sz - l, mem_lvlx[i]);
+ }
+
+ if (l == 0)
+ l += scnprintf(out + l, sz - l, "N/A");
if (hit)
l += scnprintf(out + l, sz - l, " hit");
if (miss)
@@ -231,6 +257,14 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
}
l += scnprintf(out + l, sz - l, snoop_access[i]);
}
+ if (mem_info &&
+ (mem_info->data_src.mem_snoopx & PERF_MEM_SNOOPX_FWD)) {
+ if (l) {
+ strcat(out, " or ");
+ l += 4;
+ }
+ l += scnprintf(out + l, sz - l, "Fwd");
+ }

if (*out == '\0')
l += scnprintf(out, sz - l, "N/A");
--
2.9.4

2017-06-07 23:23:18

by Andi Kleen

[permalink] [raw]
Subject: [PATCH v2 1/4] perf/x86: Move Nehalem PEBS code to flag

From: Andi Kleen <[email protected]>

Minor cleanup: use an explicit x86_pmu flag to handle the
missing Lock / TLB information on Nehalem, instead of always
checking the model number for each PEBS sample.

Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/events/intel/core.c | 1 +
arch/x86/events/intel/ds.c | 5 +----
arch/x86/events/perf_event.h | 3 ++-
3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index da9047eec7ba..dec9b4bf0752 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3775,6 +3775,7 @@ __init int intel_pmu_init(void)

intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;

pr_cont("Nehalem events, ");
break;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c6d23ffe422d..7732999f5e2a 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -149,8 +149,6 @@ static u64 load_latency_data(u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
- int model = boot_cpu_data.x86_model;
- int fam = boot_cpu_data.x86;

dse.val = status;

@@ -162,8 +160,7 @@ static u64 load_latency_data(u64 status)
/*
* Nehalem models do not support TLB, Lock infos
*/
- if (fam == 0x6 && (model == 26 || model == 30
- || model == 31 || model == 46)) {
+ if (x86_pmu.pebs_no_tlb) {
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53728eea1bed..a6d9d6570957 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -591,7 +591,8 @@ struct x86_pmu {
pebs :1,
pebs_active :1,
pebs_broken :1,
- pebs_prec_dist :1;
+ pebs_prec_dist :1,
+ pebs_no_tlb :1;
int pebs_record_size;
int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
--
2.9.4

2017-06-07 23:22:59

by Andi Kleen

[permalink] [raw]
Subject: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

From: Andi Kleen <[email protected]>

Skylake changed the encoding of the PEBS data source field.
Some combinations are not available anymore, but some new cases
e.g. for L4 cache hit are added.

Fix up the conversion table for Skylake, similar to what was done
for Nehalem.

On Skylake server the encoding for L4 actually means persistent
memory. Handle this case too.

To properly describe it in the abstracted perf format I had to add
some new bits. Unfortunately the existing fields were full, so
this required adding eXtension fields for mem_lvl and snoop
into existing reserved space.

v2: Merge with persistent memory patch.
Add explicit bit for each case instead of using generic modifier.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/events/intel/core.c | 2 ++
arch/x86/events/intel/ds.c | 13 +++++++++++++
arch/x86/events/perf_event.h | 2 ++
include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++--
4 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dec9b4bf0752..08e53f36d697 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4052,6 +4052,8 @@ __init int intel_pmu_init(void)
skl_format_attr);
WARN_ON(!x86_pmu.format_attrs);
x86_pmu.cpu_events = hsw_events_attrs;
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
pr_cont("Skylake events, ");
break;

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7732999f5e2a..cd28c4babd36 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -79,6 +79,19 @@ void __init intel_pmu_pebs_data_source_nhm(void)
pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
}

+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4;
+
+ pmem_or_l4 = pmem ? P(LVLX, PMEM) : P(LVLX, L4);
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pmem_or_l4 = pmem ? P(LVLX, REM_PMEM) : P(LVLX, REM_L4);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | P(LVLX, REM_RAM) | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | P(LVLX, REM_NA) | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | P(LVLX, REM_NA) | P(SNOOP, HITM);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a6d9d6570957..d7571f248652 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -946,6 +946,8 @@ void intel_pmu_lbr_init_knl(void);

void intel_pmu_pebs_data_source_nhm(void);

+void intel_pmu_pebs_data_source_skl(bool pmem);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);

void intel_pt_interrupt(void);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..95daade294d7 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -931,14 +931,18 @@ union perf_mem_data_src {
mem_snoop:5, /* snoop mode */
mem_lock:2, /* lock instr */
mem_dtlb:7, /* tlb access */
- mem_rsvd:31;
+ mem_lvlx:8, /* memory hierarchy level, ext */
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_rsvd:21;
};
};
#elif defined(__BIG_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
- __u64 mem_rsvd:31,
+ __u64 mem_rsvd:21,
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_lvlx:8, /* memory hierarchy level, ext */
mem_dtlb:7, /* tlb access */
mem_lock:2, /* lock instr */
mem_snoop:5, /* snoop mode */
@@ -975,6 +979,16 @@ union perf_mem_data_src {
#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
#define PERF_MEM_LVL_SHIFT 5

+#define PERF_MEM_LVLX_L4 0x01 /* L4 */
+#define PERF_MEM_LVLX_REM_L4 0x02 /* Remote L4 */
+#define PERF_MEM_LVLX_REM_RAM 0x04 /* Remote Ram, unknown hops */
+#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
+#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
+#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */
+/* 2 free */
+
+#define PERF_MEM_LVLX_SHIFT 33
+
/* snoop mode */
#define PERF_MEM_SNOOP_NA 0x01 /* not available */
#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
@@ -983,6 +997,10 @@ union perf_mem_data_src {
#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
#define PERF_MEM_SNOOP_SHIFT 19

+#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT 41
+
/* locked instruction */
#define PERF_MEM_LOCK_NA 0x01 /* not available */
#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
--
2.9.4

2017-06-07 23:24:06

by Andi Kleen

[permalink] [raw]
Subject: [PATCH v2 4/4] perf, tools: Add test cases for new data source encoding

From: Andi Kleen <[email protected]>

Add some simple tests to perf test to test data source printing.

v2: Make the tests actually check for the correct name of Forward
Signed-off-by: Andi Kleen <[email protected]>
---
tools/perf/tests/Build | 1 +
tools/perf/tests/builtin-test.c | 4 ++++
tools/perf/tests/mem.c | 50 +++++++++++++++++++++++++++++++++++++++++
tools/perf/tests/tests.h | 1 +
4 files changed, 56 insertions(+)
create mode 100644 tools/perf/tests/mem.c

diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index af58ebc243ef..540409613b73 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -34,6 +34,7 @@ perf-y += thread-map.o
perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o
perf-y += bpf.o
perf-y += topology.o
+perf-y += mem.o
perf-y += cpumap.o
perf-y += stat.o
perf-y += event_update.o
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 9e08d297f1a9..57d355445c4f 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -43,6 +43,10 @@ static struct test generic_tests[] = {
.func = test__basic_mmap,
},
{
+ .desc = "Test data source output",
+ .func = test__mem,
+ },
+ {
.desc = "Parse event definition strings",
.func = test__parse_events,
},
diff --git a/tools/perf/tests/mem.c b/tools/perf/tests/mem.c
new file mode 100644
index 000000000000..b3fa20bd46df
--- /dev/null
+++ b/tools/perf/tests/mem.c
@@ -0,0 +1,50 @@
+#include "util/mem-events.h"
+#include "util/symbol.h"
+#include "linux/perf_event.h"
+#include "util/debug.h"
+#include "tests.h"
+#include <string.h>
+
+static int check(union perf_mem_data_src data_src,
+ const char *string)
+{
+ char out[100];
+ char failure[100];
+ struct mem_info mi = { .data_src = data_src };
+
+ int n;
+
+ n = perf_mem__snp_scnprintf(out, sizeof out, &mi);
+ n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, &mi);
+ snprintf(failure, sizeof failure, "unexpected %s", out);
+ TEST_ASSERT_VAL(failure, !strcmp(string, out));
+ return 0;
+}
+
+int test__mem(int subtest __maybe_unused)
+{
+ int ret = 0;
+
+ ret |= check(((union perf_mem_data_src) {
+ .mem_lvl = PERF_MEM_LVL_HIT,
+ .mem_lvlx = PERF_MEM_LVLX_L4 }), "N/AL4 hit");
+
+ ret |= check(((union perf_mem_data_src) {
+ .mem_lvl = PERF_MEM_LVL_HIT,
+ .mem_lvlx = PERF_MEM_LVLX_REM_L4 }), "N/ARemote L4 hit");
+
+ ret |= check(((union perf_mem_data_src) {
+ .mem_lvl = PERF_MEM_LVL_MISS,
+ .mem_lvlx = PERF_MEM_LVLX_PMEM }), "N/APMEM miss");
+
+ ret |= check(((union perf_mem_data_src) {
+ .mem_lvl = PERF_MEM_LVL_MISS,
+ .mem_lvlx = PERF_MEM_LVLX_REM_PMEM }), "N/ARemote PMEM miss");
+
+ ret |= check(((union perf_mem_data_src) {
+ .mem_snoopx = PERF_MEM_SNOOPX_FWD,
+ .mem_lvl = PERF_MEM_LVL_MISS,
+ .mem_lvlx = PERF_MEM_LVLX_REM_RAM }), "FwdRemote RAM miss");
+
+ return ret;
+}
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 631859629403..3b3017ee91d6 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -56,6 +56,7 @@ int test__python_use(int subtest);
int test__bp_signal(int subtest);
int test__bp_signal_overflow(int subtest);
int test__task_exit(int subtest);
+int test__mem(int subtest);
int test__sw_clock_freq(int subtest);
int test__code_reading(int subtest);
int test__sample_parsing(int subtest);
--
2.9.4

2017-06-08 08:15:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

On Wed, Jun 07, 2017 at 04:22:24PM -0700, Andi Kleen wrote:

> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index b1c0b187acfe..95daade294d7 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -931,14 +931,18 @@ union perf_mem_data_src {
> mem_snoop:5, /* snoop mode */
> mem_lock:2, /* lock instr */
> mem_dtlb:7, /* tlb access */
> - mem_rsvd:31;
> + mem_lvlx:8, /* memory hierarchy level, ext */
> + mem_snoopx:2, /* snoop mode, ext */
> + mem_rsvd:21;
> };
> };
> #elif defined(__BIG_ENDIAN_BITFIELD)
> union perf_mem_data_src {
> __u64 val;
> struct {
> - __u64 mem_rsvd:31,
> + __u64 mem_rsvd:21,
> + mem_snoopx:2, /* snoop mode, ext */
> + mem_lvlx:8, /* memory hierarchy level, ext */
> mem_dtlb:7, /* tlb access */
> mem_lock:2, /* lock instr */
> mem_snoop:5, /* snoop mode */

So one thing we could do is add a mem_hops field and always set that,
even for the old stuff. The old stuff will not know about that field and
ignore the bits, but new stuff will then not need as many LVL bits.

Of course, we then get into the problem of how many bits of hops we
need.. Power guys ?

> @@ -975,6 +979,16 @@ union perf_mem_data_src {
> #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
> #define PERF_MEM_LVL_SHIFT 5
>
> +#define PERF_MEM_LVLX_L4 0x01 /* L4 */
> +#define PERF_MEM_LVLX_REM_L4 0x02 /* Remote L4 */
> +#define PERF_MEM_LVLX_REM_RAM 0x04 /* Remote Ram, unknown hops */
> +#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
> +#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
> +#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */

Still wondering what the point of REM_NA is.. can you explain?

> +/* 2 free */
> +
> +#define PERF_MEM_LVLX_SHIFT 33
> +
> /* snoop mode */
> #define PERF_MEM_SNOOP_NA 0x01 /* not available */
> #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
> @@ -983,6 +997,10 @@ union perf_mem_data_src {
> #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
> #define PERF_MEM_SNOOP_SHIFT 19
>
> +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
> +/* 1 free */
> +#define PERF_MEM_SNOOPX_SHIFT 41
> +
> /* locked instruction */
> #define PERF_MEM_LOCK_NA 0x01 /* not available */
> #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
> --
> 2.9.4
>

2017-06-08 19:41:08

by Stephane Eranian

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

Hi,

On Thu, Jun 8, 2017 at 1:15 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Jun 07, 2017 at 04:22:24PM -0700, Andi Kleen wrote:
>
> > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > index b1c0b187acfe..95daade294d7 100644
> > --- a/include/uapi/linux/perf_event.h
> > +++ b/include/uapi/linux/perf_event.h
> > @@ -931,14 +931,18 @@ union perf_mem_data_src {
> > mem_snoop:5, /* snoop mode */
> > mem_lock:2, /* lock instr */
> > mem_dtlb:7, /* tlb access */
> > - mem_rsvd:31;
> > + mem_lvlx:8, /* memory hierarchy level, ext */
> > + mem_snoopx:2, /* snoop mode, ext */
> > + mem_rsvd:21;
> > };
> > };
> > #elif defined(__BIG_ENDIAN_BITFIELD)
> > union perf_mem_data_src {
> > __u64 val;
> > struct {
> > - __u64 mem_rsvd:31,
> > + __u64 mem_rsvd:21,
> > + mem_snoopx:2, /* snoop mode, ext */
> > + mem_lvlx:8, /* memory hierarchy level, ext */
> > mem_dtlb:7, /* tlb access */
> > mem_lock:2, /* lock instr */
> > mem_snoop:5, /* snoop mode */
>
> So one thing we could do is add a mem_hops field and always set that,
> even for the old stuff. The old stuff will not know about that field and
> ignore the bits, but new stuff will then not need as many LVL bits.
>
That would be better than lvlx I think. I am guessing you're suggesting
an integer count here and not a bitmask. Right? Then I wonder why it
would need 8 bits or 255 possible levels!

> Of course, we then get into the problem of how many bits of hops we
> need.. Power guys ?
>
> > @@ -975,6 +979,16 @@ union perf_mem_data_src {
> > #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
> > #define PERF_MEM_LVL_SHIFT 5
> >
> > +#define PERF_MEM_LVLX_L4 0x01 /* L4 */
> > +#define PERF_MEM_LVLX_REM_L4 0x02 /* Remote L4 */
> > +#define PERF_MEM_LVLX_REM_RAM 0x04 /* Remote Ram, unknown hops */
> > +#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
> > +#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
> > +#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */
>
> Still wondering what the point of REM_NA is.. can you explain?
>
> > +/* 2 free */
> > +
> > +#define PERF_MEM_LVLX_SHIFT 33
> > +
> > /* snoop mode */
> > #define PERF_MEM_SNOOP_NA 0x01 /* not available */
> > #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
> > @@ -983,6 +997,10 @@ union perf_mem_data_src {
> > #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
> > #define PERF_MEM_SNOOP_SHIFT 19
> >
> > +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
> > +/* 1 free */
> > +#define PERF_MEM_SNOOPX_SHIFT 41
> > +
> > /* locked instruction */
> > #define PERF_MEM_LOCK_NA 0x01 /* not available */
> > #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
> > --
> > 2.9.4
> >

2017-06-08 20:03:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

On Thu, Jun 08, 2017 at 12:40:59PM -0700, Stephane Eranian wrote:
> Hi,
>
> On Thu, Jun 8, 2017 at 1:15 AM, Peter Zijlstra <[email protected]> wrote:
> >
> > On Wed, Jun 07, 2017 at 04:22:24PM -0700, Andi Kleen wrote:
> >
> > > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > > index b1c0b187acfe..95daade294d7 100644
> > > --- a/include/uapi/linux/perf_event.h
> > > +++ b/include/uapi/linux/perf_event.h
> > > @@ -931,14 +931,18 @@ union perf_mem_data_src {
> > > mem_snoop:5, /* snoop mode */
> > > mem_lock:2, /* lock instr */
> > > mem_dtlb:7, /* tlb access */
> > > - mem_rsvd:31;
> > > + mem_lvlx:8, /* memory hierarchy level, ext */
> > > + mem_snoopx:2, /* snoop mode, ext */
> > > + mem_rsvd:21;
> > > };
> > > };
> > > #elif defined(__BIG_ENDIAN_BITFIELD)
> > > union perf_mem_data_src {
> > > __u64 val;
> > > struct {
> > > - __u64 mem_rsvd:31,
> > > + __u64 mem_rsvd:21,
> > > + mem_snoopx:2, /* snoop mode, ext */
> > > + mem_lvlx:8, /* memory hierarchy level, ext */
> > > mem_dtlb:7, /* tlb access */
> > > mem_lock:2, /* lock instr */
> > > mem_snoop:5, /* snoop mode */
> >
> > So one thing we could do is add a mem_hops field and always set that,
> > even for the old stuff. The old stuff will not know about that field and
> > ignore the bits, but new stuff will then not need as many LVL bits.
> >
> That would be better than lvlx I think. I am guessing you're suggesting
> an integer count here and not a bitmask. Right?

Yah, 0 hops = local, etc..

> Then I wonder why it
> would need 8 bits or 255 possible levels!

I think we still need lvlx, simply because the current lvl doesn't have
room to encode L4.

But having a mem_hops field avoids having to have local/remote/remote2
variants of everything.

That said, I'm afraid SGI can actually fill a mem_hops:8 or something like
that ;-)

2017-06-08 20:10:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

On Thu, Jun 08, 2017 at 12:40:59PM -0700, Stephane Eranian wrote:
> Hi,
>
> On Thu, Jun 8, 2017 at 1:15 AM, Peter Zijlstra <[email protected]> wrote:
> >
> > On Wed, Jun 07, 2017 at 04:22:24PM -0700, Andi Kleen wrote:
> >
> > > diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> > > index b1c0b187acfe..95daade294d7 100644
> > > --- a/include/uapi/linux/perf_event.h
> > > +++ b/include/uapi/linux/perf_event.h
> > > @@ -931,14 +931,18 @@ union perf_mem_data_src {
> > > mem_snoop:5, /* snoop mode */
> > > mem_lock:2, /* lock instr */
> > > mem_dtlb:7, /* tlb access */
> > > - mem_rsvd:31;
> > > + mem_lvlx:8, /* memory hierarchy level, ext */
> > > + mem_snoopx:2, /* snoop mode, ext */
> > > + mem_rsvd:21;
> > > };
> > > };
> > > #elif defined(__BIG_ENDIAN_BITFIELD)
> > > union perf_mem_data_src {
> > > __u64 val;
> > > struct {
> > > - __u64 mem_rsvd:31,
> > > + __u64 mem_rsvd:21,
> > > + mem_snoopx:2, /* snoop mode, ext */
> > > + mem_lvlx:8, /* memory hierarchy level, ext */
> > > mem_dtlb:7, /* tlb access */
> > > mem_lock:2, /* lock instr */
> > > mem_snoop:5, /* snoop mode */
> >
> > So one thing we could do is add a mem_hops field and always set that,
> > even for the old stuff. The old stuff will not know about that field and
> > ignore the bits, but new stuff will then not need as many LVL bits.

Note that Skylake cannot fill it in, it doesn't report the hops.
And for the old parts the existing bits work. So I don't think
there's a motivation to add a new hops field, unless some other
architecture needs it.

> >
> That would be better than lvlx I think. I am guessing you're suggesting
> an integer count here and not a bitmask. Right? Then I wonder why it
> would need 8 bits or 255 possible levels!

Sure counts are better than bits. At least it's a far more efficient
encoding. Ok so add a new level_num field, and use one count for
PMEM and L4, and also fill in for the others.

> > > +#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
> > > +#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
> > > +#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */
> >
> > Still wondering what the point of REM_NA is.. can you explain?

It's a remote cache, but the hardware doesn't report which one,
and also doesn't report the hops.

Ok PERF_MEM_LVL_REM_CCE would be a better name I guess, fitting
with the existing ones.

-Andi

2017-06-08 20:22:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

On Thu, Jun 08, 2017 at 01:10:07PM -0700, Andi Kleen wrote:
> Note that Skylake cannot fill it in, it doesn't report the hops.
> And for the old parts the existing bits work. So I don't think
> there's a motivation to add a new hops field, unless some other
> architecture needs it.

PowerPC is currently using the REM2 stuff, I added the people who wrote
that code to Cc in the hope they would help by explaining their platform
capabilities.

2017-06-09 08:04:55

by Madhavan Srinivasan

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake



On Thursday 08 June 2017 01:45 PM, Peter Zijlstra wrote:
> On Wed, Jun 07, 2017 at 04:22:24PM -0700, Andi Kleen wrote:
>
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index b1c0b187acfe..95daade294d7 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -931,14 +931,18 @@ union perf_mem_data_src {
>> mem_snoop:5, /* snoop mode */
>> mem_lock:2, /* lock instr */
>> mem_dtlb:7, /* tlb access */
>> - mem_rsvd:31;
>> + mem_lvlx:8, /* memory hierarchy level, ext */
>> + mem_snoopx:2, /* snoop mode, ext */
>> + mem_rsvd:21;
>> };
>> };
>> #elif defined(__BIG_ENDIAN_BITFIELD)
>> union perf_mem_data_src {
>> __u64 val;
>> struct {
>> - __u64 mem_rsvd:31,
>> + __u64 mem_rsvd:21,
>> + mem_snoopx:2, /* snoop mode, ext */
>> + mem_lvlx:8, /* memory hierarchy level, ext */
>> mem_dtlb:7, /* tlb access */
>> mem_lock:2, /* lock instr */
>> mem_snoop:5, /* snoop mode */
> So one thing we could do is add a mem_hops field and always set that,
> even for the old stuff. The old stuff will not know about that field and
> ignore the bits, but new stuff will then not need as many LVL bits.
>
> Of course, we then get into the problem of how many bits of hops we
> need.. Power guys ?

Currently we support 3 hops (local, remote and distant) and
in future we may have another for capi. So 4 levels of hops
might do. 8 would be nice future proof.

Maddy

>
>> @@ -975,6 +979,16 @@ union perf_mem_data_src {
>> #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */
>> #define PERF_MEM_LVL_SHIFT 5
>>
>> +#define PERF_MEM_LVLX_L4 0x01 /* L4 */
>> +#define PERF_MEM_LVLX_REM_L4 0x02 /* Remote L4 */
>> +#define PERF_MEM_LVLX_REM_RAM 0x04 /* Remote Ram, unknown hops */
>> +#define PERF_MEM_LVLX_PMEM 0x08 /* Persistent Memory */
>> +#define PERF_MEM_LVLX_REM_PMEM 0x10 /* Remote Persistent Memory */
>> +#define PERF_MEM_LVLX_REM_NA 0x20 /* Remote N/A level */
> Still wondering what the point of REM_NA is.. can you explain?
>
>> +/* 2 free */
>> +
>> +#define PERF_MEM_LVLX_SHIFT 33
>> +
>> /* snoop mode */
>> #define PERF_MEM_SNOOP_NA 0x01 /* not available */
>> #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */
>> @@ -983,6 +997,10 @@ union perf_mem_data_src {
>> #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */
>> #define PERF_MEM_SNOOP_SHIFT 19
>>
>> +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
>> +/* 1 free */
>> +#define PERF_MEM_SNOOPX_SHIFT 41
>> +
>> /* locked instruction */
>> #define PERF_MEM_LOCK_NA 0x01 /* not available */
>> #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */
>> --
>> 2.9.4
>>

2017-06-09 16:47:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] perf/x86: Fix data source decoding for Skylake

> > > So one thing we could do is add a mem_hops field and always set that,
> > > even for the old stuff. The old stuff will not know about that field and
> > > ignore the bits, but new stuff will then not need as many LVL bits.
> > >
> > That would be better than lvlx I think. I am guessing you're suggesting
> > an integer count here and not a bitmask. Right?
>
> Yah, 0 hops = local, etc..

This doesn't work on Skylake because it doesn't report the number of hops,
just remote or not.

Would need a wildcard entry, but that can as well be a separate bit.

That's what I did in the latest version of my patch.

-Andi