From: Namhyung Kim <[email protected]>
Kernel IBS driver wasn't using new PERF_MEM_* APIs due to few of its
limitations. Mainly:
- mem_lvl_num doesn't allow setting multiple sources whereas old API
allows it. Setting multiple data sources is useful because IBS on
pre-zen4 uarch doesn't provide fine granular DataSrc details (there
is only one such DataSrc(2h) though).
- perf mem sorting logic (sort__lvl_cmp()) ignores mem_lvl_num. perf
c2c (c2c_decode_stats()) does not use mem_lvl_num at all. perf mem
prints mem_lvl and mem_lvl_num both if both are set, which is ugly.
1st one can be handled by using ANY_CACHE with HOPS_0. 2nd one is
purely perf tool specific issue and should be fixed separately.
Signed-off-by: Namhyung Kim <[email protected]>
Signed-off-by: Ravi Bangoria <[email protected]>
---
v1: https://lore.kernel.org/r/[email protected]
v1->v2:
- In addition to setting new API fields, convert all individual field
assignments to compile time wrapper macros built using PERF_MEM_S().
Also convert DataSrc conditional code to array lookups.
arch/x86/events/amd/ibs.c | 155 +++++++++++++++++---------------------
1 file changed, 68 insertions(+), 87 deletions(-)
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 64582954b5f6..b46e0b725fe5 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -703,9 +703,41 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
return op_data2->data_src_lo;
}
-static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
- union ibs_op_data3 *op_data3,
- struct perf_sample_data *data)
+#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
+#define LN(x) PERF_MEM_S(LVLNUM, x)
+#define REM PERF_MEM_S(REMOTE, REMOTE)
+#define HOPS(x) PERF_MEM_S(HOPS, x)
+
+static u64 g_data_src[8] = {
+ [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
+ [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
+};
+
+#define RMT_NODE_BITS ((1 << IBS_DATA_SRC_DRAM) | \
+ (1 << IBS_DATA_SRC_IO))
+#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static u64 g_zen4_data_src[32] = {
+ [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
+ [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
+ [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
+ [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
+ [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
+};
+
+#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
+ (1 << IBS_DATA_SRC_EXT_PMEM) | \
+ (1 << IBS_DATA_SRC_EXT_IO) |\
+ (1 << IBS_DATA_SRC_EXT_EXT_MEM))
+#define ZEN4_RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
{
union perf_mem_data_src *data_src = &data->data_src;
u8 ibs_data_src = perf_ibs_data_src(op_data2);
@@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
* DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
* memory accesses. So, check DcUcMemAcc bit early.
*/
- if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
+ return L(UNC);
/* L1 Hit */
- if (op_data3->dc_miss == 0) {
- data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss == 0)
+ return L(L1) | LN(L1);
/* L2 Hit */
if (op_data3->l2_miss == 0) {
/* Erratum #1293 */
if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
- !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
- data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
- return;
- }
+ !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
+ return L(L2) | LN(L2);
}
/*
@@ -744,82 +770,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
if (data_src->mem_op != PERF_MEM_OP_LOAD)
goto check_mab;
- /* L3 Hit */
if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
- return;
- }
- } else {
- if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
- PERF_MEM_LVL_HIT;
- return;
- }
- }
+ u64 val = g_zen4_data_src[ibs_data_src];
- /* A peer cache in a near CCX */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (!val)
+ goto check_mab;
- /* A peer cache in a far CCX */
- if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- } else {
- if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
- }
- }
- /* DRAM */
- if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
- if (op_data2->rmt_node == 0)
- data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
- else
- data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
- return;
- }
+ return val;
+ } else {
+ u64 val = g_data_src[ibs_data_src];
- /* PMEM */
- if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
- }
+ if (!val)
+ goto check_mab;
- /* Extension Memory */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- return;
- }
- /* IO */
- if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_IO;
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
+ return val;
}
check_mab:
@@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
* DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
* MAB only when IBS fails to provide DataSrc.
*/
- if (op_data3->dc_miss_no_mab_alloc) {
- data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss_no_mab_alloc)
+ return L(LFB) | LN(LFB);
data_src->mem_lvl = PERF_MEM_LVL_NA;
+ return 0;
}
static bool perf_ibs_cache_hit_st_valid(void)
@@ -925,7 +904,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
union ibs_op_data2 *op_data2,
union ibs_op_data3 *op_data3)
{
- perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
perf_ibs_get_mem_snoop(op_data2, data);
perf_ibs_get_tlb_lvl(op_data3, data);
perf_ibs_get_mem_lock(op_data3, data);
--
2.39.2
Hi Ravi,
On Mon, Mar 27, 2023 at 6:09 AM Ravi Bangoria <[email protected]> wrote:
>
> From: Namhyung Kim <[email protected]>
>
> Kernel IBS driver wasn't using new PERF_MEM_* APIs due to few of its
> limitations. Mainly:
>
> - mem_lvl_num doesn't allow setting multiple sources whereas old API
> allows it. Setting multiple data sources is useful because IBS on
> pre-zen4 uarch doesn't provide fine granular DataSrc details (there
> is only one such DataSrc(2h) though).
> - perf mem sorting logic (sort__lvl_cmp()) ignores mem_lvl_num. perf
> c2c (c2c_decode_stats()) does not use mem_lvl_num at all. perf mem
> prints mem_lvl and mem_lvl_num both if both are set, which is ugly.
>
> 1st one can be handled by using ANY_CACHE with HOPS_0. 2nd one is
> purely perf tool specific issue and should be fixed separately.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> Signed-off-by: Ravi Bangoria <[email protected]>
> ---
> v1: https://lore.kernel.org/r/[email protected]
> v1->v2:
> - In addition to setting new API fields, convert all individual field
> assignments to compile time wrapper macros built using PERF_MEM_S().
> Also convert DataSrc conditional code to array lookups.
>
> arch/x86/events/amd/ibs.c | 155 +++++++++++++++++---------------------
> 1 file changed, 68 insertions(+), 87 deletions(-)
>
> diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
> index 64582954b5f6..b46e0b725fe5 100644
> --- a/arch/x86/events/amd/ibs.c
> +++ b/arch/x86/events/amd/ibs.c
> @@ -703,9 +703,41 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
> return op_data2->data_src_lo;
> }
>
> -static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> - union ibs_op_data3 *op_data3,
> - struct perf_sample_data *data)
> +#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
> +#define LN(x) PERF_MEM_S(LVLNUM, x)
> +#define REM PERF_MEM_S(REMOTE, REMOTE)
> +#define HOPS(x) PERF_MEM_S(HOPS, x)
> +
> +static u64 g_data_src[8] = {
> + [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
> + [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
> +};
> +
> +#define RMT_NODE_BITS ((1 << IBS_DATA_SRC_DRAM) | \
> + (1 << IBS_DATA_SRC_IO))
> +#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static u64 g_zen4_data_src[32] = {
> + [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
> + [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
> + [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
> + [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
> + [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
> +};
> +
> +#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
> + (1 << IBS_DATA_SRC_EXT_PMEM) | \
> + (1 << IBS_DATA_SRC_EXT_IO) |\
> + (1 << IBS_DATA_SRC_EXT_EXT_MEM))
> +#define ZEN4_RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> + union ibs_op_data3 *op_data3,
> + struct perf_sample_data *data)
> {
> union perf_mem_data_src *data_src = &data->data_src;
> u8 ibs_data_src = perf_ibs_data_src(op_data2);
> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
> * memory accesses. So, check DcUcMemAcc bit early.
> */
> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
> + return L(UNC);
Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
>
> /* L1 Hit */
> - if (op_data3->dc_miss == 0) {
> - data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss == 0)
> + return L(L1) | LN(L1);
>
> /* L2 Hit */
> if (op_data3->l2_miss == 0) {
> /* Erratum #1293 */
> if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
> - !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
> - data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
> + return L(L2) | LN(L2);
> }
>
> /*
> @@ -744,82 +770,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> if (data_src->mem_op != PERF_MEM_OP_LOAD)
> goto check_mab;
>
> - /* L3 Hit */
> if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
> - PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
> + u64 val = g_zen4_data_src[ibs_data_src];
>
> - /* A peer cache in a near CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* A peer cache in a far CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
>
> - /* DRAM */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
> - if (op_data2->rmt_node == 0)
> - data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
> - else
> - data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + return val;
> + } else {
> + u64 val = g_data_src[ibs_data_src];
>
> - /* PMEM */
> - if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* Extension Memory */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - return;
> - }
>
> - /* IO */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_IO;
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> + return val;
> }
>
> check_mab:
> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
> * MAB only when IBS fails to provide DataSrc.
> */
> - if (op_data3->dc_miss_no_mab_alloc) {
> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss_no_mab_alloc)
> + return L(LFB) | LN(LFB);
>
> data_src->mem_lvl = PERF_MEM_LVL_NA;
> + return 0;
Wouldn't it be 'return L(NA) | LN(NA);' ?
Thanks,
Namhyung
> }
>
> static bool perf_ibs_cache_hit_st_valid(void)
> @@ -925,7 +904,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
> union ibs_op_data2 *op_data2,
> union ibs_op_data3 *op_data3)
> {
> - perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> + union perf_mem_data_src *data_src = &data->data_src;
> +
> + data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> perf_ibs_get_mem_snoop(op_data2, data);
> perf_ibs_get_tlb_lvl(op_data3, data);
> perf_ibs_get_mem_lock(op_data3, data);
> --
> 2.39.2
>
Hi Namhyung,
>> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
>> * memory accesses. So, check DcUcMemAcc bit early.
>> */
>> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
>> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
>> - return;
>> - }
>> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
>> + return L(UNC);
>
> Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
Right. Is it worth to introduce one?
On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
when DataSrc=7 (I/O). So, I need to respin this patch with that change.
[...]
>> check_mab:
>> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
>> * MAB only when IBS fails to provide DataSrc.
>> */
>> - if (op_data3->dc_miss_no_mab_alloc) {
>> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
>> - return;
>> - }
>> + if (op_data3->dc_miss_no_mab_alloc)
>> + return L(LFB) | LN(LFB);
>>
>> data_src->mem_lvl = PERF_MEM_LVL_NA;
>> + return 0;
>
> Wouldn't it be 'return L(NA) | LN(NA);' ?
IBS has no instruction type filtering, i.e. it tags whatever instruction it
sees at overflow. When IBS tags non-load/store instruction, data_src->val is
set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
info, perf mem output becomes funny:
$ sudo ./perf mem report -F sample,mem --stdio
# Samples Memory access
# ............ .......................................
#
1914 N/A <====== Non-LS
905 L1 or L1 hit
19 L3 or L3 hit
16 L2 or L2 hit
6 N/A or N/A hit <====== LS with no DataSrc info
6 Local RAM or RAM hit
4 Remote node, same socket RAM hit
3 Remote core, same node Any cache hit
2 Remote node, same socket Any cache hit
Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
perf tools shows it as "N/A hit".
So, until tool code gets refactored, setting mem_lvl = NA here is hiding
tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
part of this patchset.
Thanks,
Ravi
On Wed, Mar 29, 2023 at 2:45 AM Ravi Bangoria <[email protected]> wrote:
>
> Hi Namhyung,
>
> >> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> >> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
> >> * memory accesses. So, check DcUcMemAcc bit early.
> >> */
> >> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
> >> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
> >> - return;
> >> - }
> >> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
> >> + return L(UNC);
> >
> > Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
>
> Right. Is it worth to introduce one?
I think MEM_LVLNUM should express every memory level in MEM_LVL.
>
> On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
> when DataSrc=7 (I/O). So, I need to respin this patch with that change.
>
> [...]
>
> >> check_mab:
> >> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> >> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
> >> * MAB only when IBS fails to provide DataSrc.
> >> */
> >> - if (op_data3->dc_miss_no_mab_alloc) {
> >> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
> >> - return;
> >> - }
> >> + if (op_data3->dc_miss_no_mab_alloc)
> >> + return L(LFB) | LN(LFB);
> >>
> >> data_src->mem_lvl = PERF_MEM_LVL_NA;
> >> + return 0;
> >
> > Wouldn't it be 'return L(NA) | LN(NA);' ?
>
> IBS has no instruction type filtering, i.e. it tags whatever instruction it
> sees at overflow. When IBS tags non-load/store instruction, data_src->val is
> set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
> that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
> info, perf mem output becomes funny:
Probably worth changing PERF_MEM_NA.
>
> $ sudo ./perf mem report -F sample,mem --stdio
> # Samples Memory access
> # ............ .......................................
> #
> 1914 N/A <====== Non-LS
> 905 L1 or L1 hit
> 19 L3 or L3 hit
> 16 L2 or L2 hit
> 6 N/A or N/A hit <====== LS with no DataSrc info
> 6 Local RAM or RAM hit
> 4 Remote node, same socket RAM hit
> 3 Remote core, same node Any cache hit
> 2 Remote node, same socket Any cache hit
Maybe that's better to differentiate them :)
>
> Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
> perf tools shows it as "N/A hit".
>
> So, until tool code gets refactored, setting mem_lvl = NA here is hiding
> tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
> part of this patchset.
I think we can change the tool independently - preferring MEM_LVLNUM
if present but you might want to add an option to override.
Given IBS didn't set it so far, the output would remain mostly the same.
Thanks,
Namhyung
On 30-Mar-23 5:17 AM, Namhyung Kim wrote:
> On Wed, Mar 29, 2023 at 2:45 AM Ravi Bangoria <[email protected]> wrote:
>>
>> Hi Namhyung,
>>
>>>> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>>>> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
>>>> * memory accesses. So, check DcUcMemAcc bit early.
>>>> */
>>>> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
>>>> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
>>>> - return;
>>>> - }
>>>> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
>>>> + return L(UNC);
>>>
>>> Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
>>
>> Right. Is it worth to introduce one?
>
> I think MEM_LVLNUM should express every memory level in MEM_LVL.
Ok.
>
>>
>> On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
>> when DataSrc=7 (I/O). So, I need to respin this patch with that change.
>>
>> [...]
>>
>>>> check_mab:
>>>> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>>>> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
>>>> * MAB only when IBS fails to provide DataSrc.
>>>> */
>>>> - if (op_data3->dc_miss_no_mab_alloc) {
>>>> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
>>>> - return;
>>>> - }
>>>> + if (op_data3->dc_miss_no_mab_alloc)
>>>> + return L(LFB) | LN(LFB);
>>>>
>>>> data_src->mem_lvl = PERF_MEM_LVL_NA;
>>>> + return 0;
>>>
>>> Wouldn't it be 'return L(NA) | LN(NA);' ?
>>
>> IBS has no instruction type filtering, i.e. it tags whatever instruction it
>> sees at overflow. When IBS tags non-load/store instruction, data_src->val is
>> set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
>> that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
>> info, perf mem output becomes funny:
>
> Probably worth changing PERF_MEM_NA.
Yeah seems so.
>
>>
>> $ sudo ./perf mem report -F sample,mem --stdio
>> # Samples Memory access
>> # ............ .......................................
>> #
>> 1914 N/A <====== Non-LS
>> 905 L1 or L1 hit
>> 19 L3 or L3 hit
>> 16 L2 or L2 hit
>> 6 N/A or N/A hit <====== LS with no DataSrc info
>> 6 Local RAM or RAM hit
>> 4 Remote node, same socket RAM hit
>> 3 Remote core, same node Any cache hit
>> 2 Remote node, same socket Any cache hit
>
> Maybe that's better to differentiate them :)
:). I would rather add instruction type column and print non-LS, L, S or LS.
>
>>
>> Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
>> perf tools shows it as "N/A hit".
>>
>> So, until tool code gets refactored, setting mem_lvl = NA here is hiding
>> tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
>> part of this patchset.
>
> I think we can change the tool independently - preferring MEM_LVLNUM
> if present but you might want to add an option to override.
> Given IBS didn't set it so far, the output would remain mostly the same.
Sure.
Thanks,
Ravi