The mrvl_ddr_pmu is uncore and has a hexadecimal address
suffix. Current PMU sorting/merging code assumes uncore PMU names
start with uncore_ and have a decimal suffix. Add support for
hexadecimal suffixes and add tests.
v6. Add necessary fix in pmu.c to perf_pmu__match_ignoring_suffix that
also needs to ignore hex suffixes.
v5. In pmus.h remove needless addition of #include list.h but add
stddef.h for size_t.
v4. Workaround GCC build error by using unsigned types. Don't consider
short hex suffixes as suffixes (e.g. cpum_cf) and test this
behavior.
v3. Rebase and move tests from pmus.c to the existing pmu.c.
Ian Rogers (2):
perf pmus: Sort/merge/aggregate PMUs like mrvl_ddr_pmu
perf tests: Add some pmu core functionality tests
tools/perf/tests/pmu.c | 99 ++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/pmu.c | 33 ++++++++------
tools/perf/util/pmus.c | 67 ++++++++++++++++------------
tools/perf/util/pmus.h | 7 ++-
4 files changed, 164 insertions(+), 42 deletions(-)
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Test behavior of PMU names and comparisons wrt suffixes using Intel
uncore_cha, marvell mrvl_ddr_pmu and S390's cpum_cf as examples.
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/tests/pmu.c | 99 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 99 insertions(+)
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index 06cc0e46cb28..cc88b5920c3e 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -3,6 +3,7 @@
#include "evsel.h"
#include "parse-events.h"
#include "pmu.h"
+#include "pmus.h"
#include "tests.h"
#include "debug.h"
#include "fncache.h"
@@ -340,10 +341,108 @@ static int test__pmu_event_names(struct test_suite *test __maybe_unused,
return ret;
}
+static const char * const uncore_chas[] = {
+ "uncore_cha_0",
+ "uncore_cha_1",
+ "uncore_cha_2",
+ "uncore_cha_3",
+ "uncore_cha_4",
+ "uncore_cha_5",
+ "uncore_cha_6",
+ "uncore_cha_7",
+ "uncore_cha_8",
+ "uncore_cha_9",
+ "uncore_cha_10",
+ "uncore_cha_11",
+ "uncore_cha_12",
+ "uncore_cha_13",
+ "uncore_cha_14",
+ "uncore_cha_15",
+ "uncore_cha_16",
+ "uncore_cha_17",
+ "uncore_cha_18",
+ "uncore_cha_19",
+ "uncore_cha_20",
+ "uncore_cha_21",
+ "uncore_cha_22",
+ "uncore_cha_23",
+ "uncore_cha_24",
+ "uncore_cha_25",
+ "uncore_cha_26",
+ "uncore_cha_27",
+ "uncore_cha_28",
+ "uncore_cha_29",
+ "uncore_cha_30",
+ "uncore_cha_31",
+};
+
+static const char * const mrvl_ddrs[] = {
+ "mrvl_ddr_pmu_87e1b0000000",
+ "mrvl_ddr_pmu_87e1b1000000",
+ "mrvl_ddr_pmu_87e1b2000000",
+ "mrvl_ddr_pmu_87e1b3000000",
+ "mrvl_ddr_pmu_87e1b4000000",
+ "mrvl_ddr_pmu_87e1b5000000",
+ "mrvl_ddr_pmu_87e1b6000000",
+ "mrvl_ddr_pmu_87e1b7000000",
+ "mrvl_ddr_pmu_87e1b8000000",
+ "mrvl_ddr_pmu_87e1b9000000",
+ "mrvl_ddr_pmu_87e1ba000000",
+ "mrvl_ddr_pmu_87e1bb000000",
+ "mrvl_ddr_pmu_87e1bc000000",
+ "mrvl_ddr_pmu_87e1bd000000",
+ "mrvl_ddr_pmu_87e1be000000",
+ "mrvl_ddr_pmu_87e1bf000000",
+};
+
+static int test__name_len(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
+{
+ TEST_ASSERT_VAL("cpu", pmu_name_len_no_suffix("cpu") == strlen("cpu"));
+ TEST_ASSERT_VAL("i915", pmu_name_len_no_suffix("i915") == strlen("i915"));
+ TEST_ASSERT_VAL("cpum_cf", pmu_name_len_no_suffix("cpum_cf") == strlen("cpum_cf"));
+ for (size_t i = 0; i < ARRAY_SIZE(uncore_chas); i++) {
+ TEST_ASSERT_VAL("Strips uncore_cha suffix",
+ pmu_name_len_no_suffix(uncore_chas[i]) ==
+ strlen("uncore_cha"));
+ }
+ for (size_t i = 0; i < ARRAY_SIZE(mrvl_ddrs); i++) {
+ TEST_ASSERT_VAL("Strips mrvl_ddr_pmu suffix",
+ pmu_name_len_no_suffix(mrvl_ddrs[i]) ==
+ strlen("mrvl_ddr_pmu"));
+ }
+ return TEST_OK;
+}
+
+static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
+{
+ TEST_ASSERT_EQUAL("cpu", pmu_name_cmp("cpu", "cpu"), 0);
+ TEST_ASSERT_EQUAL("i915", pmu_name_cmp("i915", "i915"), 0);
+ TEST_ASSERT_EQUAL("cpum_cf", pmu_name_cmp("cpum_cf", "cpum_cf"), 0);
+ TEST_ASSERT_VAL("i915", pmu_name_cmp("cpu", "i915") < 0);
+ TEST_ASSERT_VAL("i915", pmu_name_cmp("i915", "cpu") > 0);
+ TEST_ASSERT_VAL("cpum_cf", pmu_name_cmp("cpum_cf", "cpum_ce") > 0);
+ TEST_ASSERT_VAL("cpum_cf", pmu_name_cmp("cpum_cf", "cpum_d0") < 0);
+ for (size_t i = 1; i < ARRAY_SIZE(uncore_chas); i++) {
+ TEST_ASSERT_VAL("uncore_cha suffixes ordered lt",
+ pmu_name_cmp(uncore_chas[i-1], uncore_chas[i]) < 0);
+ TEST_ASSERT_VAL("uncore_cha suffixes ordered gt",
+ pmu_name_cmp(uncore_chas[i], uncore_chas[i-1]) > 0);
+ }
+ for (size_t i = 1; i < ARRAY_SIZE(mrvl_ddrs); i++) {
+ TEST_ASSERT_VAL("mrvl_ddr_pmu suffixes ordered lt",
+ pmu_name_cmp(mrvl_ddrs[i-1], mrvl_ddrs[i]) < 0);
+ TEST_ASSERT_VAL("mrvl_ddr_pmu suffixes ordered gt",
+ pmu_name_cmp(mrvl_ddrs[i], mrvl_ddrs[i-1]) > 0);
+ }
+ return TEST_OK;
+}
+
static struct test_case tests__pmu[] = {
TEST_CASE("Parsing with PMU format directory", pmu_format),
TEST_CASE("Parsing with PMU event", pmu_events),
TEST_CASE("PMU event names", pmu_event_names),
+ TEST_CASE("PMU name combining", name_len),
+ TEST_CASE("PMU name comparison", name_cmp),
{ .name = NULL, }
};
--
2.45.0.rc1.225.g2a3ae87e7f-goog
The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
the previous PMU sorting/merging code assumes uncore PMU names start
with uncore_ and have a decimal suffix. Because of the previous
assumption it isn't possible to wildcard the mrvl_ddr_pmu.
Modify pmu_name_len_no_suffix but also remove the suffix number out
argument, this is because we don't know if a suffix number of say 100
is in hexadecimal or decimal. As the only use of the suffix number is
in comparisons, it is safe there to compare the values as hexadecimal.
Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
are ignored.
Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
more) so that S390's cpum_cf PMU doesn't lose its suffix.
Change the return type of pmu_name_len_no_suffix to size_t to
workaround GCC incorrectly determining the result could be negative.
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/util/pmu.c | 33 +++++++++++++--------
tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
tools/perf/util/pmus.h | 7 ++++-
3 files changed, 65 insertions(+), 42 deletions(-)
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 888ce9912275..c94a91645b21 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -856,26 +856,34 @@ __weak const struct pmu_metrics_table *pmu_metrics_table__find(void)
*/
static bool perf_pmu__match_ignoring_suffix(const char *pmu_name, const char *tok)
{
- const char *p;
+ const char *p, *suffix;
+ bool has_hex = false;
if (strncmp(pmu_name, tok, strlen(tok)))
return false;
- p = pmu_name + strlen(tok);
+ suffix = p = pmu_name + strlen(tok);
if (*p == 0)
return true;
- if (*p == '_')
+ if (*p == '_') {
++p;
+ ++suffix;
+ }
/* Ensure we end in a number */
while (1) {
- if (!isdigit(*p))
+ if (!isxdigit(*p))
return false;
+ if (!has_hex)
+ has_hex = !isdigit(*p);
if (*(++p) == 0)
break;
}
+ if (has_hex)
+ return (p - suffix) > 2;
+
return true;
}
@@ -1788,10 +1796,10 @@ static char *format_alias(char *buf, int len, const struct perf_pmu *pmu,
const struct perf_pmu_alias *alias, bool skip_duplicate_pmus)
{
struct parse_events_term *term;
- int pmu_name_len = skip_duplicate_pmus
- ? pmu_name_len_no_suffix(pmu->name, /*num=*/NULL)
- : (int)strlen(pmu->name);
- int used = snprintf(buf, len, "%.*s/%s", pmu_name_len, pmu->name, alias->name);
+ size_t pmu_name_len = skip_duplicate_pmus
+ ? pmu_name_len_no_suffix(pmu->name)
+ : strlen(pmu->name);
+ int used = snprintf(buf, len, "%.*s/%s", (int)pmu_name_len, pmu->name, alias->name);
list_for_each_entry(term, &alias->terms.terms, list) {
if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR)
@@ -1828,13 +1836,12 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
pmu_aliases_parse(pmu);
pmu_add_cpu_aliases(pmu);
list_for_each_entry(event, &pmu->aliases, list) {
- size_t buf_used;
- int pmu_name_len;
+ size_t buf_used, pmu_name_len;
info.pmu_name = event->pmu_name ?: pmu->name;
pmu_name_len = skip_duplicate_pmus
- ? pmu_name_len_no_suffix(info.pmu_name, /*num=*/NULL)
- : (int)strlen(info.pmu_name);
+ ? pmu_name_len_no_suffix(info.pmu_name)
+ : strlen(info.pmu_name);
info.alias = NULL;
if (event->desc) {
info.name = event->name;
@@ -1859,7 +1866,7 @@ int perf_pmu__for_each_event(struct perf_pmu *pmu, bool skip_duplicate_pmus,
info.encoding_desc = buf + buf_used;
parse_events_terms__to_strbuf(&event->terms, &sb);
buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used,
- "%.*s/%s/", pmu_name_len, info.pmu_name, sb.buf) + 1;
+ "%.*s/%s/", (int)pmu_name_len, info.pmu_name, sb.buf) + 1;
info.topic = event->topic;
info.str = sb.buf;
info.deprecated = event->deprecated;
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
index b9b4c5eb5002..63b9cf9ccfa7 100644
--- a/tools/perf/util/pmus.c
+++ b/tools/perf/util/pmus.c
@@ -40,31 +40,52 @@ static bool read_sysfs_all_pmus;
static void pmu_read_sysfs(bool core_only);
-int pmu_name_len_no_suffix(const char *str, unsigned long *num)
+size_t pmu_name_len_no_suffix(const char *str)
{
int orig_len, len;
+ bool has_hex_digits = false;
orig_len = len = strlen(str);
- /* Non-uncore PMUs have their full length, for example, i915. */
- if (!strstarts(str, "uncore_"))
- return len;
-
- /*
- * Count trailing digits and '_', if '_{num}' suffix isn't present use
- * the full length.
- */
- while (len > 0 && isdigit(str[len - 1]))
+ /* Count trailing digits. */
+ while (len > 0 && isxdigit(str[len - 1])) {
+ if (!isdigit(str[len - 1]))
+ has_hex_digits = true;
len--;
+ }
if (len > 0 && len != orig_len && str[len - 1] == '_') {
- if (num)
- *num = strtoul(&str[len], NULL, 10);
- return len - 1;
+ /*
+ * There is a '_{num}' suffix. For decimal suffixes any length
+ * will do, for hexadecimal ensure more than 2 hex digits so
+ * that S390's cpum_cf PMU doesn't match.
+ */
+ if (!has_hex_digits || (orig_len - len) > 2)
+ return len - 1;
}
+ /* Use the full length. */
return orig_len;
}
+int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name)
+{
+ unsigned long lhs_num = 0, rhs_num = 0;
+ size_t lhs_pmu_name_len = pmu_name_len_no_suffix(lhs_pmu_name);
+ size_t rhs_pmu_name_len = pmu_name_len_no_suffix(rhs_pmu_name);
+ int ret = strncmp(lhs_pmu_name, rhs_pmu_name,
+ lhs_pmu_name_len < rhs_pmu_name_len ? lhs_pmu_name_len : rhs_pmu_name_len);
+
+ if (lhs_pmu_name_len != rhs_pmu_name_len || ret != 0 || lhs_pmu_name_len == 0)
+ return ret;
+
+ if (lhs_pmu_name_len + 1 < strlen(lhs_pmu_name))
+ lhs_num = strtoul(&lhs_pmu_name[lhs_pmu_name_len + 1], NULL, 16);
+ if (rhs_pmu_name_len + 1 < strlen(rhs_pmu_name))
+ rhs_num = strtoul(&rhs_pmu_name[rhs_pmu_name_len + 1], NULL, 16);
+
+ return lhs_num < rhs_num ? -1 : (lhs_num > rhs_num ? 1 : 0);
+}
+
void perf_pmus__destroy(void)
{
struct perf_pmu *pmu, *tmp;
@@ -167,20 +188,10 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name)
static int pmus_cmp(void *priv __maybe_unused,
const struct list_head *lhs, const struct list_head *rhs)
{
- unsigned long lhs_num = 0, rhs_num = 0;
struct perf_pmu *lhs_pmu = container_of(lhs, struct perf_pmu, list);
struct perf_pmu *rhs_pmu = container_of(rhs, struct perf_pmu, list);
- const char *lhs_pmu_name = lhs_pmu->name ?: "";
- const char *rhs_pmu_name = rhs_pmu->name ?: "";
- int lhs_pmu_name_len = pmu_name_len_no_suffix(lhs_pmu_name, &lhs_num);
- int rhs_pmu_name_len = pmu_name_len_no_suffix(rhs_pmu_name, &rhs_num);
- int ret = strncmp(lhs_pmu_name, rhs_pmu_name,
- lhs_pmu_name_len < rhs_pmu_name_len ? lhs_pmu_name_len : rhs_pmu_name_len);
-
- if (lhs_pmu_name_len != rhs_pmu_name_len || ret != 0 || lhs_pmu_name_len == 0)
- return ret;
- return lhs_num < rhs_num ? -1 : (lhs_num > rhs_num ? 1 : 0);
+ return pmu_name_cmp(lhs_pmu->name ?: "", rhs_pmu->name ?: "");
}
/* Add all pmus in sysfs to pmu list: */
@@ -300,11 +311,11 @@ static struct perf_pmu *perf_pmus__scan_skip_duplicates(struct perf_pmu *pmu)
pmu_read_sysfs(/*core_only=*/false);
pmu = list_prepare_entry(pmu, &core_pmus, list);
} else
- last_pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "", NULL);
+ last_pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "");
if (use_core_pmus) {
list_for_each_entry_continue(pmu, &core_pmus, list) {
- int pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "", /*num=*/NULL);
+ int pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "");
if (last_pmu_name_len == pmu_name_len &&
!strncmp(last_pmu_name, pmu->name ?: "", pmu_name_len))
@@ -316,7 +327,7 @@ static struct perf_pmu *perf_pmus__scan_skip_duplicates(struct perf_pmu *pmu)
pmu = list_prepare_entry(pmu, &other_pmus, list);
}
list_for_each_entry_continue(pmu, &other_pmus, list) {
- int pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "", /*num=*/NULL);
+ int pmu_name_len = pmu_name_len_no_suffix(pmu->name ?: "");
if (last_pmu_name_len == pmu_name_len &&
!strncmp(last_pmu_name, pmu->name ?: "", pmu_name_len))
@@ -566,7 +577,7 @@ void perf_pmus__print_raw_pmu_events(const struct print_callbacks *print_cb, voi
.long_string = STRBUF_INIT,
.num_formats = 0,
};
- int len = pmu_name_len_no_suffix(pmu->name, /*num=*/NULL);
+ int len = pmu_name_len_no_suffix(pmu->name);
const char *desc = "(see 'man perf-list' or 'man perf-record' on how to encode it)";
if (!pmu->is_core)
diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h
index 9d4ded80b8e9..bdbff02324bb 100644
--- a/tools/perf/util/pmus.h
+++ b/tools/perf/util/pmus.h
@@ -2,10 +2,15 @@
#ifndef __PMUS_H
#define __PMUS_H
+#include <stdbool.h>
+#include <stddef.h>
+
struct perf_pmu;
struct print_callbacks;
-int pmu_name_len_no_suffix(const char *str, unsigned long *num);
+size_t pmu_name_len_no_suffix(const char *str);
+/* Exposed for testing only. */
+int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name);
void perf_pmus__destroy(void);
--
2.45.0.rc1.225.g2a3ae87e7f-goog
On Tue, May 14, 2024 at 11:01 PM Ian Rogers <[email protected]> wrote:
>
> The mrvl_ddr_pmu is uncore and has a hexadecimal address
> suffix. Current PMU sorting/merging code assumes uncore PMU names
> start with uncore_ and have a decimal suffix. Add support for
> hexadecimal suffixes and add tests.
>
> v6. Add necessary fix in pmu.c to perf_pmu__match_ignoring_suffix that
> also needs to ignore hex suffixes.
> v5. In pmus.h remove needless addition of #include list.h but add
> stddef.h for size_t.
> v4. Workaround GCC build error by using unsigned types. Don't consider
> short hex suffixes as suffixes (e.g. cpum_cf) and test this
> behavior.
> v3. Rebase and move tests from pmus.c to the existing pmu.c.
>
> Ian Rogers (2):
> perf pmus: Sort/merge/aggregate PMUs like mrvl_ddr_pmu
> perf tests: Add some pmu core functionality tests
Acked-by: Namhyung Kim <[email protected]>
Thanks,
Namhyung
>
> tools/perf/tests/pmu.c | 99 ++++++++++++++++++++++++++++++++++++++++++
> tools/perf/util/pmu.c | 33 ++++++++------
> tools/perf/util/pmus.c | 67 ++++++++++++++++------------
> tools/perf/util/pmus.h | 7 ++-
> 4 files changed, 164 insertions(+), 42 deletions(-)
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
On Tue, 14 May 2024 23:01:12 -0700, Ian Rogers wrote:
> The mrvl_ddr_pmu is uncore and has a hexadecimal address
> suffix. Current PMU sorting/merging code assumes uncore PMU names
> start with uncore_ and have a decimal suffix. Add support for
> hexadecimal suffixes and add tests.
>
> v6. Add necessary fix in pmu.c to perf_pmu__match_ignoring_suffix that
> also needs to ignore hex suffixes.
> v5. In pmus.h remove needless addition of #include list.h but add
> stddef.h for size_t.
> v4. Workaround GCC build error by using unsigned types. Don't consider
> short hex suffixes as suffixes (e.g. cpum_cf) and test this
> behavior.
> v3. Rebase and move tests from pmus.c to the existing pmu.c.
>
> [...]
Applied to perf-tools-next, thanks!
Best regards,
--
Namhyung Kim <[email protected]>
On 15/05/2024 07:01, Ian Rogers wrote:
> The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
> the previous PMU sorting/merging code assumes uncore PMU names start
> with uncore_ and have a decimal suffix. Because of the previous
> assumption it isn't possible to wildcard the mrvl_ddr_pmu.
>
> Modify pmu_name_len_no_suffix but also remove the suffix number out
> argument, this is because we don't know if a suffix number of say 100
> is in hexadecimal or decimal. As the only use of the suffix number is
> in comparisons, it is safe there to compare the values as hexadecimal.
> Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
> are ignored.
>
> Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
> more) so that S390's cpum_cf PMU doesn't lose its suffix.
>
> Change the return type of pmu_name_len_no_suffix to size_t to
> workaround GCC incorrectly determining the result could be negative.
>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> tools/perf/util/pmu.c | 33 +++++++++++++--------
> tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
> tools/perf/util/pmus.h | 7 ++++-
> 3 files changed, 65 insertions(+), 42 deletions(-)
>
Hi Ian,
Perf test "perf_all_PMU_test" is failing when run against
next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
like it is failing when run on JUNO alone. Verified by running on other
boards like RB5 and Ampere_altra and confirming that it does not fail on
these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
be the reason of test failure.
Reverting the change (3241d46f5f54) seems to fix it.
This works fine on Linux version v6.10-rc3
Failure log
------------
110: perf all PMU test:
--- start ---
test child forked, pid 8279
Testing armv8_pmuv3/br_immed_retired/
Event 'armv8_pmuv3/br_immed_retired/' not printed in:
# Running 'internals/synthesize' benchmark:
Computing performance of single threaded perf event synthesis by
synthesizing events on the perf process itself:
Average synthesis took: 1169.431 usec (+- 0.144 usec)
Average num. events: 35.000 (+- 0.000)
Average time per event 33.412 usec
Average data synthesis took: 1225.698 usec (+- 0.102 usec)
Average num. events: 119.000 (+- 0.000)
Average time per event 10.300 usec
Performance counter stats for 'perf bench internals synthesize':
3263664785 armv8_pmuv3_0/br_immed_retired/
25.472854464 seconds time elapsed
8.004791000 seconds user
17.060209000 seconds sys
---- end(-1) ----
110: perf all PMU test :
FAILED!
Thanks,
Aishwarya
On 12/06/2024 12:19, Aishwarya TCV wrote:
>
>
> On 15/05/2024 07:01, Ian Rogers wrote:
>> The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
>> the previous PMU sorting/merging code assumes uncore PMU names start
>> with uncore_ and have a decimal suffix. Because of the previous
>> assumption it isn't possible to wildcard the mrvl_ddr_pmu.
>>
>> Modify pmu_name_len_no_suffix but also remove the suffix number out
>> argument, this is because we don't know if a suffix number of say 100
>> is in hexadecimal or decimal. As the only use of the suffix number is
>> in comparisons, it is safe there to compare the values as hexadecimal.
>> Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
>> are ignored.
>>
>> Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
>> more) so that S390's cpum_cf PMU doesn't lose its suffix.
>>
>> Change the return type of pmu_name_len_no_suffix to size_t to
>> workaround GCC incorrectly determining the result could be negative.
>>
>> Signed-off-by: Ian Rogers <[email protected]>
>> ---
>> tools/perf/util/pmu.c | 33 +++++++++++++--------
>> tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
>> tools/perf/util/pmus.h | 7 ++++-
>> 3 files changed, 65 insertions(+), 42 deletions(-)
>>
>
> Hi Ian,
>
> Perf test "perf_all_PMU_test" is failing when run against
> next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
> like it is failing when run on JUNO alone. Verified by running on other
> boards like RB5 and Ampere_altra and confirming that it does not fail on
> these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
> be the reason of test failure.
>
> Reverting the change (3241d46f5f54) seems to fix it.
>
> This works fine on Linux version v6.10-rc3
>
> Failure log
> ------------
> 110: perf all PMU test:
> --- start ---
> test child forked, pid 8279
> Testing armv8_pmuv3/br_immed_retired/
> Event 'armv8_pmuv3/br_immed_retired/' not printed in:
> # Running 'internals/synthesize' benchmark:
> Computing performance of single threaded perf event synthesis by
> synthesizing events on the perf process itself:
> Average synthesis took: 1169.431 usec (+- 0.144 usec)
> Average num. events: 35.000 (+- 0.000)
> Average time per event 33.412 usec
> Average data synthesis took: 1225.698 usec (+- 0.102 usec)
> Average num. events: 119.000 (+- 0.000)
> Average time per event 10.300 usec
>
> Performance counter stats for 'perf bench internals synthesize':
>
> 3263664785 armv8_pmuv3_0/br_immed_retired/
>
>
> 25.472854464 seconds time elapsed
>
> 8.004791000 seconds user
> 17.060209000 seconds sys
> ---- end(-1) ----
> 110: perf all PMU test :
> FAILED!
>
> Thanks,
> Aishwarya
>
I can take this one, the test can probably be relaxed to not care about
the suffix. Or maybe change what we print in the stat output to match
the string that the event was opened with.
Not sure which way is best yet but it will probably become more obvious
after some digging.
James
On Wed, Jun 12, 2024 at 4:19 AM Aishwarya TCV <[email protected]> wrote:
>
>
>
> On 15/05/2024 07:01, Ian Rogers wrote:
> > The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
> > the previous PMU sorting/merging code assumes uncore PMU names start
> > with uncore_ and have a decimal suffix. Because of the previous
> > assumption it isn't possible to wildcard the mrvl_ddr_pmu.
> >
> > Modify pmu_name_len_no_suffix but also remove the suffix number out
> > argument, this is because we don't know if a suffix number of say 100
> > is in hexadecimal or decimal. As the only use of the suffix number is
> > in comparisons, it is safe there to compare the values as hexadecimal.
> > Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
> > are ignored.
> >
> > Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
> > more) so that S390's cpum_cf PMU doesn't lose its suffix.
> >
> > Change the return type of pmu_name_len_no_suffix to size_t to
> > workaround GCC incorrectly determining the result could be negative.
> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> > tools/perf/util/pmu.c | 33 +++++++++++++--------
> > tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
> > tools/perf/util/pmus.h | 7 ++++-
> > 3 files changed, 65 insertions(+), 42 deletions(-)
> >
>
> Hi Ian,
>
> Perf test "perf_all_PMU_test" is failing when run against
> next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
> like it is failing when run on JUNO alone. Verified by running on other
> boards like RB5 and Ampere_altra and confirming that it does not fail on
> these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
> be the reason of test failure.
>
> Reverting the change (3241d46f5f54) seems to fix it.
>
> This works fine on Linux version v6.10-rc3
>
> Failure log
> ------------
> 110: perf all PMU test:
> --- start ---
> test child forked, pid 8279
> Testing armv8_pmuv3/br_immed_retired/
> Event 'armv8_pmuv3/br_immed_retired/' not printed in:
> # Running 'internals/synthesize' benchmark:
> Computing performance of single threaded perf event synthesis by
> synthesizing events on the perf process itself:
> Average synthesis took: 1169.431 usec (+- 0.144 usec)
> Average num. events: 35.000 (+- 0.000)
> Average time per event 33.412 usec
> Average data synthesis took: 1225.698 usec (+- 0.102 usec)
> Average num. events: 119.000 (+- 0.000)
> Average time per event 10.300 usec
>
> Performance counter stats for 'perf bench internals synthesize':
>
> 3263664785 armv8_pmuv3_0/br_immed_retired/
>
>
> 25.472854464 seconds time elapsed
>
> 8.004791000 seconds user
> 17.060209000 seconds sys
> ---- end(-1) ----
> 110: perf all PMU test :
> FAILED!
Hi Aishwarya,
Thanks for reporting an issue. The test should be pretty self
explanatory: it is doing a `perf stat -e
armv8_pmuv3/br_immed_retired/` and then looking for that in the
output. The event armv8_pmuv3/br_immed_retired/ comes from running
perf list. As you can see in the output the event did work, so perf
stat is working so nothing is actually broken here. What isn't working
is the perf stat output matching the command line event and this is
because of the unnecessary suffix on ARM's PMU name.
We have a problem that ARM have buggy PMU drivers, either from
introducing new naming conventions or by just being broken:
https://lore.kernel.org/lkml/CAP-5=fWNDkOpnYF=5v1aQkVDrDWsmw+zYX1pjS8hoiYVgZsRGA@mail.gmail.com/
I've also asked that ARM step up their testing, for example in the
event parsing testing the PMU is hardcoded to the x86 PMU name of
'cpu':
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/parse-events.c?h=perf-tools-next#n2317
On a cortex A53, then PMU is named 'armv8_cortex_a53':
```
$ ls /sys/devices/armv8_cortex_a53/
caps cpus events format perf_event_mux_interval_ms power
subsystem type uevent
```
This name appears better, so what's up with ARM's core PMU name?
Anyway, I'm tempted to fix this by just skipping the test on ARM given
ARM's overall broken state.
Further, can we escalate matters? It isn't right that these ARM issues
are coming back to me. ARM aren't stepping up to fix not just the 2
issues above but:
1) the broken Apple M? issue (ARM asked I fix this as it regressed due
to my fixes for the Intel hybrid code):
https://lore.kernel.org/lkml/[email protected]/
2) opening events on BIG.little doesn't open the event on all core PMU types:
https://lore.kernel.org/lkml/[email protected]/
3) the broken 'cycles' event name in the arm_dsu PMU:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/perf/arm_dsu_pmu.c#n177
4) the ARM memory controller PMUs lacking cpumask causing events
opened on them to be opened on every CPU and thereby induce
multiplexing..
and so on..
Thanks,
Ian
> Thanks,
> Aishwarya
On 12/06/2024 13:32, Ian Rogers wrote:
> On Wed, Jun 12, 2024 at 4:19 AM Aishwarya TCV <[email protected]> wrote:
>>
>>
>>
>> On 15/05/2024 07:01, Ian Rogers wrote:
>>> The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
>>> the previous PMU sorting/merging code assumes uncore PMU names start
>>> with uncore_ and have a decimal suffix. Because of the previous
>>> assumption it isn't possible to wildcard the mrvl_ddr_pmu.
>>>
>>> Modify pmu_name_len_no_suffix but also remove the suffix number out
>>> argument, this is because we don't know if a suffix number of say 100
>>> is in hexadecimal or decimal. As the only use of the suffix number is
>>> in comparisons, it is safe there to compare the values as hexadecimal.
>>> Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
>>> are ignored.
>>>
>>> Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
>>> more) so that S390's cpum_cf PMU doesn't lose its suffix.
>>>
>>> Change the return type of pmu_name_len_no_suffix to size_t to
>>> workaround GCC incorrectly determining the result could be negative.
>>>
>>> Signed-off-by: Ian Rogers <[email protected]>
>>> ---
>>> tools/perf/util/pmu.c | 33 +++++++++++++--------
>>> tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
>>> tools/perf/util/pmus.h | 7 ++++-
>>> 3 files changed, 65 insertions(+), 42 deletions(-)
>>>
>>
>> Hi Ian,
>>
>> Perf test "perf_all_PMU_test" is failing when run against
>> next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
>> like it is failing when run on JUNO alone. Verified by running on other
>> boards like RB5 and Ampere_altra and confirming that it does not fail on
>> these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
>> be the reason of test failure.
>>
>> Reverting the change (3241d46f5f54) seems to fix it.
>>
>> This works fine on Linux version v6.10-rc3
>>
>> Failure log
>> ------------
>> 110: perf all PMU test:
>> --- start ---
>> test child forked, pid 8279
>> Testing armv8_pmuv3/br_immed_retired/
>> Event 'armv8_pmuv3/br_immed_retired/' not printed in:
>> # Running 'internals/synthesize' benchmark:
>> Computing performance of single threaded perf event synthesis by
>> synthesizing events on the perf process itself:
>> Average synthesis took: 1169.431 usec (+- 0.144 usec)
>> Average num. events: 35.000 (+- 0.000)
>> Average time per event 33.412 usec
>> Average data synthesis took: 1225.698 usec (+- 0.102 usec)
>> Average num. events: 119.000 (+- 0.000)
>> Average time per event 10.300 usec
>>
>> Performance counter stats for 'perf bench internals synthesize':
>>
>> 3263664785 armv8_pmuv3_0/br_immed_retired/
>>
>>
>> 25.472854464 seconds time elapsed
>>
>> 8.004791000 seconds user
>> 17.060209000 seconds sys
>> ---- end(-1) ----
>> 110: perf all PMU test :
>> FAILED!
>
> Hi Aishwarya,
>
> Thanks for reporting an issue. The test should be pretty self
> explanatory: it is doing a `perf stat -e
> armv8_pmuv3/br_immed_retired/` and then looking for that in the
> output. The event armv8_pmuv3/br_immed_retired/ comes from running
> perf list. As you can see in the output the event did work, so perf
> stat is working so nothing is actually broken here. What isn't working
> is the perf stat output matching the command line event and this is
> because of the unnecessary suffix on ARM's PMU name.
>
> We have a problem that ARM have buggy PMU drivers, either from
> introducing new naming conventions or by just being broken:
> https://lore.kernel.org/lkml/CAP-5=fWNDkOpnYF=5v1aQkVDrDWsmw+zYX1pjS8hoiYVgZsRGA@mail.gmail.com/
> I've also asked that ARM step up their testing, for example in the
> event parsing testing the PMU is hardcoded to the x86 PMU name of
> 'cpu':
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/parse-events.c?h=perf-tools-next#n2317
> On a cortex A53, then PMU is named 'armv8_cortex_a53':
> ```
> $ ls /sys/devices/armv8_cortex_a53/
> caps cpus events format perf_event_mux_interval_ms power
> subsystem type uevent
> ```
> This name appears better, so what's up with ARM's core PMU name?
> Anyway, I'm tempted to fix this by just skipping the test on ARM given
> ARM's overall broken state.
>
No need to skip the test, I'll take this one [1]. I think we're on the
same page that it's probably a test or cosmetic issue rather than
anything being broken in Perf.
[1]:
https://lore.kernel.org/linux-perf-users/[email protected]/
> Further, can we escalate matters? It isn't right that these ARM issues
> are coming back to me. ARM aren't stepping up to fix not just the 2
I wouldn't read into it too much, I think this is just a courtesy email
pointing out that we're aware of an issue and tracked it back to a
particular commit. I don't think there is any expectation that it has to
be you that makes the fix and I'm happy to look into it.
> issues above but:
> 1) the broken Apple M? issue (ARM asked I fix this as it regressed due
> to my fixes for the Intel hybrid code):
> https://lore.kernel.org/lkml/[email protected]/
> 2) opening events on BIG.little doesn't open the event on all core PMU types:
> https://lore.kernel.org/lkml/[email protected]/
As I mentioned in that thread I'll fix that one too at some point, I'm
just finishing off some other stuff first.
> 3) the broken 'cycles' event name in the arm_dsu PMU:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/perf/arm_dsu_pmu.c#n177
> 4) the ARM memory controller PMUs lacking cpumask causing events
> opened on them to be opened on every CPU and thereby induce
> multiplexing..
> and so on..
>
> Thanks,
> Ian
>
>> Thanks,
>> Aishwarya
On 12/06/2024 1:32 pm, Ian Rogers wrote:
> On Wed, Jun 12, 2024 at 4:19 AM Aishwarya TCV <[email protected]> wrote:
>>
>>
>>
>> On 15/05/2024 07:01, Ian Rogers wrote:
>>> The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
>>> the previous PMU sorting/merging code assumes uncore PMU names start
>>> with uncore_ and have a decimal suffix. Because of the previous
>>> assumption it isn't possible to wildcard the mrvl_ddr_pmu.
>>>
>>> Modify pmu_name_len_no_suffix but also remove the suffix number out
>>> argument, this is because we don't know if a suffix number of say 100
>>> is in hexadecimal or decimal. As the only use of the suffix number is
>>> in comparisons, it is safe there to compare the values as hexadecimal.
>>> Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
>>> are ignored.
>>>
>>> Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
>>> more) so that S390's cpum_cf PMU doesn't lose its suffix.
>>>
>>> Change the return type of pmu_name_len_no_suffix to size_t to
>>> workaround GCC incorrectly determining the result could be negative.
>>>
>>> Signed-off-by: Ian Rogers <[email protected]>
>>> ---
>>> tools/perf/util/pmu.c | 33 +++++++++++++--------
>>> tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
>>> tools/perf/util/pmus.h | 7 ++++-
>>> 3 files changed, 65 insertions(+), 42 deletions(-)
>>>
>>
>> Hi Ian,
>>
>> Perf test "perf_all_PMU_test" is failing when run against
>> next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
>> like it is failing when run on JUNO alone. Verified by running on other
>> boards like RB5 and Ampere_altra and confirming that it does not fail on
>> these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
>> be the reason of test failure.
>>
>> Reverting the change (3241d46f5f54) seems to fix it.
>>
>> This works fine on Linux version v6.10-rc3
>>
>> Failure log
>> ------------
>> 110: perf all PMU test:
>> --- start ---
>> test child forked, pid 8279
>> Testing armv8_pmuv3/br_immed_retired/
>> Event 'armv8_pmuv3/br_immed_retired/' not printed in:
>> # Running 'internals/synthesize' benchmark:
>> Computing performance of single threaded perf event synthesis by
>> synthesizing events on the perf process itself:
>> Average synthesis took: 1169.431 usec (+- 0.144 usec)
>> Average num. events: 35.000 (+- 0.000)
>> Average time per event 33.412 usec
>> Average data synthesis took: 1225.698 usec (+- 0.102 usec)
>> Average num. events: 119.000 (+- 0.000)
>> Average time per event 10.300 usec
>>
>> Performance counter stats for 'perf bench internals synthesize':
>>
>> 3263664785 armv8_pmuv3_0/br_immed_retired/
>>
>>
>> 25.472854464 seconds time elapsed
>>
>> 8.004791000 seconds user
>> 17.060209000 seconds sys
>> ---- end(-1) ----
>> 110: perf all PMU test :
>> FAILED!
>
> Hi Aishwarya,
>
> Thanks for reporting an issue. The test should be pretty self
> explanatory: it is doing a `perf stat -e
> armv8_pmuv3/br_immed_retired/` and then looking for that in the
> output. The event armv8_pmuv3/br_immed_retired/ comes from running
> perf list. As you can see in the output the event did work, so perf
> stat is working so nothing is actually broken here. What isn't working
> is the perf stat output matching the command line event and this is
> because of the unnecessary suffix on ARM's PMU name.
>
> We have a problem that ARM have buggy PMU drivers, either from
> introducing new naming conventions or by just being broken:
> https://lore.kernel.org/lkml/CAP-5=fWNDkOpnYF=5v1aQkVDrDWsmw+zYX1pjS8hoiYVgZsRGA@mail.gmail.com/
> I've also asked that ARM step up their testing, for example in the
> event parsing testing the PMU is hardcoded to the x86 PMU name of
> 'cpu':
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/parse-events.c?h=perf-tools-next#n2317
> On a cortex A53, then PMU is named 'armv8_cortex_a53':
> ```
> $ ls /sys/devices/armv8_cortex_a53/
> caps cpus events format perf_event_mux_interval_ms power
> subsystem type uevent
> ```
> This name appears better, so what's up with ARM's core PMU name?
With Devicetree, we are able to derive a descriptive PMU name from the
compatible string provided by the DT. Under ACPI, however, all we get
told is whether each CPU has a usable PMU or not, so the best we can do
is work out how many different CPU microarchitectures we have overall
and create a PMU instance for each type. We still don't know *what* each
one is, just that they're different, hence ending up with a common name
plus a suffix which we can increment for disambiguation if and when we
do see something new - userspace can still piece together the "cpus"
lists and MIDRs to figure out what's what, we just can't do much in the
kernel itself.
> Anyway, I'm tempted to fix this by just skipping the test on ARM given
> ARM's overall broken state.
This isn't a driver issue, it's a "the behaviour of 'perf list' changed
inconsistently" issue. I also had a brief dig into this using a
different arm64 ACPI system, and I think I can broadly characterise the
cause. This is prior to 3241d46f5f54:
root@crazy-taxi:~# ./perf-mainline list armv8_pmuv3
List of pre-defined events (to be used in -e or -M):
armv8_pmuv3_0:
L1-dcache-loads OR armv8_pmuv3_0/L1-dcache-loads/
L1-dcache-load-misses OR armv8_pmuv3_0/L1-dcache-load-misses/
L1-icache-loads OR armv8_pmuv3_0/L1-icache-loads/
L1-icache-load-misses OR armv8_pmuv3_0/L1-icache-load-misses/
dTLB-loads OR armv8_pmuv3_0/dTLB-loads/
dTLB-load-misses OR armv8_pmuv3_0/dTLB-load-misses/
iTLB-loads OR armv8_pmuv3_0/iTLB-loads/
iTLB-load-misses OR armv8_pmuv3_0/iTLB-load-misses/
branch-loads OR armv8_pmuv3_0/branch-loads/
branch-load-misses OR armv8_pmuv3_0/branch-load-misses/
l3d_cache_wb OR armv8_pmuv3_0/l3d_cache_wb/ [Kernel PMU event]
And this is after:
root@crazy-taxi:~# ./perf-next list armv8_pmuv3
List of pre-defined events (to be used in -e or -M):
armv8_pmuv3_0:
L1-dcache-loads OR armv8_pmuv3_0/L1-dcache-loads/
L1-dcache-load-misses OR armv8_pmuv3_0/L1-dcache-load-misses/
L1-icache-loads OR armv8_pmuv3_0/L1-icache-loads/
L1-icache-load-misses OR armv8_pmuv3_0/L1-icache-load-misses/
dTLB-loads OR armv8_pmuv3_0/dTLB-loads/
dTLB-load-misses OR armv8_pmuv3_0/dTLB-load-misses/
iTLB-loads OR armv8_pmuv3_0/iTLB-loads/
iTLB-load-misses OR armv8_pmuv3_0/iTLB-load-misses/
branch-loads OR armv8_pmuv3_0/branch-loads/
branch-load-misses OR armv8_pmuv3_0/branch-load-misses/
l3d_cache_wb OR armv8_pmuv3/l3d_cache_wb/ [Kernel PMU event]
See the difference in the last line - it appears that CPU PMU events
which map to common hardware/cache events *do* still report the full PMU
name, but any PMU-type-specific events show a truncated name in list and
thus fail the test's strict match against the full name reported by stat.
Thanks,
Robin.
On Wed, Jun 12, 2024 at 6:57 AM Robin Murphy <[email protected]> wrote:
>
> On 12/06/2024 1:32 pm, Ian Rogers wrote:
> > On Wed, Jun 12, 2024 at 4:19 AM Aishwarya TCV <[email protected]> wrote:
> >>
> >>
> >>
> >> On 15/05/2024 07:01, Ian Rogers wrote:
> >>> The mrvl_ddr_pmu is uncore and has a hexadecimal address suffix while
> >>> the previous PMU sorting/merging code assumes uncore PMU names start
> >>> with uncore_ and have a decimal suffix. Because of the previous
> >>> assumption it isn't possible to wildcard the mrvl_ddr_pmu.
> >>>
> >>> Modify pmu_name_len_no_suffix but also remove the suffix number out
> >>> argument, this is because we don't know if a suffix number of say 100
> >>> is in hexadecimal or decimal. As the only use of the suffix number is
> >>> in comparisons, it is safe there to compare the values as hexadecimal.
> >>> Modify perf_pmu__match_ignoring_suffix so that hexadecimal suffixes
> >>> are ignored.
> >>>
> >>> Only allow hexadecimal suffixes to be greater than length 2 (ie 3 or
> >>> more) so that S390's cpum_cf PMU doesn't lose its suffix.
> >>>
> >>> Change the return type of pmu_name_len_no_suffix to size_t to
> >>> workaround GCC incorrectly determining the result could be negative.
> >>>
> >>> Signed-off-by: Ian Rogers <[email protected]>
> >>> ---
> >>> tools/perf/util/pmu.c | 33 +++++++++++++--------
> >>> tools/perf/util/pmus.c | 67 ++++++++++++++++++++++++------------------
> >>> tools/perf/util/pmus.h | 7 ++++-
> >>> 3 files changed, 65 insertions(+), 42 deletions(-)
> >>>
> >>
> >> Hi Ian,
> >>
> >> Perf test "perf_all_PMU_test" is failing when run against
> >> next-master(next-20240612) kernel with Arm64 on JUNO in our CI. It looks
> >> like it is failing when run on JUNO alone. Verified by running on other
> >> boards like RB5 and Ampere_altra and confirming that it does not fail on
> >> these boards. Suspecting that the suffixed 'armv8_pmuv3_0' naming could
> >> be the reason of test failure.
> >>
> >> Reverting the change (3241d46f5f54) seems to fix it.
> >>
> >> This works fine on Linux version v6.10-rc3
> >>
> >> Failure log
> >> ------------
> >> 110: perf all PMU test:
> >> --- start ---
> >> test child forked, pid 8279
> >> Testing armv8_pmuv3/br_immed_retired/
> >> Event 'armv8_pmuv3/br_immed_retired/' not printed in:
> >> # Running 'internals/synthesize' benchmark:
> >> Computing performance of single threaded perf event synthesis by
> >> synthesizing events on the perf process itself:
> >> Average synthesis took: 1169.431 usec (+- 0.144 usec)
> >> Average num. events: 35.000 (+- 0.000)
> >> Average time per event 33.412 usec
> >> Average data synthesis took: 1225.698 usec (+- 0.102 usec)
> >> Average num. events: 119.000 (+- 0.000)
> >> Average time per event 10.300 usec
> >>
> >> Performance counter stats for 'perf bench internals synthesize':
> >>
> >> 3263664785 armv8_pmuv3_0/br_immed_retired/
> >>
> >>
> >> 25.472854464 seconds time elapsed
> >>
> >> 8.004791000 seconds user
> >> 17.060209000 seconds sys
> >> ---- end(-1) ----
> >> 110: perf all PMU test :
> >> FAILED!
> >
> > Hi Aishwarya,
> >
> > Thanks for reporting an issue. The test should be pretty self
> > explanatory: it is doing a `perf stat -e
> > armv8_pmuv3/br_immed_retired/` and then looking for that in the
> > output. The event armv8_pmuv3/br_immed_retired/ comes from running
> > perf list. As you can see in the output the event did work, so perf
> > stat is working so nothing is actually broken here. What isn't working
> > is the perf stat output matching the command line event and this is
> > because of the unnecessary suffix on ARM's PMU name.
> >
> > We have a problem that ARM have buggy PMU drivers, either from
> > introducing new naming conventions or by just being broken:
> > https://lore.kernel.org/lkml/CAP-5=fWNDkOpnYF=5v1aQkVDrDWsmw+zYX1pjS8hoiYVgZsRGA@mail.gmail.com/
> > I've also asked that ARM step up their testing, for example in the
> > event parsing testing the PMU is hardcoded to the x86 PMU name of
> > 'cpu':
> > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/parse-events.c?h=perf-tools-next#n2317
> > On a cortex A53, then PMU is named 'armv8_cortex_a53':
> > ```
> > $ ls /sys/devices/armv8_cortex_a53/
> > caps cpus events format perf_event_mux_interval_ms power
> > subsystem type uevent
> > ```
> > This name appears better, so what's up with ARM's core PMU name?
>
> With Devicetree, we are able to derive a descriptive PMU name from the
> compatible string provided by the DT. Under ACPI, however, all we get
> told is whether each CPU has a usable PMU or not, so the best we can do
> is work out how many different CPU microarchitectures we have overall
> and create a PMU instance for each type. We still don't know *what* each
> one is, just that they're different, hence ending up with a common name
> plus a suffix which we can increment for disambiguation if and when we
> do see something new - userspace can still piece together the "cpus"
> lists and MIDRs to figure out what's what, we just can't do much in the
> kernel itself.
>
> > Anyway, I'm tempted to fix this by just skipping the test on ARM given
> > ARM's overall broken state.
>
> This isn't a driver issue, it's a "the behaviour of 'perf list' changed
> inconsistently" issue. I also had a brief dig into this using a
> different arm64 ACPI system, and I think I can broadly characterise the
> cause. This is prior to 3241d46f5f54:
>
> root@crazy-taxi:~# ./perf-mainline list armv8_pmuv3
>
> List of pre-defined events (to be used in -e or -M):
>
>
> armv8_pmuv3_0:
> L1-dcache-loads OR armv8_pmuv3_0/L1-dcache-loads/
> L1-dcache-load-misses OR armv8_pmuv3_0/L1-dcache-load-misses/
> L1-icache-loads OR armv8_pmuv3_0/L1-icache-loads/
> L1-icache-load-misses OR armv8_pmuv3_0/L1-icache-load-misses/
> dTLB-loads OR armv8_pmuv3_0/dTLB-loads/
> dTLB-load-misses OR armv8_pmuv3_0/dTLB-load-misses/
> iTLB-loads OR armv8_pmuv3_0/iTLB-loads/
> iTLB-load-misses OR armv8_pmuv3_0/iTLB-load-misses/
> branch-loads OR armv8_pmuv3_0/branch-loads/
> branch-load-misses OR armv8_pmuv3_0/branch-load-misses/
> l3d_cache_wb OR armv8_pmuv3_0/l3d_cache_wb/ [Kernel PMU event]
>
>
> And this is after:
>
> root@crazy-taxi:~# ./perf-next list armv8_pmuv3
>
> List of pre-defined events (to be used in -e or -M):
>
>
> armv8_pmuv3_0:
> L1-dcache-loads OR armv8_pmuv3_0/L1-dcache-loads/
> L1-dcache-load-misses OR armv8_pmuv3_0/L1-dcache-load-misses/
> L1-icache-loads OR armv8_pmuv3_0/L1-icache-loads/
> L1-icache-load-misses OR armv8_pmuv3_0/L1-icache-load-misses/
> dTLB-loads OR armv8_pmuv3_0/dTLB-loads/
> dTLB-load-misses OR armv8_pmuv3_0/dTLB-load-misses/
> iTLB-loads OR armv8_pmuv3_0/iTLB-loads/
> iTLB-load-misses OR armv8_pmuv3_0/iTLB-load-misses/
> branch-loads OR armv8_pmuv3_0/branch-loads/
> branch-load-misses OR armv8_pmuv3_0/branch-load-misses/
> l3d_cache_wb OR armv8_pmuv3/l3d_cache_wb/ [Kernel PMU event]
>
> See the difference in the last line - it appears that CPU PMU events
> which map to common hardware/cache events *do* still report the full PMU
> name, but any PMU-type-specific events show a truncated name in list and
> thus fail the test's strict match against the full name reported by stat.
Thanks Robin. The hardware and cache events are legacy events that are
printed here, the assumption being that core PMUs don't use a suffix:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/print-events.c?h=perf-tools-next#n304
Fwiw these events are knowingly broken on the Apple M? ARM PMUs as
legacy events like L1-dcache-loads will use a legacy encoding which is
broken on this PMU, hence trying to migrate to preferring sysfs/json
events first - the support for which is reverted in v6.10.
The l3d_cache_wb event is not a legacy event and so goes down the
regular PMU event printing route that tries to avoid listing duplicate
PMUs as there may be 10s or 100s of these:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmus.c?h=perf-tools-next#n490
Yes, this was an intentional behavior change as system scale has gotten larger.
I'm currently thinking that the suffix should just be ignored on core
PMUs, we assume they never have duplicates. Unfortunately the code
just has a name and doesn't know from this whether a PMU is core or
not:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/pmus.c?h=perf-tools-next#n43
Note, following Kan Liang's advice we've been trying to actually
document the PMU naming convention:
https://lore.kernel.org/lkml/[email protected]/
The inconsistency here comes from ARM core PMUs using suffixes, which
isn't really intended behavior and will likely screw up, my guess is
around legacy events (the tests of which currently have limited ARM
coverage).
Thanks,
Ian
> Thanks,
> Robin.