2024-03-07 23:10:10

by Ilkka Koskinen

[permalink] [raw]
Subject: [PATCH v2 0/2] perf/arm-cmn: Add support for tertiary match group

A tertiary match group was added in CMN-650 and newer implementations.
This patchset makes the parameters model-specific to properly support
various models. Also, the wp_config registers and filter group numbers are
decoupled to enable the new match group.

v2:
* Use bitmap to track which WPs/DTMs are used by the event
* Dropped model specific parameters
* Addressed Robin's other comments regarding sysfs, sanity-checking
of wp arguments, and the wp_combine check

v1:
https://lore.kernel.org/all/[email protected]/

Ilkka Koskinen (2):
perf/arm-cmn: Decouple wp_config registers from filter group number
perf/arm-cmn: Enable support for tertiary match group

drivers/perf/arm-cmn.c | 144 +++++++++++++++++++++++++++++++++--------
1 file changed, 118 insertions(+), 26 deletions(-)

--
2.43.2



2024-03-07 23:10:22

by Ilkka Koskinen

[permalink] [raw]
Subject: [PATCH v2 1/2] perf/arm-cmn: Decouple wp_config registers from filter group number

Previously, wp_config0/2 registers were used for primary match group and
wp_config1/3 registers for secondary match group. In order to support
tertiary match group, this patch decouples the registers and the groups.

Signed-off-by: Ilkka Koskinen <[email protected]>
---
drivers/perf/arm-cmn.c | 125 ++++++++++++++++++++++++++++++++++-------
1 file changed, 105 insertions(+), 20 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 7e3aa7e2345f..29d46e0cf1cd 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -589,6 +589,13 @@ struct arm_cmn_hw_event {
s8 dtc_idx[CMN_MAX_DTCS];
u8 num_dns;
u8 dtm_offset;
+
+ /*
+ * WP config registers are divided to UP and DOWN events. We need to
+ * keep to track only one of them.
+ */
+ DECLARE_BITMAP(wp_cfg, 2 * CMN_MAX_XPS);
+
bool wide_sel;
enum cmn_filter_select filter_sel;
};
@@ -1335,9 +1342,51 @@ static const struct attribute_group *arm_cmn_attr_groups[] = {
NULL
};

-static int arm_cmn_wp_idx(struct perf_event *event)
+static inline unsigned int arm_cmn_get_xp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_node *xp)
{
- return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
+ return ((unsigned long) xp - (unsigned long) cmn->xps) / sizeof(struct arm_cmn_node);
+}
+
+static int arm_cmn_find_free_wp_idx(struct arm_cmn *cmn, struct arm_cmn_dtm *dtm,
+ struct perf_event *event)
+{
+ int wp_idx = CMN_EVENT_EVENTID(event);
+
+ if (dtm->wp_event[wp_idx] >= 0)
+ if (dtm->wp_event[++wp_idx] >= 0)
+ return -ENOSPC;
+
+ return wp_idx;
+}
+
+static int arm_cmn_get_assigned_wp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_node *xp,
+ struct perf_event *event,
+ struct arm_cmn_hw_event *hw)
+{
+ int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
+
+ if (test_bit(2 * xp_idx, hw->wp_cfg))
+ return CMN_EVENT_EVENTID(event);
+ else if (test_bit(2 * xp_idx + 1, hw->wp_cfg))
+ return CMN_EVENT_EVENTID(event) + 1;
+
+ dev_err(cmn->dev, "Could't find the assigned wp_cfg\n");
+ return -EINVAL;
+}
+
+static void arm_cmn_claim_wp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_dtm *dtm,
+ struct perf_event *event,
+ struct arm_cmn_node *xp,
+ int wp_idx, unsigned int dtc)
+{
+ struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+ int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
+
+ dtm->wp_event[wp_idx] = hw->dtc_idx[dtc];
+ set_bit(2 * xp_idx + (wp_idx & 1), hw->wp_cfg);
}

static u32 arm_cmn_wp_config(struct perf_event *event)
@@ -1519,12 +1568,16 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
cmn->dtc[i].cc_active = true;
} else if (type == CMN_TYPE_WP) {
- int wp_idx = arm_cmn_wp_idx(event);
u64 val = CMN_EVENT_WP_VAL(event);
u64 mask = CMN_EVENT_WP_MASK(event);

for_each_hw_dn(hw, dn, i) {
void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+ int wp_idx;
+
+ wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
+ if (wp_idx < 0)
+ return;

writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx));
@@ -1549,10 +1602,13 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
i = hw->dtc_idx[0];
cmn->dtc[i].cc_active = false;
} else if (type == CMN_TYPE_WP) {
- int wp_idx = arm_cmn_wp_idx(event);
-
for_each_hw_dn(hw, dn, i) {
void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+ int wp_idx;
+
+ wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
+ if (wp_idx < 0)
+ continue;

writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx));
@@ -1574,8 +1630,20 @@ struct arm_cmn_val {
bool cycles;
};

-static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
- struct perf_event *event)
+static int arm_cmn_val_find_free_wp_config(struct perf_event *event,
+ struct arm_cmn_val *val, int dtm)
+{
+ int wp_idx = CMN_EVENT_EVENTID(event);
+
+ if (val->wp[dtm][wp_idx])
+ if (val->wp[dtm][++wp_idx])
+ return -ENOSPC;
+
+ return wp_idx;
+}
+
+static int arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
+ struct perf_event *event)
{
struct arm_cmn_hw_event *hw = to_cmn_hw(event);
struct arm_cmn_node *dn;
@@ -1583,12 +1651,12 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
int i;

if (is_software_event(event))
- return;
+ return 0;

type = CMN_EVENT_TYPE(event);
if (type == CMN_TYPE_DTC) {
val->cycles = true;
- return;
+ return 0;
}

for_each_hw_dtc_idx(hw, dtc, idx)
@@ -1605,9 +1673,14 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
if (type != CMN_TYPE_WP)
continue;

- wp_idx = arm_cmn_wp_idx(event);
+ wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
+ if (wp_idx < 0)
+ return -ENOSPC;
+
val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
}
+
+ return 0;
}

static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
@@ -1629,9 +1702,15 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
if (!val)
return -ENOMEM;

- arm_cmn_val_add_event(cmn, val, leader);
- for_each_sibling_event(sibling, leader)
- arm_cmn_val_add_event(cmn, val, sibling);
+ ret = arm_cmn_val_add_event(cmn, val, leader);
+ if (ret)
+ goto done;
+
+ for_each_sibling_event(sibling, leader) {
+ ret = arm_cmn_val_add_event(cmn, val, sibling);
+ if (ret)
+ goto done;
+ }

type = CMN_EVENT_TYPE(event);
if (type == CMN_TYPE_DTC) {
@@ -1656,8 +1735,8 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
if (type != CMN_TYPE_WP)
continue;

- wp_idx = arm_cmn_wp_idx(event);
- if (val->wp[dtm][wp_idx])
+ wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
+ if (wp_idx < 0)
goto done;

wp_cmb = val->wp[dtm][wp_idx ^ 1];
@@ -1772,8 +1851,11 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset;
unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);

- if (type == CMN_TYPE_WP)
- dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
+ if (type == CMN_TYPE_WP) {
+ int wp_idx = arm_cmn_get_assigned_wp_idx(cmn, &hw->dn[i], event, hw);
+
+ dtm->wp_event[wp_idx] = -1;
+ }

if (hw->filter_sel > SEL_NONE)
hw->dn[i].occupid[hw->filter_sel].count--;
@@ -1782,6 +1864,7 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG);
}
memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
+ bitmap_zero(hw->wp_cfg, 2 * CMN_MAX_XPS);

for_each_hw_dtc_idx(hw, j, idx)
cmn->dtc[j].counters[idx] = NULL;
@@ -1835,10 +1918,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
if (type == CMN_TYPE_XP) {
input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
} else if (type == CMN_TYPE_WP) {
- int tmp, wp_idx = arm_cmn_wp_idx(event);
+ int tmp, wp_idx;
u32 cfg = arm_cmn_wp_config(event);

- if (dtm->wp_event[wp_idx] >= 0)
+ wp_idx = arm_cmn_find_free_wp_idx(cmn, dtm, event);
+ if (wp_idx < 0)
goto free_dtms;

tmp = dtm->wp_event[wp_idx ^ 1];
@@ -1847,7 +1931,8 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
goto free_dtms;

input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
- dtm->wp_event[wp_idx] = hw->dtc_idx[d];
+
+ arm_cmn_claim_wp_idx(cmn, dtm, event, dn, wp_idx, d);
writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
} else {
struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
--
2.43.2


2024-03-07 23:10:27

by Ilkka Koskinen

[permalink] [raw]
Subject: [PATCH v2 2/2] perf/arm-cmn: Enable support for tertiary match group

Add support for tertiary match group.

Signed-off-by: Ilkka Koskinen <[email protected]>
---
drivers/perf/arm-cmn.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 29d46e0cf1cd..4a4cbda94907 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -174,9 +174,8 @@
#define CMN_CONFIG_WP_COMBINE GENMASK_ULL(30, 27)
#define CMN_CONFIG_WP_DEV_SEL GENMASK_ULL(50, 48)
#define CMN_CONFIG_WP_CHN_SEL GENMASK_ULL(55, 51)
-/* Note that we don't yet support the tertiary match group on newer IPs */
-#define CMN_CONFIG_WP_GRP BIT_ULL(56)
-#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(57)
+#define CMN_CONFIG_WP_GRP GENMASK_ULL(57, 56)
+#define CMN_CONFIG_WP_EXCLUSIVE BIT_ULL(58)
#define CMN_CONFIG1_WP_VAL GENMASK_ULL(63, 0)
#define CMN_CONFIG2_WP_MASK GENMASK_ULL(63, 0)

@@ -1389,7 +1388,7 @@ static void arm_cmn_claim_wp_idx(struct arm_cmn *cmn,
set_bit(2 * xp_idx + (wp_idx & 1), hw->wp_cfg);
}

-static u32 arm_cmn_wp_config(struct perf_event *event)
+static u32 arm_cmn_wp_config(struct perf_event *event, int wp_idx)
{
u32 config;
u32 dev = CMN_EVENT_WP_DEV_SEL(event);
@@ -1399,6 +1398,10 @@ static u32 arm_cmn_wp_config(struct perf_event *event)
u32 combine = CMN_EVENT_WP_COMBINE(event);
bool is_cmn600 = to_cmn(event->pmu)->part == PART_CMN600;

+ /* CMN-600 supports only primary and secondary matching groups */
+ if (is_cmn600)
+ grp &= 1;
+
config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) |
FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) |
FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) |
@@ -1406,7 +1409,9 @@ static u32 arm_cmn_wp_config(struct perf_event *event)
if (exc)
config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_EXCLUSIVE :
CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE;
- if (combine && !grp)
+
+ /* wp_combine is available only on WP0 and WP2 */
+ if (combine && !(wp_idx & 0x1))
config |= is_cmn600 ? CMN600_WPn_CONFIG_WP_COMBINE :
CMN_DTM_WPn_CONFIG_WP_COMBINE;
return config;
@@ -1919,12 +1924,14 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
} else if (type == CMN_TYPE_WP) {
int tmp, wp_idx;
- u32 cfg = arm_cmn_wp_config(event);
+ u32 cfg;

wp_idx = arm_cmn_find_free_wp_idx(cmn, dtm, event);
if (wp_idx < 0)
goto free_dtms;

+ cfg = arm_cmn_wp_config(event, wp_idx);
+
tmp = dtm->wp_event[wp_idx ^ 1];
if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
--
2.43.2


2024-03-08 14:35:37

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] perf/arm-cmn: Decouple wp_config registers from filter group number

On 2024-03-07 11:09 pm, Ilkka Koskinen wrote:
> Previously, wp_config0/2 registers were used for primary match group and
> wp_config1/3 registers for secondary match group. In order to support
> tertiary match group, this patch decouples the registers and the groups.
>
> Signed-off-by: Ilkka Koskinen <[email protected]>
> ---
> drivers/perf/arm-cmn.c | 125 ++++++++++++++++++++++++++++++++++-------
> 1 file changed, 105 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
> index 7e3aa7e2345f..29d46e0cf1cd 100644
> --- a/drivers/perf/arm-cmn.c
> +++ b/drivers/perf/arm-cmn.c
> @@ -589,6 +589,13 @@ struct arm_cmn_hw_event {
> s8 dtc_idx[CMN_MAX_DTCS];
> u8 num_dns;
> u8 dtm_offset;
> +
> + /*
> + * WP config registers are divided to UP and DOWN events. We need to
> + * keep to track only one of them.
> + */
> + DECLARE_BITMAP(wp_cfg, 2 * CMN_MAX_XPS);

What I had in mind was a wp_idx field which works the same way as
dtm_idx, i.e. we just store the allocated index per relevant DN, since a
single event can never use *both* watchpoints on a single XP. Each index
then need only be 0 or 1 since they're already scoped by the watchpoint
direction of the base event, thus we should only need one bit per XP.

> +
> bool wide_sel;
> enum cmn_filter_select filter_sel;
> };
> @@ -1335,9 +1342,51 @@ static const struct attribute_group *arm_cmn_attr_groups[] = {
> NULL
> };
>
> -static int arm_cmn_wp_idx(struct perf_event *event)
> +static inline unsigned int arm_cmn_get_xp_idx(struct arm_cmn *cmn,
> + struct arm_cmn_node *xp)
> {
> - return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
> + return ((unsigned long) xp - (unsigned long) cmn->xps) / sizeof(struct arm_cmn_node);
> +}
> +
> +static int arm_cmn_find_free_wp_idx(struct arm_cmn *cmn, struct arm_cmn_dtm *dtm,
> + struct perf_event *event)
> +{
> + int wp_idx = CMN_EVENT_EVENTID(event);
> +
> + if (dtm->wp_event[wp_idx] >= 0)
> + if (dtm->wp_event[++wp_idx] >= 0)
> + return -ENOSPC;
> +
> + return wp_idx;
> +}
> +
> +static int arm_cmn_get_assigned_wp_idx(struct arm_cmn *cmn,
> + struct arm_cmn_node *xp,
> + struct perf_event *event,
> + struct arm_cmn_hw_event *hw)
> +{
> + int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
> +
> + if (test_bit(2 * xp_idx, hw->wp_cfg))
> + return CMN_EVENT_EVENTID(event);
> + else if (test_bit(2 * xp_idx + 1, hw->wp_cfg))
> + return CMN_EVENT_EVENTID(event) + 1;
> +
> + dev_err(cmn->dev, "Could't find the assigned wp_cfg\n");
> + return -EINVAL;
> +}

...and so for this we would only need more of a mild tweak to the
existing design, something like:

static int arm_cmn_get_wp_idx(struct perf_event *event, int pos)
{
struct arm_cmn_hw_event *hw = to_cmn_hw(event);

return CMN_EVENT_EVENTID(event) + test_bit(&hw->wp_idx, pos);
}

> +
> +static void arm_cmn_claim_wp_idx(struct arm_cmn *cmn,
> + struct arm_cmn_dtm *dtm,
> + struct perf_event *event,
> + struct arm_cmn_node *xp,
> + int wp_idx, unsigned int dtc)
> +{
> + struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> + int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
> +
> + dtm->wp_event[wp_idx] = hw->dtc_idx[dtc];
> + set_bit(2 * xp_idx + (wp_idx & 1), hw->wp_cfg);

This is recalculating way more than it needs to. It's only ever used
within for_each_hw_dn(), which already has all the information to
hand - again, look at how hw->dtm_idx is managed. Furthermore I'd
also prefer to similarly not conflate management of the per-event state
with that of the DTM state (i.e. just have an arm_cmn_set_wp_idx() for
updating the event data).

> }
>
> static u32 arm_cmn_wp_config(struct perf_event *event)
> @@ -1519,12 +1568,16 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
> writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
> cmn->dtc[i].cc_active = true;
> } else if (type == CMN_TYPE_WP) {
> - int wp_idx = arm_cmn_wp_idx(event);
> u64 val = CMN_EVENT_WP_VAL(event);
> u64 mask = CMN_EVENT_WP_MASK(event);
>
> for_each_hw_dn(hw, dn, i) {
> void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
> + int wp_idx;
> +
> + wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
> + if (wp_idx < 0)
> + return;
>
> writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
> writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx));
> @@ -1549,10 +1602,13 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
> i = hw->dtc_idx[0];
> cmn->dtc[i].cc_active = false;
> } else if (type == CMN_TYPE_WP) {
> - int wp_idx = arm_cmn_wp_idx(event);
> -
> for_each_hw_dn(hw, dn, i) {
> void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
> + int wp_idx;
> +
> + wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
> + if (wp_idx < 0)
> + continue;
>
> writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
> writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx));
> @@ -1574,8 +1630,20 @@ struct arm_cmn_val {
> bool cycles;
> };
>
> -static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
> - struct perf_event *event)
> +static int arm_cmn_val_find_free_wp_config(struct perf_event *event,
> + struct arm_cmn_val *val, int dtm)
> +{
> + int wp_idx = CMN_EVENT_EVENTID(event);
> +
> + if (val->wp[dtm][wp_idx])
> + if (val->wp[dtm][++wp_idx])
> + return -ENOSPC;
> +
> + return wp_idx;
> +}
> +
> +static int arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
> + struct perf_event *event)

This must never fail - the purpose of val_add_event is to fill in the
val structure with the combination of leader and sibling events which
have *already* passed their own event_init calls and been declared valid as
a group. The body of validate_group then does the "what if?" version to
test whether the group would remain valid if the *current* event were to
be added.

The trick with the offset combine value relies on direct indexing to
work, so I think we need to rejig the structure slightly to track
distinct wp_count and wp_combine values (per direction) - that then
becomes nicely consistent with the relationship between dtm_count and
occupid, too.

> {
> struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> struct arm_cmn_node *dn;
> @@ -1583,12 +1651,12 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
> int i;
>
> if (is_software_event(event))
> - return;
> + return 0;
>
> type = CMN_EVENT_TYPE(event);
> if (type == CMN_TYPE_DTC) {
> val->cycles = true;
> - return;
> + return 0;
> }
>
> for_each_hw_dtc_idx(hw, dtc, idx)
> @@ -1605,9 +1673,14 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
> if (type != CMN_TYPE_WP)
> continue;
>
> - wp_idx = arm_cmn_wp_idx(event);
> + wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
> + if (wp_idx < 0)
> + return -ENOSPC;
> +
> val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
> }
> +
> + return 0;
> }
>
> static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
> @@ -1629,9 +1702,15 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
> if (!val)
> return -ENOMEM;
>
> - arm_cmn_val_add_event(cmn, val, leader);
> - for_each_sibling_event(sibling, leader)
> - arm_cmn_val_add_event(cmn, val, sibling);
> + ret = arm_cmn_val_add_event(cmn, val, leader);
> + if (ret)
> + goto done;
> +
> + for_each_sibling_event(sibling, leader) {
> + ret = arm_cmn_val_add_event(cmn, val, sibling);
> + if (ret)
> + goto done;
> + }
>
> type = CMN_EVENT_TYPE(event);
> if (type == CMN_TYPE_DTC) {
> @@ -1656,8 +1735,8 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
> if (type != CMN_TYPE_WP)
> continue;
>
> - wp_idx = arm_cmn_wp_idx(event);
> - if (val->wp[dtm][wp_idx])
> + wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
> + if (wp_idx < 0)
> goto done;
>
> wp_cmb = val->wp[dtm][wp_idx ^ 1];
> @@ -1772,8 +1851,11 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
> struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset;
> unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
>
> - if (type == CMN_TYPE_WP)
> - dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
> + if (type == CMN_TYPE_WP) {
> + int wp_idx = arm_cmn_get_assigned_wp_idx(cmn, &hw->dn[i], event, hw);
> +
> + dtm->wp_event[wp_idx] = -1;
> + }
>
> if (hw->filter_sel > SEL_NONE)
> hw->dn[i].occupid[hw->filter_sel].count--;
> @@ -1782,6 +1864,7 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
> writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG);
> }
> memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
> + bitmap_zero(hw->wp_cfg, 2 * CMN_MAX_XPS);

Nit: I'd rather do this in terms of sizeof() so it's harder to break in
future. And since it's going to end up being a memset() anyway I'd then
probably just open-code that rather than mucking about with
bytes-to-bits calculations.

> for_each_hw_dtc_idx(hw, j, idx)
> cmn->dtc[j].counters[idx] = NULL;
> @@ -1835,10 +1918,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
> if (type == CMN_TYPE_XP) {
> input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
> } else if (type == CMN_TYPE_WP) {
> - int tmp, wp_idx = arm_cmn_wp_idx(event);
> + int tmp, wp_idx;
> u32 cfg = arm_cmn_wp_config(event);
>
> - if (dtm->wp_event[wp_idx] >= 0)
> + wp_idx = arm_cmn_find_free_wp_idx(cmn, dtm, event);
> + if (wp_idx < 0)

TBH I'm not convinced it's even worth factoring out the "allocator"
here, since inline it can be as simple as:

int tmp, wp_idx = CMN_EVENT_EVENTID(event);
...
if (dtm->wp_event[wp_idx] && dtm->wp_event[++wp_idx])

(or perhaps follow the same while/if shape as for dtm_idx further up, if
you think it's worth being more clear than concise)

Thanks,
Robin.

> goto free_dtms;
>
> tmp = dtm->wp_event[wp_idx ^ 1];
> @@ -1847,7 +1931,8 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
> goto free_dtms;
>
> input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
> - dtm->wp_event[wp_idx] = hw->dtc_idx[d];
> +
> + arm_cmn_claim_wp_idx(cmn, dtm, event, dn, wp_idx, d);
> writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
> } else {
> struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);

2024-03-25 22:04:24

by Ilkka Koskinen

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] perf/arm-cmn: Decouple wp_config registers from filter group number



Hi Robin,

On Fri, 8 Mar 2024, Robin Murphy wrote:
> On 2024-03-07 11:09 pm, Ilkka Koskinen wrote:
>> Previously, wp_config0/2 registers were used for primary match group and
>> wp_config1/3 registers for secondary match group. In order to support
>> tertiary match group, this patch decouples the registers and the groups.
>>
>> Signed-off-by: Ilkka Koskinen <[email protected]>
>> ---
>> drivers/perf/arm-cmn.c | 125 ++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 105 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
>> index 7e3aa7e2345f..29d46e0cf1cd 100644
>> --- a/drivers/perf/arm-cmn.c
>> +++ b/drivers/perf/arm-cmn.c
>> @@ -589,6 +589,13 @@ struct arm_cmn_hw_event {
>> s8 dtc_idx[CMN_MAX_DTCS];
>> u8 num_dns;
>> u8 dtm_offset;
>> +
>> + /*
>> + * WP config registers are divided to UP and DOWN events. We need to
>> + * keep to track only one of them.
>> + */
>> + DECLARE_BITMAP(wp_cfg, 2 * CMN_MAX_XPS);
>
> What I had in mind was a wp_idx field which works the same way as dtm_idx,
> i.e. we just store the allocated index per relevant DN, since a single event
> can never use *both* watchpoints on a single XP. Each index then need only be
> 0 or 1 since they're already scoped by the watchpoint direction of the base
> event, thus we should only need one bit per XP.

Ah, I got it now.

>
>> +
>> bool wide_sel;
>> enum cmn_filter_select filter_sel;
>> };
>> @@ -1335,9 +1342,51 @@ static const struct attribute_group
>> *arm_cmn_attr_groups[] = {
>> NULL
>> };
>> -static int arm_cmn_wp_idx(struct perf_event *event)
>> +static inline unsigned int arm_cmn_get_xp_idx(struct arm_cmn *cmn,
>> + struct arm_cmn_node *xp)
>> {
>> - return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
>> + return ((unsigned long) xp - (unsigned long) cmn->xps) /
>> sizeof(struct arm_cmn_node);
>> +}
>> +
>> +static int arm_cmn_find_free_wp_idx(struct arm_cmn *cmn, struct
>> arm_cmn_dtm *dtm,
>> + struct perf_event *event)
>> +{
>> + int wp_idx = CMN_EVENT_EVENTID(event);
>> +
>> + if (dtm->wp_event[wp_idx] >= 0)
>> + if (dtm->wp_event[++wp_idx] >= 0)
>> + return -ENOSPC;
>> +
>> + return wp_idx;
>> +}
>> +
>> +static int arm_cmn_get_assigned_wp_idx(struct arm_cmn *cmn,
>> + struct arm_cmn_node *xp,
>> + struct perf_event *event,
>> + struct arm_cmn_hw_event *hw)
>> +{
>> + int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
>> +
>> + if (test_bit(2 * xp_idx, hw->wp_cfg))
>> + return CMN_EVENT_EVENTID(event);
>> + else if (test_bit(2 * xp_idx + 1, hw->wp_cfg))
>> + return CMN_EVENT_EVENTID(event) + 1;
>> +
>> + dev_err(cmn->dev, "Could't find the assigned wp_cfg\n");
>> + return -EINVAL;
>> +}
>
> ...and so for this we would only need more of a mild tweak to the existing
> design, something like:
>
> static int arm_cmn_get_wp_idx(struct perf_event *event, int pos)
> {
> struct arm_cmn_hw_event *hw = to_cmn_hw(event);
>
> return CMN_EVENT_EVENTID(event) + test_bit(&hw->wp_idx, pos);
> }

Yep, it simplifies it quite a bit.

>
>> +
>> +static void arm_cmn_claim_wp_idx(struct arm_cmn *cmn,
>> + struct arm_cmn_dtm *dtm,
>> + struct perf_event *event,
>> + struct arm_cmn_node *xp,
>> + int wp_idx, unsigned int dtc)
>> +{
>> + struct arm_cmn_hw_event *hw = to_cmn_hw(event);
>> + int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
>> +
>> + dtm->wp_event[wp_idx] = hw->dtc_idx[dtc];
>> + set_bit(2 * xp_idx + (wp_idx & 1), hw->wp_cfg);
>
> This is recalculating way more than it needs to. It's only ever used within
> for_each_hw_dn(), which already has all the information to hand -
> again, look at how hw->dtm_idx is managed. Furthermore I'd also prefer to
> similarly not conflate management of the per-event state with that of the DTM
> state (i.e. just have an arm_cmn_set_wp_idx() for updating the event data).

Right, I somehow forgot that hw->dn points to the right type of node,
so I can simply use the index from for_each_hw_dn().

>
>> }
>> static u32 arm_cmn_wp_config(struct perf_event *event)
>> @@ -1519,12 +1568,16 @@ static void arm_cmn_event_start(struct perf_event
>> *event, int flags)
>> writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base +
>> CMN_DT_PMCCNTR);
>> cmn->dtc[i].cc_active = true;
>> } else if (type == CMN_TYPE_WP) {
>> - int wp_idx = arm_cmn_wp_idx(event);
>> u64 val = CMN_EVENT_WP_VAL(event);
>> u64 mask = CMN_EVENT_WP_MASK(event);
>> for_each_hw_dn(hw, dn, i) {
>> void __iomem *base = dn->pmu_base +
>> CMN_DTM_OFFSET(hw->dtm_offset);
>> + int wp_idx;
>> +
>> + wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event,
>> hw);
>> + if (wp_idx < 0)
>> + return;
>> writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
>> writeq_relaxed(mask, base +
>> CMN_DTM_WPn_MASK(wp_idx));
>> @@ -1549,10 +1602,13 @@ static void arm_cmn_event_stop(struct perf_event
>> *event, int flags)
>> i = hw->dtc_idx[0];
>> cmn->dtc[i].cc_active = false;
>> } else if (type == CMN_TYPE_WP) {
>> - int wp_idx = arm_cmn_wp_idx(event);
>> -
>> for_each_hw_dn(hw, dn, i) {
>> void __iomem *base = dn->pmu_base +
>> CMN_DTM_OFFSET(hw->dtm_offset);
>> + int wp_idx;
>> +
>> + wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event,
>> hw);
>> + if (wp_idx < 0)
>> + continue;
>> writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
>> writeq_relaxed(~0ULL, base +
>> CMN_DTM_WPn_VAL(wp_idx));
>> @@ -1574,8 +1630,20 @@ struct arm_cmn_val {
>> bool cycles;
>> };
>> -static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct
>> arm_cmn_val *val,
>> - struct perf_event *event)
>> +static int arm_cmn_val_find_free_wp_config(struct perf_event *event,
>> + struct arm_cmn_val *val, int dtm)
>> +{
>> + int wp_idx = CMN_EVENT_EVENTID(event);
>> +
>> + if (val->wp[dtm][wp_idx])
>> + if (val->wp[dtm][++wp_idx])
>> + return -ENOSPC;
>> +
>> + return wp_idx;
>> +}
>> +
>> +static int arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val
>> *val,
>> + struct perf_event *event)
>
> This must never fail - the purpose of val_add_event is to fill in the val
> structure with the combination of leader and sibling events which have
> *already* passed their own event_init calls and been declared valid as a group.
> The body of validate_group then does the "what if?" version to test whether
> the group would remain valid if the *current* event were to be added.

Makes perfect sense. I'll fix it.

>
> The trick with the offset combine value relies on direct indexing to work, so
> I think we need to rejig the structure slightly to track distinct wp_count
> and wp_combine values (per direction) - that then becomes nicely consistent
> with the relationship between dtm_count and occupid, too.

I'm not sure if it's necessary, but I do get the consistency reasoning.

>
>> {
>> struct arm_cmn_hw_event *hw = to_cmn_hw(event);
>> struct arm_cmn_node *dn;
>> @@ -1583,12 +1651,12 @@ static void arm_cmn_val_add_event(struct arm_cmn
>> *cmn, struct arm_cmn_val *val,
>> int i;
>> if (is_software_event(event))
>> - return;
>> + return 0;
>> type = CMN_EVENT_TYPE(event);
>> if (type == CMN_TYPE_DTC) {
>> val->cycles = true;
>> - return;
>> + return 0;
>> }
>> for_each_hw_dtc_idx(hw, dtc, idx)
>> @@ -1605,9 +1673,14 @@ static void arm_cmn_val_add_event(struct arm_cmn
>> *cmn, struct arm_cmn_val *val,
>> if (type != CMN_TYPE_WP)
>> continue;
>> - wp_idx = arm_cmn_wp_idx(event);
>> + wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
>> + if (wp_idx < 0)
>> + return -ENOSPC;
>> +
>> val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
>> }
>> +
>> + return 0;
>> }
>> static int arm_cmn_validate_group(struct arm_cmn *cmn, struct
>> perf_event *event)
>> @@ -1629,9 +1702,15 @@ static int arm_cmn_validate_group(struct arm_cmn
>> *cmn, struct perf_event *event)
>> if (!val)
>> return -ENOMEM;
>> - arm_cmn_val_add_event(cmn, val, leader);
>> - for_each_sibling_event(sibling, leader)
>> - arm_cmn_val_add_event(cmn, val, sibling);
>> + ret = arm_cmn_val_add_event(cmn, val, leader);
>> + if (ret)
>> + goto done;
>> +
>> + for_each_sibling_event(sibling, leader) {
>> + ret = arm_cmn_val_add_event(cmn, val, sibling);
>> + if (ret)
>> + goto done;
>> + }
>> type = CMN_EVENT_TYPE(event);
>> if (type == CMN_TYPE_DTC) {
>> @@ -1656,8 +1735,8 @@ static int arm_cmn_validate_group(struct arm_cmn
>> *cmn, struct perf_event *event)
>> if (type != CMN_TYPE_WP)
>> continue;
>> - wp_idx = arm_cmn_wp_idx(event);
>> - if (val->wp[dtm][wp_idx])
>> + wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
>> + if (wp_idx < 0)
>> goto done;
>> wp_cmb = val->wp[dtm][wp_idx ^ 1];
>> @@ -1772,8 +1851,11 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn,
>> struct perf_event *event,
>> struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] +
>> hw->dtm_offset;
>> unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
>> - if (type == CMN_TYPE_WP)
>> - dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
>> + if (type == CMN_TYPE_WP) {
>> + int wp_idx = arm_cmn_get_assigned_wp_idx(cmn,
>> &hw->dn[i], event, hw);
>> +
>> + dtm->wp_event[wp_idx] = -1;
>> + }
>> if (hw->filter_sel > SEL_NONE)
>> hw->dn[i].occupid[hw->filter_sel].count--;
>> @@ -1782,6 +1864,7 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn,
>> struct perf_event *event,
>> writel_relaxed(dtm->pmu_config_low, dtm->base +
>> CMN_DTM_PMU_CONFIG);
>> }
>> memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
>> + bitmap_zero(hw->wp_cfg, 2 * CMN_MAX_XPS);
>
> Nit: I'd rather do this in terms of sizeof() so it's harder to break in
> future. And since it's going to end up being a memset() anyway I'd then
> probably just open-code that rather than mucking about with bytes-to-bits
> calculations.

I'll change it.

>
>> for_each_hw_dtc_idx(hw, j, idx)
>> cmn->dtc[j].counters[idx] = NULL;
>> @@ -1835,10 +1918,11 @@ static int arm_cmn_event_add(struct perf_event
>> *event, int flags)
>> if (type == CMN_TYPE_XP) {
>> input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
>> } else if (type == CMN_TYPE_WP) {
>> - int tmp, wp_idx = arm_cmn_wp_idx(event);
>> + int tmp, wp_idx;
>> u32 cfg = arm_cmn_wp_config(event);
>> - if (dtm->wp_event[wp_idx] >= 0)
>> + wp_idx = arm_cmn_find_free_wp_idx(cmn, dtm, event);
>> + if (wp_idx < 0)
>
> TBH I'm not convinced it's even worth factoring out the "allocator" here,
> since inline it can be as simple as:
>
> int tmp, wp_idx = CMN_EVENT_EVENTID(event);
> ...
> if (dtm->wp_event[wp_idx] && dtm->wp_event[++wp_idx])
>
> (or perhaps follow the same while/if shape as for dtm_idx further up, if you
> think it's worth being more clear than concise)

I'd rather keep them in their own functions to be more consistent and
slightly clearer.

Cheers, Ilkka

>
> Thanks,
> Robin.
>
>> goto free_dtms;
>> tmp = dtm->wp_event[wp_idx ^ 1];
>> @@ -1847,7 +1931,8 @@ static int arm_cmn_event_add(struct perf_event
>> *event, int flags)
>> goto free_dtms;
>> input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
>> - dtm->wp_event[wp_idx] = hw->dtc_idx[d];
>> +
>> + arm_cmn_claim_wp_idx(cmn, dtm, event, dn, wp_idx, d);
>> writel_relaxed(cfg, dtm->base +
>> CMN_DTM_WPn_CONFIG(wp_idx));
>> } else {
>> struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
>