2023-11-18 15:52:42

by Yury Norov

[permalink] [raw]
Subject: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API

Switch subsystem to use atomic find_bit() or atomic iterators as
appropriate.

Signed-off-by: Yury Norov <[email protected]>
---
drivers/perf/arm-cci.c | 23 +++++------------------
drivers/perf/arm-ccn.c | 10 ++--------
drivers/perf/arm_dmc620_pmu.c | 9 ++-------
drivers/perf/arm_pmuv3.c | 8 ++------
4 files changed, 11 insertions(+), 39 deletions(-)

diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index 61de861eaf91..70fbf9d09d37 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
return CCI400_PMU_CYCLE_CNTR_IDX;
}

- for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
- if (!test_and_set_bit(idx, hw->used_mask))
- return idx;
-
- /* No counters available */
- return -EAGAIN;
+ idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
+ return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
}

static int cci400_validate_hw_event(struct cci_pmu *cci_pmu, unsigned long hw_event)
@@ -802,13 +798,8 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev
if (cci_pmu->model->get_event_idx)
return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event);

- /* Generic code to find an unused idx from the mask */
- for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++)
- if (!test_and_set_bit(idx, hw->used_mask))
- return idx;
-
- /* No counters available */
- return -EAGAIN;
+ idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
+ return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
}

static int pmu_map_event(struct perf_event *event)
@@ -861,12 +852,8 @@ static void pmu_free_irq(struct cci_pmu *cci_pmu)
{
int i;

- for (i = 0; i < cci_pmu->nr_irqs; i++) {
- if (!test_and_clear_bit(i, &cci_pmu->active_irqs))
- continue;
-
+ for_each_test_and_clear_bit(i, &cci_pmu->active_irqs, cci_pmu->nr_irqs)
free_irq(cci_pmu->irqs[i], cci_pmu);
- }
}

static u32 pmu_read_counter(struct perf_event *event)
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 728d13d8e98a..d657701b1f23 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -589,15 +589,9 @@ static const struct attribute_group *arm_ccn_pmu_attr_groups[] = {

static int arm_ccn_pmu_alloc_bit(unsigned long *bitmap, unsigned long size)
{
- int bit;
-
- do {
- bit = find_first_zero_bit(bitmap, size);
- if (bit >= size)
- return -EAGAIN;
- } while (test_and_set_bit(bit, bitmap));
+ int bit = find_and_set_bit(bitmap, size);

- return bit;
+ return bit < size ? bit : -EAGAIN;
}

/* All RN-I and RN-D nodes have identical PMUs */
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 30cea6859574..e41c84dabc3e 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
end_idx = DMC620_PMU_MAX_COUNTERS;
}

- for (idx = start_idx; idx < end_idx; ++idx) {
- if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
- return idx;
- }
-
- /* The counters are all in use. */
- return -EAGAIN;
+ idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
+ return idx < end_idx ? idx : -EAGAIN;
}

static inline
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 18b91b56af1d..784b0383e9f8 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -825,13 +825,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
struct arm_pmu *cpu_pmu)
{
- int idx;
+ int idx = find_and_set_next_bit(cpuc->used_mask, cpu_pmu->num_events, ARMV8_IDX_COUNTER0);

- for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) {
- if (!test_and_set_bit(idx, cpuc->used_mask))
- return idx;
- }
- return -EAGAIN;
+ return idx < cpu_pmu->num_events ? idx : -EAGAIN;
}

static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
--
2.39.2


2023-11-21 15:54:28

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API

On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> Switch subsystem to use atomic find_bit() or atomic iterators as
> appropriate.
>
> Signed-off-by: Yury Norov <[email protected]>
> ---
> drivers/perf/arm-cci.c | 23 +++++------------------
> drivers/perf/arm-ccn.c | 10 ++--------
> drivers/perf/arm_dmc620_pmu.c | 9 ++-------
> drivers/perf/arm_pmuv3.c | 8 ++------
> 4 files changed, 11 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> index 61de861eaf91..70fbf9d09d37 100644
> --- a/drivers/perf/arm-cci.c
> +++ b/drivers/perf/arm-cci.c
> @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
> return CCI400_PMU_CYCLE_CNTR_IDX;
> }
>
> - for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> - if (!test_and_set_bit(idx, hw->used_mask))
> - return idx;
> -
> - /* No counters available */
> - return -EAGAIN;
> + idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);

CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?

[...]

> diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> index 30cea6859574..e41c84dabc3e 100644
> --- a/drivers/perf/arm_dmc620_pmu.c
> +++ b/drivers/perf/arm_dmc620_pmu.c
> @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
> end_idx = DMC620_PMU_MAX_COUNTERS;
> }
>
> - for (idx = start_idx; idx < end_idx; ++idx) {
> - if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> - return idx;
> - }
> -
> - /* The counters are all in use. */
> - return -EAGAIN;
> + idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);

It might just be me, but I'd find this a tonne easier to read if you swapped
the last two arguments around so that the offset came before the limit in
the new function.

Will

2023-11-21 16:18:07

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API

On Tue, Nov 21, 2023 at 08:16:13AM -0800, Yury Norov wrote:
> On Tue, Nov 21, 2023 at 03:53:44PM +0000, Will Deacon wrote:
> > On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> > > Switch subsystem to use atomic find_bit() or atomic iterators as
> > > appropriate.
> > >
> > > Signed-off-by: Yury Norov <[email protected]>
> > > ---
> > > drivers/perf/arm-cci.c | 23 +++++------------------
> > > drivers/perf/arm-ccn.c | 10 ++--------
> > > drivers/perf/arm_dmc620_pmu.c | 9 ++-------
> > > drivers/perf/arm_pmuv3.c | 8 ++------
> > > 4 files changed, 11 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> > > index 61de861eaf91..70fbf9d09d37 100644
> > > --- a/drivers/perf/arm-cci.c
> > > +++ b/drivers/perf/arm-cci.c
> > > @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
> > > return CCI400_PMU_CYCLE_CNTR_IDX;
> > > }
> > >
> > > - for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> > > - if (!test_and_set_bit(idx, hw->used_mask))
> > > - return idx;
> > > -
> > > - /* No counters available */
> > > - return -EAGAIN;
> > > + idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
> >
> > CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?
>
> You're right. Will fix in v2
>
> > [...]
> >
> > > diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> > > index 30cea6859574..e41c84dabc3e 100644
> > > --- a/drivers/perf/arm_dmc620_pmu.c
> > > +++ b/drivers/perf/arm_dmc620_pmu.c
> > > @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
> > > end_idx = DMC620_PMU_MAX_COUNTERS;
> > > }
> > >
> > > - for (idx = start_idx; idx < end_idx; ++idx) {
> > > - if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> > > - return idx;
> > > - }
> > > -
> > > - /* The counters are all in use. */
> > > - return -EAGAIN;
> > > + idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
> >
> > It might just be me, but I'd find this a tonne easier to read if you swapped
> > the last two arguments around so that the offset came before the limit in
> > the new function.
>
> I personally agree, but we already have find_next_*_bit(addr, nbits, offset)
> functions, and having atomic versions of the same with different order
> of arguments will make it even more messy...

Urgh, and there's loads of them too :(

Will

2023-11-21 16:18:16

by Yury Norov

[permalink] [raw]
Subject: Re: [PATCH 07/34] perf/arm: optimize opencoded atomic find_bit() API

On Tue, Nov 21, 2023 at 03:53:44PM +0000, Will Deacon wrote:
> On Sat, Nov 18, 2023 at 07:50:38AM -0800, Yury Norov wrote:
> > Switch subsystem to use atomic find_bit() or atomic iterators as
> > appropriate.
> >
> > Signed-off-by: Yury Norov <[email protected]>
> > ---
> > drivers/perf/arm-cci.c | 23 +++++------------------
> > drivers/perf/arm-ccn.c | 10 ++--------
> > drivers/perf/arm_dmc620_pmu.c | 9 ++-------
> > drivers/perf/arm_pmuv3.c | 8 ++------
> > 4 files changed, 11 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
> > index 61de861eaf91..70fbf9d09d37 100644
> > --- a/drivers/perf/arm-cci.c
> > +++ b/drivers/perf/arm-cci.c
> > @@ -320,12 +320,8 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
> > return CCI400_PMU_CYCLE_CNTR_IDX;
> > }
> >
> > - for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
> > - if (!test_and_set_bit(idx, hw->used_mask))
> > - return idx;
> > -
> > - /* No counters available */
> > - return -EAGAIN;
> > + idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
>
> CCI400_PMU_CNTR0_IDX is defined as 1, so isn't this wrong?

You're right. Will fix in v2

> [...]
>
> > diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
> > index 30cea6859574..e41c84dabc3e 100644
> > --- a/drivers/perf/arm_dmc620_pmu.c
> > +++ b/drivers/perf/arm_dmc620_pmu.c
> > @@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
> > end_idx = DMC620_PMU_MAX_COUNTERS;
> > }
> >
> > - for (idx = start_idx; idx < end_idx; ++idx) {
> > - if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
> > - return idx;
> > - }
> > -
> > - /* The counters are all in use. */
> > - return -EAGAIN;
> > + idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
>
> It might just be me, but I'd find this a tonne easier to read if you swapped
> the last two arguments around so that the offset came before the limit in
> the new function.

I personally agree, but we already have find_next_*_bit(addr, nbits, offset)
functions, and having atomic versions of the same with different order
of arguments will make it even more messy...

Thanks,
Yury