2022-07-04 15:23:18

by Marco Elver

[permalink] [raw]
Subject: [PATCH v3 04/14] perf/hw_breakpoint: Optimize list of per-task breakpoints

On a machine with 256 CPUs, running the recently added perf breakpoint
benchmark results in:

| $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
| # Running 'breakpoint/thread' benchmark:
| # Created/joined 30 threads with 4 breakpoints and 64 parallelism
| Total time: 236.418 [sec]
|
| 123134.794271 usecs/op
| 7880626.833333 usecs/op/cpu

The benchmark tests inherited breakpoint perf events across many
threads.

Looking at a perf profile, we can see that the majority of the time is
spent in various hw_breakpoint.c functions, which execute within the
'nr_bp_mutex' critical sections which then results in contention on that
mutex as well:

37.27% [kernel] [k] osq_lock
34.92% [kernel] [k] mutex_spin_on_owner
12.15% [kernel] [k] toggle_bp_slot
11.90% [kernel] [k] __reserve_bp_slot

The culprit here is task_bp_pinned(), which has a runtime complexity of
O(#tasks) due to storing all task breakpoints in the same list and
iterating through that list looking for a matching task. Clearly, this
does not scale to thousands of tasks.

Instead, make use of the "rhashtable" variant "rhltable" which stores
multiple items with the same key in a list. This results in average
runtime complexity of O(1) for task_bp_pinned().

With the optimization, the benchmark shows:

| $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
| # Running 'breakpoint/thread' benchmark:
| # Created/joined 30 threads with 4 breakpoints and 64 parallelism
| Total time: 0.208 [sec]
|
| 108.422396 usecs/op
| 6939.033333 usecs/op/cpu

On this particular setup that's a speedup of ~1135x.

While one option would be to make task_struct a breakpoint list node,
this would only further bloat task_struct for infrequently used data.
Furthermore, after all optimizations in this series, there's no evidence
it would result in better performance: later optimizations make the time
spent looking up entries in the hash table negligible (we'll reach the
theoretical ideal performance i.e. no constraints).

Signed-off-by: Marco Elver <[email protected]>
Reviewed-by: Dmitry Vyukov <[email protected]>
---
v2:
* Commit message tweaks.
---
include/linux/perf_event.h | 3 +-
kernel/events/hw_breakpoint.c | 56 ++++++++++++++++++++++-------------
2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 01231f1d976c..e27360436dc6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif

@@ -178,7 +179,7 @@ struct hw_perf_event {
* creation and event initalization.
*/
struct arch_hw_breakpoint info;
- struct list_head bp_list;
+ struct rhlist_head bp_list;
};
#endif
struct { /* amd_iommu */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6076c6346291..6d09edc80d19 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -26,10 +26,10 @@
#include <linux/irqflags.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
#include <linux/slab.h>

@@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
}

/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};

static int constraints_initialized;

@@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;

- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}

+out:
+ rcu_read_unlock();
return count;
}

@@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
+static int
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
@@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
/* Pinned counter cpu profiling */
if (!bp->hw.target) {
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ return 0;
}

/* Pinned counter task profiling */
@@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
toggle_bp_task_slot(bp, cpu, type, weight);

if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
else
- list_del(&bp->hw.bp_list);
+ return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
}

__weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
if (ret)
return ret;

- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}

int reserve_bp_slot(struct perf_event *bp)
@@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)

type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}

void release_bp_slot(struct perf_event *bp)
@@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
int cpu, err_cpu;
- int i;
+ int i, ret;

for (i = 0; i < TYPE_MAX; i++)
nr_slots[i] = hw_breakpoint_slots(i);
@@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void)

info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
+ if (!info->tsk_pinned) {
+ ret = -ENOMEM;
+ goto err;
+ }
}
}

+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ goto err;
+
constraints_initialized = 1;

perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);

return register_die_notifier(&hw_breakpoint_exceptions_nb);

- err_alloc:
+err:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
kfree(get_bp_info(err_cpu, i)->tsk_pinned);
@@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void)
break;
}

- return -ENOMEM;
+ return ret;
}
-
-
--
2.37.0.rc0.161.g10f37bed90-goog


2022-07-20 15:49:24

by Ian Rogers

[permalink] [raw]
Subject: Re: [PATCH v3 04/14] perf/hw_breakpoint: Optimize list of per-task breakpoints

On Mon, Jul 4, 2022 at 8:06 AM Marco Elver <[email protected]> wrote:
>
> On a machine with 256 CPUs, running the recently added perf breakpoint
> benchmark results in:
>
> | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> | # Running 'breakpoint/thread' benchmark:
> | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> | Total time: 236.418 [sec]
> |
> | 123134.794271 usecs/op
> | 7880626.833333 usecs/op/cpu
>
> The benchmark tests inherited breakpoint perf events across many
> threads.
>
> Looking at a perf profile, we can see that the majority of the time is
> spent in various hw_breakpoint.c functions, which execute within the
> 'nr_bp_mutex' critical sections which then results in contention on that
> mutex as well:
>
> 37.27% [kernel] [k] osq_lock
> 34.92% [kernel] [k] mutex_spin_on_owner
> 12.15% [kernel] [k] toggle_bp_slot
> 11.90% [kernel] [k] __reserve_bp_slot
>
> The culprit here is task_bp_pinned(), which has a runtime complexity of
> O(#tasks) due to storing all task breakpoints in the same list and
> iterating through that list looking for a matching task. Clearly, this
> does not scale to thousands of tasks.
>
> Instead, make use of the "rhashtable" variant "rhltable" which stores
> multiple items with the same key in a list. This results in average
> runtime complexity of O(1) for task_bp_pinned().
>
> With the optimization, the benchmark shows:
>
> | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> | # Running 'breakpoint/thread' benchmark:
> | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> | Total time: 0.208 [sec]
> |
> | 108.422396 usecs/op
> | 6939.033333 usecs/op/cpu
>
> On this particular setup that's a speedup of ~1135x.
>
> While one option would be to make task_struct a breakpoint list node,
> this would only further bloat task_struct for infrequently used data.
> Furthermore, after all optimizations in this series, there's no evidence
> it would result in better performance: later optimizations make the time
> spent looking up entries in the hash table negligible (we'll reach the
> theoretical ideal performance i.e. no constraints).
>
> Signed-off-by: Marco Elver <[email protected]>
> Reviewed-by: Dmitry Vyukov <[email protected]>
> ---
> v2:
> * Commit message tweaks.
> ---
> include/linux/perf_event.h | 3 +-
> kernel/events/hw_breakpoint.c | 56 ++++++++++++++++++++++-------------
> 2 files changed, 37 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 01231f1d976c..e27360436dc6 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
> };
>
> #ifdef CONFIG_HAVE_HW_BREAKPOINT
> +#include <linux/rhashtable-types.h>
> #include <asm/hw_breakpoint.h>
> #endif
>
> @@ -178,7 +179,7 @@ struct hw_perf_event {
> * creation and event initalization.
> */
> struct arch_hw_breakpoint info;
> - struct list_head bp_list;
> + struct rhlist_head bp_list;

nit: perhaps it would be more intention revealing here to rename this
to bp_hashtable?

Acked-by: Ian Rogers <[email protected]>

Thanks,
Ian

> };
> #endif
> struct { /* amd_iommu */
> diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
> index 6076c6346291..6d09edc80d19 100644
> --- a/kernel/events/hw_breakpoint.c
> +++ b/kernel/events/hw_breakpoint.c
> @@ -26,10 +26,10 @@
> #include <linux/irqflags.h>
> #include <linux/kdebug.h>
> #include <linux/kernel.h>
> -#include <linux/list.h>
> #include <linux/mutex.h>
> #include <linux/notifier.h>
> #include <linux/percpu.h>
> +#include <linux/rhashtable.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
>
> @@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
> }
>
> /* Keep track of the breakpoints attached to tasks */
> -static LIST_HEAD(bp_task_head);
> +static struct rhltable task_bps_ht;
> +static const struct rhashtable_params task_bps_ht_params = {
> + .head_offset = offsetof(struct hw_perf_event, bp_list),
> + .key_offset = offsetof(struct hw_perf_event, target),
> + .key_len = sizeof_field(struct hw_perf_event, target),
> + .automatic_shrinking = true,
> +};
>
> static int constraints_initialized;
>
> @@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
> */
> static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
> {
> - struct task_struct *tsk = bp->hw.target;
> + struct rhlist_head *head, *pos;
> struct perf_event *iter;
> int count = 0;
>
> - list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
> - if (iter->hw.target == tsk &&
> - find_slot_idx(iter->attr.bp_type) == type &&
> + rcu_read_lock();
> + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
> + if (!head)
> + goto out;
> +
> + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
> + if (find_slot_idx(iter->attr.bp_type) == type &&
> (iter->cpu < 0 || cpu == iter->cpu))
> count += hw_breakpoint_weight(iter);
> }
>
> +out:
> + rcu_read_unlock();
> return count;
> }
>
> @@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
> /*
> * Add/remove the given breakpoint in our constraint table
> */
> -static void
> +static int
> toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
> int weight)
> {
> @@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
> /* Pinned counter cpu profiling */
> if (!bp->hw.target) {
> get_bp_info(bp->cpu, type)->cpu_pinned += weight;
> - return;
> + return 0;
> }
>
> /* Pinned counter task profiling */
> @@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
> toggle_bp_task_slot(bp, cpu, type, weight);
>
> if (enable)
> - list_add_tail(&bp->hw.bp_list, &bp_task_head);
> + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
> else
> - list_del(&bp->hw.bp_list);
> + return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
> }
>
> __weak int arch_reserve_bp_slot(struct perf_event *bp)
> @@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
> if (ret)
> return ret;
>
> - toggle_bp_slot(bp, true, type, weight);
> -
> - return 0;
> + return toggle_bp_slot(bp, true, type, weight);
> }
>
> int reserve_bp_slot(struct perf_event *bp)
> @@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
>
> type = find_slot_idx(bp_type);
> weight = hw_breakpoint_weight(bp);
> - toggle_bp_slot(bp, false, type, weight);
> + WARN_ON(toggle_bp_slot(bp, false, type, weight));
> }
>
> void release_bp_slot(struct perf_event *bp)
> @@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = {
> int __init init_hw_breakpoint(void)
> {
> int cpu, err_cpu;
> - int i;
> + int i, ret;
>
> for (i = 0; i < TYPE_MAX; i++)
> nr_slots[i] = hw_breakpoint_slots(i);
> @@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void)
>
> info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
> GFP_KERNEL);
> - if (!info->tsk_pinned)
> - goto err_alloc;
> + if (!info->tsk_pinned) {
> + ret = -ENOMEM;
> + goto err;
> + }
> }
> }
>
> + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
> + if (ret)
> + goto err;
> +
> constraints_initialized = 1;
>
> perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
>
> return register_die_notifier(&hw_breakpoint_exceptions_nb);
>
> - err_alloc:
> +err:
> for_each_possible_cpu(err_cpu) {
> for (i = 0; i < TYPE_MAX; i++)
> kfree(get_bp_info(err_cpu, i)->tsk_pinned);
> @@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void)
> break;
> }
>
> - return -ENOMEM;
> + return ret;
> }
> -
> -
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>

2022-07-20 16:41:45

by Marco Elver

[permalink] [raw]
Subject: Re: [PATCH v3 04/14] perf/hw_breakpoint: Optimize list of per-task breakpoints

On Wed, 20 Jul 2022 at 17:29, Ian Rogers <[email protected]> wrote:
>
> On Mon, Jul 4, 2022 at 8:06 AM Marco Elver <[email protected]> wrote:
> >
> > On a machine with 256 CPUs, running the recently added perf breakpoint
> > benchmark results in:
> >
> > | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> > | # Running 'breakpoint/thread' benchmark:
> > | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> > | Total time: 236.418 [sec]
> > |
> > | 123134.794271 usecs/op
> > | 7880626.833333 usecs/op/cpu
> >
> > The benchmark tests inherited breakpoint perf events across many
> > threads.
> >
> > Looking at a perf profile, we can see that the majority of the time is
> > spent in various hw_breakpoint.c functions, which execute within the
> > 'nr_bp_mutex' critical sections which then results in contention on that
> > mutex as well:
> >
> > 37.27% [kernel] [k] osq_lock
> > 34.92% [kernel] [k] mutex_spin_on_owner
> > 12.15% [kernel] [k] toggle_bp_slot
> > 11.90% [kernel] [k] __reserve_bp_slot
> >
> > The culprit here is task_bp_pinned(), which has a runtime complexity of
> > O(#tasks) due to storing all task breakpoints in the same list and
> > iterating through that list looking for a matching task. Clearly, this
> > does not scale to thousands of tasks.
> >
> > Instead, make use of the "rhashtable" variant "rhltable" which stores
> > multiple items with the same key in a list. This results in average
> > runtime complexity of O(1) for task_bp_pinned().
> >
> > With the optimization, the benchmark shows:
> >
> > | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
> > | # Running 'breakpoint/thread' benchmark:
> > | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
> > | Total time: 0.208 [sec]
> > |
> > | 108.422396 usecs/op
> > | 6939.033333 usecs/op/cpu
> >
> > On this particular setup that's a speedup of ~1135x.
> >
> > While one option would be to make task_struct a breakpoint list node,
> > this would only further bloat task_struct for infrequently used data.
> > Furthermore, after all optimizations in this series, there's no evidence
> > it would result in better performance: later optimizations make the time
> > spent looking up entries in the hash table negligible (we'll reach the
> > theoretical ideal performance i.e. no constraints).
> >
> > Signed-off-by: Marco Elver <[email protected]>
> > Reviewed-by: Dmitry Vyukov <[email protected]>
> > ---
> > v2:
> > * Commit message tweaks.
> > ---
> > include/linux/perf_event.h | 3 +-
> > kernel/events/hw_breakpoint.c | 56 ++++++++++++++++++++++-------------
> > 2 files changed, 37 insertions(+), 22 deletions(-)
> >
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index 01231f1d976c..e27360436dc6 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
> > };
> >
> > #ifdef CONFIG_HAVE_HW_BREAKPOINT
> > +#include <linux/rhashtable-types.h>
> > #include <asm/hw_breakpoint.h>
> > #endif
> >
> > @@ -178,7 +179,7 @@ struct hw_perf_event {
> > * creation and event initalization.
> > */
> > struct arch_hw_breakpoint info;
> > - struct list_head bp_list;
> > + struct rhlist_head bp_list;
>
> nit: perhaps it would be more intention revealing here to rename this
> to bp_hashtable?

The naming convention for uses of rhlist_head appears to be either
'list' or 'node' (also inside lib/rhashtable.c). I think this makes
sense because internally this struct is used to just append to the
bucket's list.

> Acked-by: Ian Rogers <[email protected]>

Thanks!
-- Marco