2023-10-04 04:13:06

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

The commit bd2756811766 ("perf: Rewrite core context handling") removed
the cgrp_cpuctx_list that linked active cpu (pmu) contexts together. It
was used in perf_cgroup_switch() to access cgroup events only.

But after the change, it ended up iterating all pmus/events in the cpu
context if there's a cgroup event somewhere on the cpu context.
Unfortunately it includes uncore pmus which have much longer latency to
control.

That regressed some load tests occasionally (only when unrelated perf
stat with cgroup events and uncore events ran on the same cpu) due to
increased context switch time.

AFAIK we don't have a tool to measure the context switch overhead
directly. (I think I should add one to perf ftrace latency). But I can
see it with a simple perf bench command like this.

$ perf bench sched pipe -l 100000
# Running 'sched/pipe' benchmark:
# Executed 100000 pipe operations between two processes

Total time: 0.650 [sec]

6.505740 usecs/op
153710 ops/sec

It runs two tasks that communicate with each other using a pipe, so it
should stress the context switch code. These are the normal numbers on
my system. But after I run these two perf stat commands in the
background, the numbers vary a lot.

$ sudo perf stat -a -e cycles -G user.slice -- sleep 100000 &
$ sudo perf stat -a -e uncore_imc/cas_count_read/ -- sleep 10000 &

I will show the last two lines of perf bench sched pipe output for
three runs.

58.597060 usecs/op # run 1
17065 ops/sec

11.329240 usecs/op # run 2
88267 ops/sec

88.481920 usecs/op # run 3
11301 ops/sec

I think the deviation comes from the fact that uncore events are managed
by a certain number of cpus only. If the target process runs on a cpu that
manages an uncore pmu, it'd take longer. Otherwise it won't affect the
performance much.

To fix the issue, I restored a linked list equivalent to cgrp_cpuctx_list
in the perf_cpu_context and linked perf_cpu_pmu_contexts that have cgroup
events only. Also added new helpers to enable/disable pmus and do ctx
sched in/out for cgroups.

After the change, I got something like this constantly. It's slightly
higher than the normal, but it includes actual cgroup event switch time.

8.970910 usecs/op
111471 ops/sec

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Cc: [email protected]
Signed-off-by: Namhyung Kim <[email protected]>
---
include/linux/perf_event.h | 5 ++
kernel/events/core.c | 117 +++++++++++++++++++++++++++++++++----
2 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..7d56d7aa6b34 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -972,6 +972,10 @@ struct perf_cpu_pmu_context {
struct perf_event_pmu_context epc;
struct perf_event_pmu_context *task_epc;

+#ifdef CONFIG_CGROUP_PERF
+ struct list_head cgrp_ctx_entry;
+#endif
+
struct list_head sched_cb_entry;
int sched_cb_usage;

@@ -994,6 +998,7 @@ struct perf_cpu_context {

#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
+ struct list_head cgrp_ctx_list;
#endif

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..06b39b8066a9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -700,11 +700,73 @@ static void perf_ctx_enable(struct perf_event_context *ctx)
perf_pmu_enable(pmu_ctx->pmu);
}

+static int __ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static int __ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type);
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu);
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu);

#ifdef CONFIG_CGROUP_PERF

+static void perf_cgrp_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+ list_for_each_entry(cpc, &cpuctx->cgrp_ctx_list, cgrp_ctx_entry)
+ perf_pmu_disable(cpc->epc.pmu);
+}
+
+static void perf_cgrp_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+ list_for_each_entry(cpc, &cpuctx->cgrp_ctx_list, cgrp_ctx_entry)
+ perf_pmu_enable(cpc->epc.pmu);
+}
+
+static void cgrp_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ struct perf_cpu_pmu_context *cpc;
+ int is_active = __ctx_sched_out(&cpuctx->ctx, event_type);
+
+ if (is_active < 0)
+ return;
+
+ list_for_each_entry(cpc, &cpuctx->cgrp_ctx_list, cgrp_ctx_entry)
+ __pmu_ctx_sched_out(&cpc->epc, is_active);
+}
+
+static void cgrp_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ struct perf_cpu_pmu_context *cpc;
+ int is_active = __ctx_sched_in(&cpuctx->ctx, event_type);
+
+ if (is_active < 0)
+ return;
+
+ list_for_each_entry(cpc, &cpuctx->cgrp_ctx_list, cgrp_ctx_entry) {
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ if (is_active & EVENT_PINNED)
+ ctx_pinned_sched_in(&cpuctx->ctx, cpc->epc.pmu);
+
+ /* Then walk through the lower prio flexible groups */
+ if (is_active & EVENT_FLEXIBLE)
+ ctx_flexible_sched_in(&cpuctx->ctx, cpc->epc.pmu);
+ }
+}
+
static inline bool
perf_cgroup_match(struct perf_event *event)
{
@@ -856,9 +918,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_cgrp_ctx_disable(&cpuctx->ctx);

- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ cgrp_ctx_sched_out(cpuctx, EVENT_ALL);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +932,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ cgrp_ctx_sched_in(cpuctx, EVENT_ALL);

- perf_ctx_enable(&cpuctx->ctx);
+ perf_cgrp_ctx_enable(&cpuctx->ctx);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

@@ -961,6 +1023,7 @@ static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;

if (!is_cgroup_event(event))
return;
@@ -975,12 +1038,16 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
return;

cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
+
+ cpc = container_of(event->pmu_ctx, struct perf_cpu_pmu_context, epc);
+ list_add(&cpc->cgrp_ctx_entry, &cpuctx->cgrp_ctx_list);
}

static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;

if (!is_cgroup_event(event))
return;
@@ -995,6 +1062,9 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
return;

cpuctx->cgrp = NULL;
+
+ cpc = container_of(event->pmu_ctx, struct perf_cpu_pmu_context, epc);
+ list_del(&cpc->cgrp_ctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */
@@ -3238,11 +3308,10 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}

-static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+static int
+__ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
- struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;

lockdep_assert_held(&ctx->lock);
@@ -3254,7 +3323,7 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
WARN_ON_ONCE(ctx->is_active);
if (ctx->task)
WARN_ON_ONCE(cpuctx->task_ctx);
- return;
+ return -1;
}

/*
@@ -3290,6 +3359,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)

is_active ^= ctx->is_active; /* changed bits */

+ return is_active;
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+ int is_active = __ctx_sched_out(ctx, event_type);
+
+ if (is_active < 0)
+ return;
+
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
__pmu_ctx_sched_out(pmu_ctx, is_active);
}
@@ -3861,8 +3942,8 @@ static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
ctx_flexible_sched_in(ctx, pmu);
}

-static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+static int
+__ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
@@ -3870,7 +3951,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
lockdep_assert_held(&ctx->lock);

if (likely(!ctx->nr_events))
- return;
+ return -1;

if (!(is_active & EVENT_TIME)) {
/* start ctx time */
@@ -3893,6 +3974,17 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)

is_active ^= ctx->is_active; /* changed bits */

+ return is_active;
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ int is_active = __ctx_sched_in(ctx, event_type);
+
+ if (is_active < 0)
+ return;
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -13541,6 +13633,9 @@ static void __init perf_event_init_all_cpus(void)
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
cpuctx->heap = cpuctx->heap_default;
+#ifdef CONFIG_CGROUP_PERF
+ INIT_LIST_HEAD(&cpuctx->cgrp_ctx_list);
+#endif
}
}

--
2.42.0.582.g8ccd20d70d-goog


2023-10-04 07:26:57

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list


* Namhyung Kim <[email protected]> wrote:

> AFAIK we don't have a tool to measure the context switch overhead
> directly. (I think I should add one to perf ftrace latency). But I can
> see it with a simple perf bench command like this.
>
> $ perf bench sched pipe -l 100000
> # Running 'sched/pipe' benchmark:
> # Executed 100000 pipe operations between two processes
>
> Total time: 0.650 [sec]
>
> 6.505740 usecs/op
> 153710 ops/sec
>
> It runs two tasks communicate each other using a pipe so it should
> stress the context switch code. This is the normal numbers on my
> system. But after I run these two perf stat commands in background,
> the numbers vary a lot.
>
> $ sudo perf stat -a -e cycles -G user.slice -- sleep 100000 &
> $ sudo perf stat -a -e uncore_imc/cas_count_read/ -- sleep 10000 &
>
> I will show the last two lines of perf bench sched pipe output for
> three runs.
>
> 58.597060 usecs/op # run 1
> 17065 ops/sec
>
> 11.329240 usecs/op # run 2
> 88267 ops/sec
>
> 88.481920 usecs/op # run 3
> 11301 ops/sec
>
> I think the deviation comes from the fact that uncore events are managed
> a certain number of cpus only. If the target process runs on a cpu that
> manages uncore pmu, it'd take longer. Otherwise it won't affect the
> performance much.

The numbers of pipe-message context switching will vary a lot depending on
CPU migration patterns as well.

The best way to measure context-switch overhead is to pin that task
to a single CPU with something like:

$ taskset 1 perf stat --null --repeat 10 perf bench sched pipe -l 10000 >/dev/null

Performance counter stats for 'perf bench sched pipe -l 10000' (10 runs):

0.049798 +- 0.000102 seconds time elapsed ( +- 0.21% )

as you can see the 0.21% stddev is pretty low.

If we allow 2 CPUs, both runtime and stddev is much higher:

$ taskset 3 perf stat --null --repeat 10 perf bench sched pipe -l 10000 >/dev/null

Performance counter stats for 'perf bench sched pipe -l 10000' (10 runs):

1.4835 +- 0.0383 seconds time elapsed ( +- 2.58% )

Thanks,

Ingo

2023-10-04 15:01:52

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

Hi Ingo,

On Wed, Oct 4, 2023 at 12:26 AM Ingo Molnar <[email protected]> wrote:
>
>
> * Namhyung Kim <[email protected]> wrote:
>
> > AFAIK we don't have a tool to measure the context switch overhead
> > directly. (I think I should add one to perf ftrace latency). But I can
> > see it with a simple perf bench command like this.
> >
> > $ perf bench sched pipe -l 100000
> > # Running 'sched/pipe' benchmark:
> > # Executed 100000 pipe operations between two processes
> >
> > Total time: 0.650 [sec]
> >
> > 6.505740 usecs/op
> > 153710 ops/sec
> >
> > It runs two tasks communicate each other using a pipe so it should
> > stress the context switch code. This is the normal numbers on my
> > system. But after I run these two perf stat commands in background,
> > the numbers vary a lot.
> >
> > $ sudo perf stat -a -e cycles -G user.slice -- sleep 100000 &
> > $ sudo perf stat -a -e uncore_imc/cas_count_read/ -- sleep 10000 &
> >
> > I will show the last two lines of perf bench sched pipe output for
> > three runs.
> >
> > 58.597060 usecs/op # run 1
> > 17065 ops/sec
> >
> > 11.329240 usecs/op # run 2
> > 88267 ops/sec
> >
> > 88.481920 usecs/op # run 3
> > 11301 ops/sec
> >
> > I think the deviation comes from the fact that uncore events are managed
> > a certain number of cpus only. If the target process runs on a cpu that
> > manages uncore pmu, it'd take longer. Otherwise it won't affect the
> > performance much.
>
> The numbers of pipe-message context switching will vary a lot depending on
> CPU migration patterns as well.
>
> The best way to measure context-switch overhead is to pin that task
> to a single CPU with something like:
>
> $ taskset 1 perf stat --null --repeat 10 perf bench sched pipe -l 10000 >/dev/null
>
> Performance counter stats for 'perf bench sched pipe -l 10000' (10 runs):
>
> 0.049798 +- 0.000102 seconds time elapsed ( +- 0.21% )
>
> as you can see the 0.21% stddev is pretty low.
>
> If we allow 2 CPUs, both runtime and stddev is much higher:
>
> $ taskset 3 perf stat --null --repeat 10 perf bench sched pipe -l 10000 >/dev/null
>
> Performance counter stats for 'perf bench sched pipe -l 10000' (10 runs):
>
> 1.4835 +- 0.0383 seconds time elapsed ( +- 2.58% )

Thanks for taking your time. I should have said I also tried this.
But the problem is not the pure context switch itself; it takes a
switch to a different cgroup to trigger the overhead.

For example, I counted the number of context switches.

$ perf stat -e context-switches,cgroup-switches \
> perf bench sched pipe -l 10000 > /dev/null

Performance counter stats for 'perf bench sched pipe -l 10000':

20,001 context-switches
20,001 cgroup-switches

But if I use the taskset,

$ perf stat -e context-switches,cgroup-switches \
> taskset -c 0 perf bench sched pipe -l 10000 > /dev/null

Performance counter stats for 'taskset -c 0 perf bench sched pipe -l 10000':

20,003 context-switches
2 cgroup-switches

So the regression didn't happen when I used taskset because
the two tasks run on the cpu without changing cgroups.

Maybe I can add an option to perf bench sched to place
senders and receivers in different cgroups.

Thanks,
Namhyung

2023-10-04 15:43:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list


* Namhyung Kim <[email protected]> wrote:

> Maybe I can add an option to perf bench sched to place
> senders and receivers in different cgroups.

That would certainly be useful to measure cgroups overhead.

Thanks,

Ingo

2023-10-04 16:03:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Tue, Oct 03, 2023 at 09:08:44PM -0700, Namhyung Kim wrote:

> But after the change, it ended up iterating all pmus/events in the cpu
> context if there's a cgroup event somewhere on the cpu context.
> Unfortunately it includes uncore pmus which have much longer latency to
> control.

Can you describe the problem in more detail please?

We have cgrp as part of the tree key: {cpu, pmu, cgroup, idx},
so it should be possible to find a specific cgroup for a cpu and or skip
to the next cgroup on that cpu in O(log n) time.

> To fix the issue, I restored a linked list equivalent to cgrp_cpuctx_list
> in the perf_cpu_context and link perf_cpu_pmu_contexts that have cgroup
> events only. Also add new helpers to enable/disable and does ctx sched
> in/out for cgroups.

Adding a list and duplicating the whole scheduling infrastructure seems
'unfortunate' at best.

2023-10-04 16:32:51

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

Hi Peter,

On Wed, Oct 4, 2023 at 9:02 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Oct 03, 2023 at 09:08:44PM -0700, Namhyung Kim wrote:
>
> > But after the change, it ended up iterating all pmus/events in the cpu
> > context if there's a cgroup event somewhere on the cpu context.
> > Unfortunately it includes uncore pmus which have much longer latency to
> > control.
>
> Can you describe the problem in more detail please?

Sure.

>
> We have cgrp as part of the tree key: {cpu, pmu, cgroup, idx},
> so it should be possible to find a specific cgroup for a cpu and or skip
> to the next cgroup on that cpu in O(log n) time.

This is about a single (core) pmu when it has a lot of events.
But this problem is different, it's about accessing more pmus
unnecessarily.

Say we have the following events for CPU 0.

sw: context-switches
core: cycles, cycles-for-cgroup-A
uncore: whatever

The cpu context has a cgroup event so it needs to call
perf_cgroup_switch() at every context switch. But actually
it only needs to resched the 'core' pmu since it only has a
cgroup event. Other pmu events (like context-switches or
any uncore event) should not be bothered by that.

But perf_cgroup_switch() calls the general functions which
iterate all pmus in the (cpu) context.

cpuctx.ctx.pmu_ctx_list:
+-> sw -> core -> uncore (pmu_ctx_entry)

Then it disables pmus, sched-out current events, switch
cgroup pointer, sched-in new events and enable pmus.
This gives a lot more overhead when it has uncore pmus
since accessing MSRs for uncore pmus has longer latency.
But uncore pmus cannot have cgroup events in the first
place.

So we need a separate list to keep pmus that have
active cgroup events.

cpuctx.cgrp_ctx_list:
+-> core (cgrp_ctx_entry)

And we also need a logic to do the same work only
for this list.

Hope this helps.

>
> > To fix the issue, I restored a linked list equivalent to cgrp_cpuctx_list
> > in the perf_cpu_context and link perf_cpu_pmu_contexts that have cgroup
> > events only. Also add new helpers to enable/disable and does ctx sched
> > in/out for cgroups.
>
> Adding a list and duplicating the whole scheduling infrastructure seems
> 'unfortunate' at best.

Yeah, I know.. but I couldn't come up with a better solution.

Thanks,
Namhyung

2023-10-04 21:28:11

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Wed, Oct 04, 2023 at 05:42:49PM +0200, Ingo Molnar wrote:
>
> * Namhyung Kim <[email protected]> wrote:
>
> > Maybe I can add an option to perf bench sched to place
> > senders and receivers in different cgroups.
>
> That would certainly be useful to measure cgroups overhead.

Sent out the change:

https://lore.kernel.org/all/[email protected]/

With that, the numbers became stable. :)

Before)

$ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes

Total time: 0.901 [sec]

90.128700 usecs/op
11095 ops/sec


After)

$ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes

Total time: 0.065 [sec]

6.560100 usecs/op
152436 ops/sec


Thanks,
Namhyung

2023-10-09 21:04:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Wed, Oct 04, 2023 at 09:32:24AM -0700, Namhyung Kim wrote:

> Yeah, I know.. but I couldn't come up with a better solution.

Not been near a compiler, and haven't fully thought it through, but
could something like the below work?


---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 115 +++++++++++++++++++++++----------------------
2 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f31f962a6445..0367d748fae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
unsigned int embedded : 1;

unsigned int nr_events;
+ unsigned int nr_cgroups;

atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 708d474c2ede..f3d5d47ecdfc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

@@ -684,20 +685,26 @@ do { \
___p; \
})

-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_disable(pmu_ctx->pmu);
+ }
}

-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_enable(pmu_ctx->pmu);
+ }
}

static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);

- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);

- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;

+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;

+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

event_type &= EVENT_ALL;

- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) {
- perf_ctx_disable(task_ctx);
+ perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
}

@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

perf_event_sched_in(cpuctx, task_ctx);

- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
- perf_ctx_enable(task_ctx);
+ perf_ctx_enable(task_ctx, false);
}

void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;

lockdep_assert_held(&ctx->lock);

@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)

is_active ^= ctx->is_active; /* changed bits */

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
__pmu_ctx_sched_out(pmu_ctx, is_active);
+ }
}

/*
@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {

- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);

/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3543,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)

if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}

-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
-
- if (pmu) {
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
- }
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
}

-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- int can_add_hw = 1;

- if (pmu) {
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}

-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_flexible_sched_in(ctx, pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}

static void
@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;

lockdep_assert_held(&ctx->lock);

@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);

/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}

static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)

if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

perf_ctx_sched_task_cb(ctx, true);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;

- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
}

@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);

unlock:
perf_ctx_unlock(cpuctx, ctx);

2023-10-10 04:58:07

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

Hi Peter,

On Mon, Oct 9, 2023 at 2:04 PM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Oct 04, 2023 at 09:32:24AM -0700, Namhyung Kim wrote:
>
> > Yeah, I know.. but I couldn't come up with a better solution.
>
> Not been near a compiler, and haven't fully thought it through, but
> could something like the below work?

Thanks for the patch, I think it'd work. Let me test it
and get back to you.

Thanks,
Namhyung

>
>
> ---
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 115 +++++++++++++++++++++++----------------------
> 2 files changed, 61 insertions(+), 55 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index f31f962a6445..0367d748fae0 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -878,6 +878,7 @@ struct perf_event_pmu_context {
> unsigned int embedded : 1;
>
> unsigned int nr_events;
> + unsigned int nr_cgroups;
>
> atomic_t refcount; /* event <-> epc */
> struct rcu_head rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 708d474c2ede..f3d5d47ecdfc 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -375,6 +375,7 @@ enum event_type_t {
> EVENT_TIME = 0x4,
> /* see ctx_resched() for details */
> EVENT_CPU = 0x8,
> + EVENT_CGROUP = 0x10,
> EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> };
>
> @@ -684,20 +685,26 @@ do { \
> ___p; \
> })
>
> -static void perf_ctx_disable(struct perf_event_context *ctx)
> +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
> {
> struct perf_event_pmu_context *pmu_ctx;
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + if (cgroup && !pmu_ctx->nr_cgroups)
> + continue;
> perf_pmu_disable(pmu_ctx->pmu);
> + }
> }
>
> -static void perf_ctx_enable(struct perf_event_context *ctx)
> +static void perf_ctx_enable(struct perf_event_context *ctx. bool cgroup)
> {
> struct perf_event_pmu_context *pmu_ctx;
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + if (cgroup && !pmu_ctx->nr_cgroups)
> + continue;
> perf_pmu_enable(pmu_ctx->pmu);
> + }
> }
>
> static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> @@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
> return;
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> - perf_ctx_disable(&cpuctx->ctx);
> + perf_ctx_disable(&cpuctx->ctx, true);
>
> - ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
> + ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
> /*
> * must not be done before ctxswout due
> * to update_cgrp_time_from_cpuctx() in
> @@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
> * perf_cgroup_set_timestamp() in ctx_sched_in()
> * to not have to pass task around
> */
> - ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
> + ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
>
> - perf_ctx_enable(&cpuctx->ctx);
> + perf_ctx_enable(&cpuctx->ctx, true);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
>
> @@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
> if (!is_cgroup_event(event))
> return;
>
> + event->pmu_ctx->nr_cgroups++;
> +
> /*
> * Because cgroup events are always per-cpu events,
> * @ctx == &cpuctx->ctx.
> @@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
> if (!is_cgroup_event(event))
> return;
>
> + event->pmu_ctx->nr_cgroups--;
> +
> /*
> * Because cgroup events are always per-cpu events,
> * @ctx == &cpuctx->ctx.
> @@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> event_type &= EVENT_ALL;
>
> - perf_ctx_disable(&cpuctx->ctx);
> + perf_ctx_disable(&cpuctx->ctx, false);
> if (task_ctx) {
> - perf_ctx_disable(task_ctx);
> + perf_ctx_disable(task_ctx, false);
> task_ctx_sched_out(task_ctx, event_type);
> }
>
> @@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
> perf_event_sched_in(cpuctx, task_ctx);
>
> - perf_ctx_enable(&cpuctx->ctx);
> + perf_ctx_enable(&cpuctx->ctx, false);
> if (task_ctx)
> - perf_ctx_enable(task_ctx);
> + perf_ctx_enable(task_ctx, false);
> }
>
> void perf_pmu_resched(struct pmu *pmu)
> @@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> struct perf_event_pmu_context *pmu_ctx;
> int is_active = ctx->is_active;
> + bool cgroup = event_type & EVENT_CGROUP;
> +
> + event_type &= ~EVENT_CGROUP;
>
> lockdep_assert_held(&ctx->lock);
>
> @@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>
> is_active ^= ctx->is_active; /* changed bits */
>
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + if (cgroup && !pmu_ctx->nr_cgroups)
> + continue;
> __pmu_ctx_sched_out(pmu_ctx, is_active);
> + }
> }
>
> /*
> @@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
> if (context_equiv(ctx, next_ctx)) {
>
> - perf_ctx_disable(ctx);
> + perf_ctx_disable(ctx, false);
>
> /* PMIs are disabled; ctx->nr_pending is stable. */
> if (local_read(&ctx->nr_pending) ||
> @@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
> perf_ctx_sched_task_cb(ctx, false);
> perf_event_swap_task_ctx_data(ctx, next_ctx);
>
> - perf_ctx_enable(ctx);
> + perf_ctx_enable(ctx, false);
>
> /*
> * RCU_INIT_POINTER here is safe because we've not
> @@ -3526,13 +3543,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>
> if (do_switch) {
> raw_spin_lock(&ctx->lock);
> - perf_ctx_disable(ctx);
> + perf_ctx_disable(ctx, false);
>
> inside_switch:
> perf_ctx_sched_task_cb(ctx, false);
> task_ctx_sched_out(ctx, EVENT_ALL);
>
> - perf_ctx_enable(ctx);
> + perf_ctx_enable(ctx, false);
> raw_spin_unlock(&ctx->lock);
> }
> }
> @@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
> return 0;
> }
>
> -static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void pmu_groups_sched_in(struct perf_event_context *ctx,
> + struct perf_event_groups *groups,
> + struct pmu *pmu)
> {
> - struct perf_event_pmu_context *pmu_ctx;
> int can_add_hw = 1;
> -
> - if (pmu) {
> - visit_groups_merge(ctx, &ctx->pinned_groups,
> - smp_processor_id(), pmu,
> - merge_sched_in, &can_add_hw);
> - } else {
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - can_add_hw = 1;
> - visit_groups_merge(ctx, &ctx->pinned_groups,
> - smp_processor_id(), pmu_ctx->pmu,
> - merge_sched_in, &can_add_hw);
> - }
> - }
> + visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
> + merge_sched_in, &can_add_hw);
> }
>
> -static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void ctx_groups_sched_in(struct perf_event_context *ctx,
> + struct perf_event_groups *groups,
> + bool cgroup)
> {
> struct perf_event_pmu_context *pmu_ctx;
> - int can_add_hw = 1;
>
> - if (pmu) {
> - visit_groups_merge(ctx, &ctx->flexible_groups,
> - smp_processor_id(), pmu,
> - merge_sched_in, &can_add_hw);
> - } else {
> - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> - can_add_hw = 1;
> - visit_groups_merge(ctx, &ctx->flexible_groups,
> - smp_processor_id(), pmu_ctx->pmu,
> - merge_sched_in, &can_add_hw);
> - }
> + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> + if (cgroup && !pmu_ctx->nr_cgroups)
> + continue;
> + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
> }
> }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> + struct pmu *pmu)
> {
> - ctx_flexible_sched_in(ctx, pmu);
> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> }
>
> static void
> @@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> int is_active = ctx->is_active;
> + bool cgroup = event_type & EVENT_CGROUP;
> +
> + event_type &= ~EVENT_CGROUP;
>
> lockdep_assert_held(&ctx->lock);
>
> @@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
> * in order to give them the best chance of going on.
> */
> if (is_active & EVENT_PINNED)
> - ctx_pinned_sched_in(ctx, NULL);
> + ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
>
> /* Then walk through the lower prio flexible groups */
> if (is_active & EVENT_FLEXIBLE)
> - ctx_flexible_sched_in(ctx, NULL);
> + ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
> }
>
> static void perf_event_context_sched_in(struct task_struct *task)
> @@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
>
> if (cpuctx->task_ctx == ctx) {
> perf_ctx_lock(cpuctx, ctx);
> - perf_ctx_disable(ctx);
> + perf_ctx_disable(ctx, false);
>
> perf_ctx_sched_task_cb(ctx, true);
>
> - perf_ctx_enable(ctx);
> + perf_ctx_enable(ctx, false);
> perf_ctx_unlock(cpuctx, ctx);
> goto rcu_unlock;
> }
> @@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
> if (!ctx->nr_events)
> goto unlock;
>
> - perf_ctx_disable(ctx);
> + perf_ctx_disable(ctx, false);
> /*
> * We want to keep the following priority order:
> * cpu pinned (that don't need to move), task pinned,
> @@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
> * events, no need to flip the cpuctx's events around.
> */
> if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
> - perf_ctx_disable(&cpuctx->ctx);
> + perf_ctx_disable(&cpuctx->ctx, false);
> ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
> }
>
> @@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
> perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
>
> if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
> - perf_ctx_enable(&cpuctx->ctx);
> + perf_ctx_enable(&cpuctx->ctx, false);
>
> - perf_ctx_enable(ctx);
> + perf_ctx_enable(ctx, false);
>
> unlock:
> perf_ctx_unlock(cpuctx, ctx);

2023-10-11 03:46:53

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Mon, Oct 9, 2023 at 9:57 PM Namhyung Kim <[email protected]> wrote:
>
> Hi Peter,
>
> On Mon, Oct 9, 2023 at 2:04 PM Peter Zijlstra <[email protected]> wrote:
> >
> > On Wed, Oct 04, 2023 at 09:32:24AM -0700, Namhyung Kim wrote:
> >
> > > Yeah, I know.. but I couldn't come up with a better solution.
> >
> > Not been near a compiler, and haven't fully thought it through, but
> > could something like the below work?
>
> Thanks for the patch, I think it'd work. Let me test it
> and get back to you.

It worked well but contained a typo. See below.

Which way do you want to process this change? Do I send it again
with your S-o-b or will you apply it by yourself? Either is fine, just
let me know. In case of latter, you can add

Tested-by: Namhyung Kim <[email protected]>

> >
> >
> > ---
> > include/linux/perf_event.h | 1 +
> > kernel/events/core.c | 115 +++++++++++++++++++++++----------------------
> > 2 files changed, 61 insertions(+), 55 deletions(-)
> >
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index f31f962a6445..0367d748fae0 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -878,6 +878,7 @@ struct perf_event_pmu_context {
> > unsigned int embedded : 1;
> >
> > unsigned int nr_events;
> > + unsigned int nr_cgroups;
> >
> > atomic_t refcount; /* event <-> epc */
> > struct rcu_head rcu_head;
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 708d474c2ede..f3d5d47ecdfc 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -375,6 +375,7 @@ enum event_type_t {
> > EVENT_TIME = 0x4,
> > /* see ctx_resched() for details */
> > EVENT_CPU = 0x8,
> > + EVENT_CGROUP = 0x10,
> > EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> > };
> >
> > @@ -684,20 +685,26 @@ do { \
> > ___p; \
> > })
> >
> > -static void perf_ctx_disable(struct perf_event_context *ctx)
> > +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
> > {
> > struct perf_event_pmu_context *pmu_ctx;
> >
> > - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> > + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > + if (cgroup && !pmu_ctx->nr_cgroups)
> > + continue;
> > perf_pmu_disable(pmu_ctx->pmu);
> > + }
> > }
> >
> > -static void perf_ctx_enable(struct perf_event_context *ctx)
> > +static void perf_ctx_enable(struct perf_event_context *ctx. bool cgroup)

s/./,/

Thanks,
Namhyung


> > {
> > struct perf_event_pmu_context *pmu_ctx;
> >
> > - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> > + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> > + if (cgroup && !pmu_ctx->nr_cgroups)
> > + continue;
> > perf_pmu_enable(pmu_ctx->pmu);
> > + }
> > }

2023-10-11 07:52:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Tue, Oct 10, 2023 at 08:45:03PM -0700, Namhyung Kim wrote:
> On Mon, Oct 9, 2023 at 9:57 PM Namhyung Kim <[email protected]> wrote:
> >
> > Hi Peter,
> >
> > On Mon, Oct 9, 2023 at 2:04 PM Peter Zijlstra <[email protected]> wrote:
> > >
> > > On Wed, Oct 04, 2023 at 09:32:24AM -0700, Namhyung Kim wrote:
> > >
> > > > Yeah, I know.. but I couldn't come up with a better solution.
> > >
> > > Not been near a compiler, and haven't fully thought it through, but
> > > could something like the below work?
> >
> > Thanks for the patch, I think it'd work. Let me test it
> > and get back to you.
>
> It worked well but contained a typo. See below.

Ha, typing hard ;-) I did say it hadn't been near a compiler ...

> Which way do you want to process this change? Do I send it again
> with your S-o-b or will you apply it by yourself? Either is fine, just
> let me know. In case of latter, you can add

I'll go write me a Changelog and apply the thing, then we can forget
about things.

2023-10-11 09:50:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Wed, Oct 11, 2023 at 09:51:36AM +0200, Peter Zijlstra wrote:

> I'll go write me a Changelog and apply the thing, then we can forget
> about things.

I pushed out:

https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=perf/core&id=52ecef05348dc97e3a5121f7cc0dd08e340b870c

for the robots to chew on, will push out to tip if nobody complains.

2023-10-11 16:03:45

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH] perf/core: Introduce cpuctx->cgrp_ctx_list

On Wed, Oct 11, 2023 at 2:50 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Oct 11, 2023 at 09:51:36AM +0200, Peter Zijlstra wrote:
>
> > I'll go write me a Changelog and apply the thing, then we can forget
> > about things.
>
> I pushed out:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=perf/core&id=52ecef05348dc97e3a5121f7cc0dd08e340b870c
>
> for the robots to chew on, will push out to tip if nobody complains.

Thanks a lot!
Namhyung

Subject: [tip: perf/core] perf: Optimize perf_cgroup_switch()

The following commit has been merged into the perf/core branch of tip:

Commit-ID: f06cc667f79909e9175460b167c277b7c64d3df0
Gitweb: https://git.kernel.org/tip/f06cc667f79909e9175460b167c277b7c64d3df0
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 09 Oct 2023 23:04:25 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 12 Oct 2023 19:28:38 +02:00

perf: Optimize perf_cgroup_switch()

Namhyung reported that bd2756811766 ("perf: Rewrite core context handling")
regresses context switch overhead when perf-cgroup is in use together
with 'slow' PMUs like uncore.

Specifically, perf_cgroup_switch()'s perf_ctx_disable() /
ctx_sched_out() etc.. all iterate the full list of active PMUs for
that CPU, even if they don't have cgroup events.

Previously there was cgrp_cpuctx_list which linked the relevant PMUs
together, but that got lost in the rework. Instead of re-introducing
a similar list, let the perf_event_pmu_context iteration skip those
that do not have cgroup events. This avoids growing multiple versions
of the perf_event_pmu_context iteration.

Measured performance (on a slightly different patch):

Before)

$ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes

Total time: 0.901 [sec]

90.128700 usecs/op
11095 ops/sec

After)

$ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
# Running 'sched/pipe' benchmark:
# Executed 10000 pipe operations between two processes

Total time: 0.065 [sec]

6.560100 usecs/op
152436 ops/sec

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Reported-by: Namhyung Kim <[email protected]>
Debugged-by: Namhyung Kim <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/perf_event.h | 1 +-
kernel/events/core.c | 115 ++++++++++++++++++------------------
2 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f31f962..0367d74 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
unsigned int embedded : 1;

unsigned int nr_events;
+ unsigned int nr_cgroups;

atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 708d474..3eb26c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

@@ -684,20 +685,26 @@ do { \
___p; \
})

-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_disable(pmu_ctx->pmu);
+ }
}

-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_enable(pmu_ctx->pmu);
+ }
}

static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);

- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);

- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;

+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;

+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

event_type &= EVENT_ALL;

- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) {
- perf_ctx_disable(task_ctx);
+ perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
}

@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

perf_event_sched_in(cpuctx, task_ctx);

- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
- perf_ctx_enable(task_ctx);
+ perf_ctx_enable(task_ctx, false);
}

void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;

lockdep_assert_held(&ctx->lock);

@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)

is_active ^= ctx->is_active; /* changed bits */

- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
__pmu_ctx_sched_out(pmu_ctx, is_active);
+ }
}

/*
@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {

- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);

/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3543,13 @@ unlock:

if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}

-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
-
- if (pmu) {
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
- }
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
}

-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- int can_add_hw = 1;

- if (pmu) {
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}

-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_flexible_sched_in(ctx, pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}

static void
@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;

lockdep_assert_held(&ctx->lock);

@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);

/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}

static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)

if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);

perf_ctx_sched_task_cb(ctx, true);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;

- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
}

@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);

- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);

unlock:
perf_ctx_unlock(cpuctx, ctx);