2022-03-22 14:36:21

by Chengming Zhou

Subject: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

Although we don't have the inconsistency problem any more, we can
still hit another problem:

CPU1                                        CPU2
(in context_switch)                         (attach running task)
                                            prev->cgroups = cgrp2
perf_cgroup_sched_switch(prev, next)
    cgrp2 == cgrp2 is True

If the perf_cgroup of the prev task changes from cgrp1 to cgrp2,
perf_cgroup_sched_switch() will skip perf_cgroup_switch(), so the
CPU would still schedule the cgrp1 events when it should be
scheduling the cgrp2 events.

The root cause is that we shouldn't use the changeable
prev->cgroups to decide whether to skip perf_cgroup_switch().

This patch introduces a percpu perf_cgroup to cache the perf_cgroup
that was scheduled onto the cpuctxes, which is later compared with
the perf_cgroup of the next task to decide whether to skip
perf_cgroup_switch().

Since perf_cgroup_switch() can be called after the context switch,
the cgroup events might already be scheduled in. So we put the
comparison of perf_cgroups in perf_cgroup_switch(), and delete the
now-unused function perf_cgroup_sched_switch().

We must clear the percpu perf_cgroup cache when the last cgroup
event is disabled.

Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <[email protected]>
---
kernel/events/core.c | 63 ++++++++++++++++----------------------------
1 file changed, 22 insertions(+), 41 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8b5cf2aedfe6..848a3bfa9513 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
}
}

+static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroup);
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

/*
@@ -833,6 +834,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cgroup *cgrp;
struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
local_irq_save(flags);

+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (cgrp == __this_cpu_read(cpu_perf_cgroup))
+ goto out;
+
+ __this_cpu_write(cpu_perf_cgroup, cgrp);
+
list = this_cpu_ptr(&cgrp_cpuctx_list);
list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ if (cpuctx->cgrp == cgrp)
+ continue;
+
perf_pmu_disable(cpuctx->ctx.pmu);

cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -855,14 +867,11 @@ static void perf_cgroup_switch(struct task_struct *task)
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
+ cpuctx->cgrp = cgrp;
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgroup events are only per-cpu
*/
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);

@@ -870,35 +879,10 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

+out:
local_irq_restore(flags);
}

-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
-
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task);
-
- rcu_read_unlock();
-}
-
static int perf_cgroup_ensure_storage(struct perf_event *event,
struct cgroup_subsys_state *css)
{
@@ -1035,6 +1019,9 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
cpuctx->cgrp = NULL;

list_del(&cpuctx->cgrp_cpuctx_entry);
+
+ if (list_empty(per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)))
+ __this_cpu_write(cpu_perf_cgroup, NULL);
}

#else /* !CONFIG_CGROUP_PERF */
@@ -1062,11 +1049,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
{
}

-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -1080,11 +1062,6 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
}

-static inline void
-perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
@@ -1104,6 +1081,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif

/*
@@ -3625,7 +3606,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_switch(task, next);
+ perf_cgroup_switch(next);
}

/*
--
2.20.1


2022-03-22 15:20:36

by Peter Zijlstra

Subject: Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
> Although we don't have the inconsistency problem any more, we can
> still hit another problem:
>
> CPU1                                        CPU2
> (in context_switch)                         (attach running task)
>                                             prev->cgroups = cgrp2
> perf_cgroup_sched_switch(prev, next)
>     cgrp2 == cgrp2 is True
>

Again, I'm not following, how can you attach to a running task from
another CPU?

2022-03-22 22:58:38

by Chengming Zhou

Subject: Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On 2022/3/22 9:01 PM, Peter Zijlstra wrote:
> On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
>> Although we don't have the inconsistency problem any more, we can
>> still hit another problem:
>>
>> CPU1                                        CPU2
>> (in context_switch)                         (attach running task)
>>                                             prev->cgroups = cgrp2
>> perf_cgroup_sched_switch(prev, next)
>>     cgrp2 == cgrp2 is True
>>
>
> Again, I'm not following, how can you attach to a running task from
> another CPU?

Hi Peter, I made a little testcase which can reproduce the race
problem on a system with PSI disabled, because when PSI is enabled,
cgroup_move_task() holds the rq lock while assigning task->cgroups.
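
For reference, a condensed sketch of that path (paraphrased from
kernel/sched/psi.c around v5.17; an illustration with details elided,
not the verbatim kernel source):

```
/* paraphrased sketch of cgroup_move_task() in kernel/sched/psi.c */
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
        struct rq_flags rf;
        struct rq *rq;

        if (static_branch_likely(&psi_disabled)) {
                /*
                 * No PSI state to maintain: the cgroup pointer is
                 * switched without holding the rq lock. This is the
                 * window the testcase below races with.
                 */
                rcu_assign_pointer(task->cgroups, to);
                return;
        }

        /* with PSI enabled, the move is serialized against the scheduler */
        rq = task_rq_lock(task, &rf);
        /* ... PSI task-state accounting elided ... */
        rcu_assign_pointer(task->cgroups, to);
        task_rq_unlock(rq, task, &rf);
}
```

The testcase itself: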

```
#!/bin/bash

cd /sys/fs/cgroup/perf_event

mkdir cg1
mkdir cg2

perf stat -e cycles --cgroup /cg1 &

cg_run()
{
    cg=$1
    shift
    echo $BASHPID > $cg/cgroup.procs
    $@
}

cg_run cg1 schbench -r 100 &
cg_run cg2 schbench -r 100 &

while true; do
    for i in $(cat cg1/cgroup.procs); do
        echo $i > cg2/cgroup.procs
    done
    for i in $(cat cg2/cgroup.procs); do
        echo $i > cg1/cgroup.procs
    done
done
```

Some seconds later, dmesg will show the WARNING message:

[ 51.777830] WARNING: CPU: 2 PID: 1849 at kernel/events/core.c:869 perf_cgroup_switch+0x246/0x290
[ 51.779167] Modules linked in:
[ 51.779696] CPU: 2 PID: 1849 Comm: schbench Not tainted 5.17.0-rc8 #28
[ 51.780691] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[ 51.782353] RIP: 0010:perf_cgroup_switch+0x246/0x290
[ 51.783145] Code: 0f 0b e9 0b ff ff ff 48 83 7c 24 08 00 74 0c e8 00 7e f7 ff fb 66 0f 1f 44 00 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b e9 4f fe ff ff e8 be 7e f7 ff e8 a9 1f 93 00 89 c0 49 c7 c5
[ 51.785804] RSP: 0018:ffffba4440fcbd80 EFLAGS: 00010086
[ 51.786617] RAX: 0000000000000002 RBX: ffff8d78eb8b7200 RCX: 0000000000000000
[ 51.787696] RDX: 0000000000000000 RSI: ffffffffae1c83db RDI: ffffffffae185a69
[ 51.788777] RBP: ffff8d78eb8aad40 R08: 0000000000000001 R09: 0000000000000001
[ 51.789854] R10: 0000000000000000 R11: ffff8d78eb8b7220 R12: ffff8d78eb8b7208
[ 51.790929] R13: ffff8d78eb8aafa0 R14: ffff8d74cd6bb600 R15: 0000000000000000
[ 51.792006] FS: 00007fedaaffd700(0000) GS:ffff8d78eb880000(0000) knlGS:0000000000000000
[ 51.793223] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 51.794122] CR2: 000055e4bf2b696c CR3: 00000001128a2003 CR4: 0000000000370ee0
[ 51.795209] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 51.796292] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 51.797375] Call Trace:
[ 51.797828] <TASK>
[ 51.798229] __perf_event_task_sched_in+0x151/0x350
[ 51.799009] ? lock_release+0x1ed/0x2e0
[ 51.799640] finish_task_switch+0x1d3/0x2e0
[ 51.800328] ? __switch_to+0x136/0x4b0
[ 51.800953] __schedule+0x33e/0xae0
[ 51.801535] schedule+0x4e/0xc0
[ 51.802080] exit_to_user_mode_prepare+0x172/0x2a0
[ 51.802850] ? asm_sysvec_apic_timer_interrupt+0xa/0x20
[ 51.803675] irqentry_exit_to_user_mode+0x5/0x40
[ 51.804413] sysvec_apic_timer_interrupt+0x5c/0xd0
[ 51.805183] asm_sysvec_apic_timer_interrupt+0x12/0x20

Thanks.

2022-03-23 12:36:17

by Namhyung Kim

Subject: Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Tue, Mar 22, 2022 at 6:01 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
> > Although we don't have the inconsistency problem any more, we can
> > still hit another problem:
> >
> > CPU1                                        CPU2
> > (in context_switch)                         (attach running task)
> >                                             prev->cgroups = cgrp2
> > perf_cgroup_sched_switch(prev, next)
> >     cgrp2 == cgrp2 is True
> >
>
> Again, I'm not following, how can you attach to a running task from
> > another CPU?

I think it's been supported from the beginning, by writing a PID to
a file in cgroupfs.
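
For example (illustrative, using the cgroup v1 perf_event hierarchy
from the testcase earlier in the thread):

```
# move a task that may be running on another CPU into cg2
echo $pid > /sys/fs/cgroup/perf_event/cg2/cgroup.procs
```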

Thanks,
Namhyung

2022-03-23 14:46:08

by Peter Zijlstra

Subject: Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Wed, Mar 23, 2022 at 12:33:51AM +0800, Chengming Zhou wrote:
> On 2022/3/22 9:01 PM, Peter Zijlstra wrote:
> > On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
> >> Although we don't have the inconsistency problem any more, we can
> >> still hit another problem:
> >>
> >> CPU1                                        CPU2
> >> (in context_switch)                         (attach running task)
> >>                                             prev->cgroups = cgrp2
> >> perf_cgroup_sched_switch(prev, next)
> >>     cgrp2 == cgrp2 is True
> >>
> >
> > Again, I'm not following, how can you attach to a running task from
> > another CPU?
>
> Hi Peter, I made a little testcase which can reproduce the race
> problem on a system with PSI disabled, because when PSI is enabled,
> cgroup_move_task() holds the rq lock while assigning task->cgroups.

No, the problem is that you're talking about cgroup attach while I'm
thinking of attaching an event to a task. And your picture does
nothing to clarify that.

Those pictures of yours could really do with a few more function names
in them; otherwise it's absolutely unclear what code is running where.

2022-03-23 16:27:21

by Namhyung Kim

Subject: Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Tue, Mar 22, 2022 at 5:10 AM Chengming Zhou
<[email protected]> wrote:
>
> Although we don't have the inconsistency problem any more, we can
> still hit another problem:
>
> CPU1                                        CPU2
> (in context_switch)                         (attach running task)
>                                             prev->cgroups = cgrp2
> perf_cgroup_sched_switch(prev, next)
>     cgrp2 == cgrp2 is True
>
> If the perf_cgroup of the prev task changes from cgrp1 to cgrp2,
> perf_cgroup_sched_switch() will skip perf_cgroup_switch(), so the
> CPU would still schedule the cgrp1 events when it should be
> scheduling the cgrp2 events.

Ah ok, now I see the problem in changing prev->cgroup too.

>
> The root cause is that we shouldn't use the changeable
> prev->cgroups to decide whether to skip perf_cgroup_switch().
>
> This patch introduces a percpu perf_cgroup to cache the perf_cgroup
> that was scheduled onto the cpuctxes, which is later compared with
> the perf_cgroup of the next task to decide whether to skip
> perf_cgroup_switch().
>
> Since perf_cgroup_switch() can be called after the context switch,
> the cgroup events might already be scheduled in. So we put the
> comparison of perf_cgroups in perf_cgroup_switch(), and delete the
> now-unused function perf_cgroup_sched_switch().
>
> We must clear the percpu perf_cgroup cache when the last cgroup
> event is disabled.
>
> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
> Signed-off-by: Chengming Zhou <[email protected]>
> ---
> kernel/events/core.c | 63 ++++++++++++++++----------------------------
> 1 file changed, 22 insertions(+), 41 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 8b5cf2aedfe6..848a3bfa9513 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
> }
> }
>
> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroup);
> static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>
> /*
> @@ -833,6 +834,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
> */
> static void perf_cgroup_switch(struct task_struct *task)
> {
> + struct perf_cgroup *cgrp;
> struct perf_cpu_context *cpuctx, *tmp;
> struct list_head *list;
> unsigned long flags;
> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
> */
> local_irq_save(flags);
>
> + cgrp = perf_cgroup_from_task(task, NULL);
> + if (cgrp == __this_cpu_read(cpu_perf_cgroup))
> + goto out;
> +
> + __this_cpu_write(cpu_perf_cgroup, cgrp);
> +
> list = this_cpu_ptr(&cgrp_cpuctx_list);
> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (cpuctx->cgrp == cgrp)

Missing perf_ctx_unlock().

Thanks,
Namhyung

> + continue;
> +
> perf_pmu_disable(cpuctx->ctx.pmu);
>
> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> @@ -855,14 +867,11 @@ static void perf_cgroup_switch(struct task_struct *task)
> * must not be done before ctxswout due
> * to event_filter_match() in event_sched_out()
> */
> - cpuctx->cgrp = perf_cgroup_from_task(task,
> - &cpuctx->ctx);
> + cpuctx->cgrp = cgrp;
> /*
> * set cgrp before ctxsw in to allow
> * event_filter_match() to not have to pass
> * task around
> - * we pass the cpuctx->ctx to perf_cgroup_from_task()
> - * because cgroup events are only per-cpu
> */
> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>
> @@ -870,35 +879,10 @@ static void perf_cgroup_switch(struct task_struct *task)
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
>
> +out:
> local_irq_restore(flags);
> }

2022-03-24 23:54:01

by Peter Zijlstra

Subject: Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 8b5cf2aedfe6..848a3bfa9513 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c

> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
> */
> local_irq_save(flags);
>
> + cgrp = perf_cgroup_from_task(task, NULL);
> + if (cgrp == __this_cpu_read(cpu_perf_cgroup))
> + goto out;
> +
> + __this_cpu_write(cpu_perf_cgroup, cgrp);
> +
> list = this_cpu_ptr(&cgrp_cpuctx_list);
> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (cpuctx->cgrp == cgrp)
> + continue;
> +
> perf_pmu_disable(cpuctx->ctx.pmu);
>
> cpu_ctx_sched_out(cpuctx, EVENT_ALL);

This is just straight up broken... you can't continue after taking a
lock; that'll miss the unlock.

Also, I really don't see the point of cpu_perf_cgroup; cpuctx->cgrp
should be able to do this.

Also, perhaps merge this into the previous patch; I'm not sure it
makes sense to split this out.

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -833,6 +833,7 @@ static DEFINE_PER_CPU(struct list_head,
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cgroup *cgrp;
struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -843,11 +844,20 @@ static void perf_cgroup_switch(struct ta
*/
local_irq_save(flags);

+ cgrp = perf_cgroup_from_task(task, NULL);
+
list = this_cpu_ptr(&cgrp_cpuctx_list);
list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ continue;
+
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ if (cpuctx->cgrp == cgrp)
+ goto next;
+
perf_pmu_disable(cpuctx->ctx.pmu);

cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -855,50 +865,22 @@ static void perf_cgroup_switch(struct ta
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
+ WRITE_ONCE(cpuctx->cgrp, cgrp);
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgroup events are only per-cpu
*/
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);

perf_pmu_enable(cpuctx->ctx.pmu);
+next:
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

local_irq_restore(flags);
}

-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
-
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task);
-
- rcu_read_unlock();
-}
-
static int perf_cgroup_ensure_storage(struct perf_event *event,
struct cgroup_subsys_state *css)
{
@@ -1062,11 +1044,6 @@ static inline void update_cgrp_time_from
{
}

-static inline void perf_cgroup_sched_switch(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -1080,11 +1057,6 @@ perf_cgroup_set_timestamp(struct task_st
{
}

-static inline void
-perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
@@ -1104,6 +1076,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif

/*
@@ -3625,7 +3601,7 @@ void __perf_event_task_sched_out(struct
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_switch(task, next);
+ perf_cgroup_switch(next);
}

/*

2022-03-24 23:54:48

by Chengming Zhou

Subject: Re: [Phishing Risk] [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

Hi Namhyung,

On 2022/3/23 6:18 AM, Namhyung Kim wrote:
> On Tue, Mar 22, 2022 at 5:10 AM Chengming Zhou
> <[email protected]> wrote:
>>
>> Although we don't have the inconsistency problem any more, we can
>> still hit another problem:
>>
>> CPU1                                        CPU2
>> (in context_switch)                         (attach running task)
>>                                             prev->cgroups = cgrp2
>> perf_cgroup_sched_switch(prev, next)
>>     cgrp2 == cgrp2 is True
>>
>> If the perf_cgroup of the prev task changes from cgrp1 to cgrp2,
>> perf_cgroup_sched_switch() will skip perf_cgroup_switch(), so the
>> CPU would still schedule the cgrp1 events when it should be
>> scheduling the cgrp2 events.
>
> Ah ok, now I see the problem in changing prev->cgroup too.
>
>>
>> The root cause is that we shouldn't use the changeable
>> prev->cgroups to decide whether to skip perf_cgroup_switch().
>>
>> This patch introduces a percpu perf_cgroup to cache the perf_cgroup
>> that was scheduled onto the cpuctxes, which is later compared with
>> the perf_cgroup of the next task to decide whether to skip
>> perf_cgroup_switch().
>>
>> Since perf_cgroup_switch() can be called after the context switch,
>> the cgroup events might already be scheduled in. So we put the
>> comparison of perf_cgroups in perf_cgroup_switch(), and delete the
>> now-unused function perf_cgroup_sched_switch().
>>
>> We must clear the percpu perf_cgroup cache when the last cgroup
>> event is disabled.
>>
>> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
>> Signed-off-by: Chengming Zhou <[email protected]>
>> ---
>> kernel/events/core.c | 63 ++++++++++++++++----------------------------
>> 1 file changed, 22 insertions(+), 41 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 8b5cf2aedfe6..848a3bfa9513 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>> }
>> }
>>
>> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroup);
>> static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>>
>> /*
>> @@ -833,6 +834,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>> */
>> static void perf_cgroup_switch(struct task_struct *task)
>> {
>> + struct perf_cgroup *cgrp;
>> struct perf_cpu_context *cpuctx, *tmp;
>> struct list_head *list;
>> unsigned long flags;
>> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
>> */
>> local_irq_save(flags);
>>
>> + cgrp = perf_cgroup_from_task(task, NULL);
>> + if (cgrp == __this_cpu_read(cpu_perf_cgroup))
>> + goto out;
>> +
>> + __this_cpu_write(cpu_perf_cgroup, cgrp);
>> +
>> list = this_cpu_ptr(&cgrp_cpuctx_list);
>> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>>
>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> + if (cpuctx->cgrp == cgrp)
>
> Missing perf_ctx_unlock().

Thank you, will fix in the next version.

>
> Thanks,
> Namhyung
>
>> + continue;
>> +
>> perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> @@ -855,14 +867,11 @@ static void perf_cgroup_switch(struct task_struct *task)
>> * must not be done before ctxswout due
>> * to event_filter_match() in event_sched_out()
>> */
>> - cpuctx->cgrp = perf_cgroup_from_task(task,
>> - &cpuctx->ctx);
>> + cpuctx->cgrp = cgrp;
>> /*
>> * set cgrp before ctxsw in to allow
>> * event_filter_match() to not have to pass
>> * task around
>> - * we pass the cpuctx->ctx to perf_cgroup_from_task()
>> - * because cgroup events are only per-cpu
>> */
>> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>>
>> @@ -870,35 +879,10 @@ static void perf_cgroup_switch(struct task_struct *task)
>> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> }
>>
>> +out:
>> local_irq_restore(flags);
>> }

2022-03-25 00:49:02

by Chengming Zhou

[permalink] [raw]
Subject: Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On 2022/3/23 8:51 PM, Peter Zijlstra wrote:
> On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 8b5cf2aedfe6..848a3bfa9513 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>
>> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
>> */
>> local_irq_save(flags);
>>
>> + cgrp = perf_cgroup_from_task(task, NULL);
>> + if (cgrp == __this_cpu_read(cpu_perf_cgroup))
>> + goto out;
>> +
>> + __this_cpu_write(cpu_perf_cgroup, cgrp);
>> +
>> list = this_cpu_ptr(&cgrp_cpuctx_list);
>> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>>
>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> + if (cpuctx->cgrp == cgrp)
>> + continue;
>> +
>> perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>
> This is just straight up broken... you can't continue after taking a
> lock; that'll miss the unlock.

Yes, Namhyung pointed it out; I will fix it in the next version.

>
> Also, I really don't see the point of cpu_perf_cgroup; cpuctx->cgrp
> should be able to do this.

But the problem is that we have two cpuctx on the percpu list. Do you
mean we should use the perf_cgroup of the first cpuctx to compare with
the next task's perf_cgroup?

Or should we delete the cgrp in cpuctx and use this new percpu
cpu_perf_cgroup?

>
> Also, perhaps merge this into the previous patch; I'm not sure it
> makes sense to split this out.

Ok, will do. I had it in one patch in the v1 RFC, then split it out
for easier review. I will put it back together in the next version.

Thanks.

>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -833,6 +833,7 @@ static DEFINE_PER_CPU(struct list_head,
> */
> static void perf_cgroup_switch(struct task_struct *task)
> {
> + struct perf_cgroup *cgrp;
> struct perf_cpu_context *cpuctx, *tmp;
> struct list_head *list;
> unsigned long flags;
> @@ -843,11 +844,20 @@ static void perf_cgroup_switch(struct ta
> */
> local_irq_save(flags);
>
> + cgrp = perf_cgroup_from_task(task, NULL);
> +
> list = this_cpu_ptr(&cgrp_cpuctx_list);
> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>
> + if (READ_ONCE(cpuctx->cgrp) == cgrp)
> + continue;
> +
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (cpuctx->cgrp == cgrp)
> + goto next;
> +
> perf_pmu_disable(cpuctx->ctx.pmu);
>
> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> @@ -855,50 +865,22 @@ static void perf_cgroup_switch(struct ta
> * must not be done before ctxswout due
> * to event_filter_match() in event_sched_out()
> */
> - cpuctx->cgrp = perf_cgroup_from_task(task,
> - &cpuctx->ctx);
> + WRITE_ONCE(cpuctx->cgrp, cgrp);
> /*
> * set cgrp before ctxsw in to allow
> * event_filter_match() to not have to pass
> * task around
> - * we pass the cpuctx->ctx to perf_cgroup_from_task()
> - * because cgroup events are only per-cpu
> */
> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>
> perf_pmu_enable(cpuctx->ctx.pmu);
> +next:
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
>
> local_irq_restore(flags);
> }
>
> -static inline void perf_cgroup_sched_switch(struct task_struct *task,
> - struct task_struct *next)
> -{
> - struct perf_cgroup *cgrp1;
> - struct perf_cgroup *cgrp2 = NULL;
> -
> - rcu_read_lock();
> - /*
> - * we come here when we know perf_cgroup_events > 0
> - * we do not need to pass the ctx here because we know
> - * we are holding the rcu lock
> - */
> - cgrp1 = perf_cgroup_from_task(task, NULL);
> - cgrp2 = perf_cgroup_from_task(next, NULL);
> -
> - /*
> - * only schedule out current cgroup events if we know
> - * that we are switching to a different cgroup. Otherwise,
> - * do no touch the cgroup events.
> - */
> - if (cgrp1 != cgrp2)
> - perf_cgroup_switch(task);
> -
> - rcu_read_unlock();
> -}
> -
> static int perf_cgroup_ensure_storage(struct perf_event *event,
> struct cgroup_subsys_state *css)
> {
> @@ -1062,11 +1044,6 @@ static inline void update_cgrp_time_from
> {
> }
>
> -static inline void perf_cgroup_sched_switch(struct task_struct *task,
> - struct task_struct *next)
> -{
> -}
> -
> static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
> struct perf_event_attr *attr,
> struct perf_event *group_leader)
> @@ -1080,11 +1057,6 @@ perf_cgroup_set_timestamp(struct task_st
> {
> }
>
> -static inline void
> -perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
> -{
> -}
> -
> static inline u64 perf_cgroup_event_time(struct perf_event *event)
> {
> return 0;
> @@ -1104,6 +1076,10 @@ static inline void
> perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
> {
> }
> +
> +static void perf_cgroup_switch(struct task_struct *task)
> +{
> +}
> #endif
>
> /*
> @@ -3625,7 +3601,7 @@ void __perf_event_task_sched_out(struct
> * cgroup event are system-wide mode only
> */
> if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
> - perf_cgroup_sched_switch(task, next);
> + perf_cgroup_switch(next);
> }
>
> /*

2022-03-25 06:19:38

by Peter Zijlstra

Subject: Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On Wed, Mar 23, 2022 at 09:07:01PM +0800, Chengming Zhou wrote:
> On 2022/3/23 8:51 PM, Peter Zijlstra wrote:
> > On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
> >
> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
> >> index 8b5cf2aedfe6..848a3bfa9513 100644
> >> --- a/kernel/events/core.c
> >> +++ b/kernel/events/core.c
> >
> >> @@ -843,11 +845,21 @@ static void perf_cgroup_switch(struct task_struct *task)
> >> */
> >> local_irq_save(flags);
> >>
> >> + cgrp = perf_cgroup_from_task(task, NULL);
> >> + if (cgrp == __this_cpu_read(cpu_perf_cgroup))
> >> + goto out;

So this compares the cpu thing against the task thing; if they match,
we bail.

> >> +
> >> + __this_cpu_write(cpu_perf_cgroup, cgrp);

Then we set the cpu thing.

> >> +
> >> list = this_cpu_ptr(&cgrp_cpuctx_list);
> >> list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> >> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
> >>
> >> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> >> +
> >> + if (cpuctx->cgrp == cgrp)
> >> + continue;
> >> +
> >> perf_pmu_disable(cpuctx->ctx.pmu);
> >>
> >> cpu_ctx_sched_out(cpuctx, EVENT_ALL);

> >> + cpuctx->cgrp = cgrp;

But here we already have exactly the same pattern: we match the cpuctx
thing against the task thing and skip/set etc.

> > Also, I really don't see the point of cpu_perf_cgroup; cpuctx->cgrp
> > should be able to do this.
>
> But the problem is that we have two cpuctx on the percpu list. Do you
> mean we should use the perf_cgroup of the first cpuctx to compare with
> the next task's perf_cgroup?
>
> Or should we delete the cgrp in cpuctx and use this new percpu
> cpu_perf_cgroup?

I'm a bit confused; per the above, you already do exactly what the new
cpu_perf_cgroup does on the cpuctx->cgrp variable. AFAICT the only
thing the new per-cpu variable does is avoid taking a lock, however:


> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -833,6 +833,7 @@ static DEFINE_PER_CPU(struct list_head,
> > */
> > static void perf_cgroup_switch(struct task_struct *task)
> > {
> > + struct perf_cgroup *cgrp;
> > struct perf_cpu_context *cpuctx, *tmp;
> > struct list_head *list;
> > unsigned long flags;
> > @@ -843,11 +844,20 @@ static void perf_cgroup_switch(struct ta
> > */
> > local_irq_save(flags);
> >
> > + cgrp = perf_cgroup_from_task(task, NULL);
> > +
> > list = this_cpu_ptr(&cgrp_cpuctx_list);
> > list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> > WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
> >
> > + if (READ_ONCE(cpuctx->cgrp) == cgrp)
> > + continue;

I think we can avoid that by doing an early check, hmm?

> > +
> > perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> > +
> > + if (cpuctx->cgrp == cgrp)
> > + goto next;
> > +
> > perf_pmu_disable(cpuctx->ctx.pmu);
> >
> > cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> > @@ -855,50 +865,22 @@ static void perf_cgroup_switch(struct ta
> > * must not be done before ctxswout due
> > * to event_filter_match() in event_sched_out()
> > */
> > - cpuctx->cgrp = perf_cgroup_from_task(task,
> > - &cpuctx->ctx);
> > + WRITE_ONCE(cpuctx->cgrp, cgrp);
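
Put together, the suggestion is the usual double-checked pattern; a
condensed sketch of the proposed flow (illustrative, not the final
patch):

```
static void perf_cgroup_switch(struct task_struct *task)
{
        struct perf_cgroup *cgrp = perf_cgroup_from_task(task, NULL);
        struct perf_cpu_context *cpuctx, *tmp;
        struct list_head *list;
        unsigned long flags;

        local_irq_save(flags);

        list = this_cpu_ptr(&cgrp_cpuctx_list);
        list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
                WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

                /* lockless early check: skip already-matching contexts */
                if (READ_ONCE(cpuctx->cgrp) == cgrp)
                        continue;

                perf_ctx_lock(cpuctx, cpuctx->task_ctx);

                /* re-check under the lock before doing any real work */
                if (cpuctx->cgrp == cgrp)
                        goto next;

                perf_pmu_disable(cpuctx->ctx.pmu);
                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                /* pairs with the READ_ONCE() in the early check above */
                WRITE_ONCE(cpuctx->cgrp, cgrp);
                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                perf_pmu_enable(cpuctx->ctx.pmu);
next:
                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }

        local_irq_restore(flags);
}
```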

2022-03-25 19:57:33

by Chengming Zhou

Subject: Re: [External] Re: [PATCH v2 2/6] perf/core: Introduce percpu perf_cgroup

On 2022/3/23 4:13 PM, Peter Zijlstra wrote:
> On Wed, Mar 23, 2022 at 12:33:51AM +0800, Chengming Zhou wrote:
>> On 2022/3/22 9:01 PM, Peter Zijlstra wrote:
>>> On Tue, Mar 22, 2022 at 08:08:30PM +0800, Chengming Zhou wrote:
>>>> Although we don't have the inconsistency problem any more, we can
>>>> still hit another problem:
>>>>
>>>> CPU1                                        CPU2
>>>> (in context_switch)                         (attach running task)
>>>>                                             prev->cgroups = cgrp2
>>>> perf_cgroup_sched_switch(prev, next)
>>>>     cgrp2 == cgrp2 is True
>>>>
>>>
>>> Again, I'm not following, how can you attach to a running task from
>>> another CPU?
>>
>> Hi Peter, I made a little testcase which can reproduce the race
>> problem on a system with PSI disabled, because when PSI is enabled,
>> cgroup_move_task() holds the rq lock while assigning task->cgroups.
>
> No, the problem is that you're talking about cgroup attach while I'm
> thinking of attaching an event to a task. And your picture does
> nothing to clarify that.
>
> Those pictures of yours could really do with a few more function names
> in them; otherwise it's absolutely unclear what code is running where.

Sorry for the confusion ;-)
I will draw a better picture including more function names in the next version.

Thanks.