2020-12-02 15:06:37

by Namhyung Kim

Subject: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

This patch adds a new software event to count context switches that
also switch cgroups. It is counted only when the cgroups of the
previous and next tasks differ.

One could argue that this can already be done with the existing
sched_switch tracepoint and eBPF, but some systems do not have eBPF
available, so I'd like to add this as a simple alternative.
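
For reference, here is a minimal user-space sketch (not part of this
patch, error handling trimmed) that reads the new counter on CPU 0.
It assumes a header carrying the new PERF_COUNT_SW_CGROUP_SWITCHES id
from this series and enough privilege to open a CPU-wide event:

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  int main(void)
  {
          struct perf_event_attr attr;
          uint64_t count;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size   = sizeof(attr);
          attr.type   = PERF_TYPE_SOFTWARE;
          attr.config = PERF_COUNT_SW_CGROUP_SWITCHES;

          /* pid = -1, cpu = 0: CPU-wide counting on CPU 0;
           * counting starts immediately (attr.disabled == 0) */
          fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          sleep(1);
          if (read(fd, &count, sizeof(count)) == sizeof(count))
                  printf("cgroup-switches: %llu\n",
                         (unsigned long long)count);
          close(fd);
          return 0;
  }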

Signed-off-by: Namhyung Kim <[email protected]>
---
include/linux/perf_event.h | 22 ++++++++++++++++++++++
include/uapi/linux/perf_event.h | 1 +
2 files changed, 23 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9a38f579bc76..d6dec422ba03 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1185,6 +1185,27 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
}
}

+#ifdef CONFIG_CGROUP_PERF
+static inline void
+perf_sw_event_cgroup_switch(struct task_struct *prev, struct task_struct *next)
+{
+ struct cgroup *prev_cgrp, *next_cgrp;
+
+ rcu_read_lock();
+
+ prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
+ next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
+
+ if (prev_cgrp != next_cgrp)
+ perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
+
+ rcu_read_unlock();
+}
+#else
+static inline void perf_sw_event_cgroup_switch(struct task_struct *prev,
+ struct task_struct *next) {}
+#endif /* CONFIG_CGROUP_PERF */
+
extern struct static_key_false perf_sched_events;

static __always_inline bool
@@ -1220,6 +1241,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
struct task_struct *next)
{
perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
+ perf_sw_event_cgroup_switch(prev, next);

if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_out(prev, next);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b15e3447cd9f..16b9538ad89b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT = 10,
+ PERF_COUNT_SW_CGROUP_SWITCHES = 11,

PERF_COUNT_SW_MAX, /* non-ABI */
};
--
2.29.2.454.gaff20da3a2-goog


2020-12-02 15:07:00

by Namhyung Kim

Subject: [RFC 2/2] perf tools: Add 'cgroup-switches' software event

It counts how often the cgroup actually changes across context
switches.

# perf stat -a -e context-switches,cgroup-switches sleep 1

Performance counter stats for 'system wide':

11,267 context-switches
10,950 cgroup-switches

1.015634369 seconds time elapsed

Signed-off-by: Namhyung Kim <[email protected]>
---
tools/include/uapi/linux/perf_event.h | 1 +
tools/perf/util/parse-events.c | 4 ++++
tools/perf/util/parse-events.l | 1 +
3 files changed, 6 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b95d3c485d27..16559703c49c 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT = 10,
+ PERF_COUNT_SW_CGROUP_SWITCHES = 11,

PERF_COUNT_SW_MAX, /* non-ABI */
};
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 3b273580fb84..f6a5a099e143 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -145,6 +145,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias = "",
},
+ [PERF_COUNT_SW_CGROUP_SWITCHES] = {
+ .symbol = "cgroup-switches",
+ .alias = "",
+ },
};

#define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 9db5097317f4..88f203bb6fab 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -347,6 +347,7 @@ emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EM
dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
duration_time { return tool(yyscanner, PERF_TOOL_DURATION_TIME); }
bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); }

/*
* We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately.
--
2.29.2.454.gaff20da3a2-goog

2020-12-02 16:25:19

by Peter Zijlstra

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Thu, Dec 03, 2020 at 12:02:04AM +0900, Namhyung Kim wrote:

> +#ifdef CONFIG_CGROUP_PERF
> +static inline void
> +perf_sw_event_cgroup_switch(struct task_struct *prev, struct task_struct *next)
> +{
> + struct cgroup *prev_cgrp, *next_cgrp;
> +
> + rcu_read_lock();
> +
> + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> +
> + if (prev_cgrp != next_cgrp)
> + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> +
> + rcu_read_unlock();
> +}
> +#else
> +static inline void perf_sw_event_cgroup_switch(struct task_struct *prev,
> + struct task_struct *next) {}
> +#endif /* CONFIG_CGROUP_PERF */
> +
> extern struct static_key_false perf_sched_events;
>
> static __always_inline bool
> @@ -1220,6 +1241,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
> struct task_struct *next)
> {
> perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> + perf_sw_event_cgroup_switch(prev, next);
>
> if (static_branch_unlikely(&perf_sched_events))
> __perf_event_task_sched_out(prev, next);

Urgh.. that's horrible, try something like this.

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9a38f579bc76..5eb284819ee5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1174,25 +1174,19 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
* which is guaranteed by us not actually scheduling inside other swevents
* because those disable preemption.
*/
-static __always_inline void
-perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
+static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
- if (static_key_false(&perf_swevent_enabled[event_id])) {
- struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+ struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

- perf_fetch_caller_regs(regs);
- ___perf_sw_event(event_id, nr, regs, addr);
- }
+ perf_fetch_caller_regs(regs);
+ ___perf_sw_event(event_id, nr, regs, addr);
}

extern struct static_key_false perf_sched_events;

-static __always_inline bool
-perf_sw_migrate_enabled(void)
+static __always_inline bool __perf_sw_enabled(int swevt)
{
- if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
- return true;
- return false;
+ return static_key_false(&perf_swevent_enabled[swevt]);
}

static inline void perf_event_task_migrate(struct task_struct *task)
@@ -1207,11 +1201,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_in(prev, task);

- if (perf_sw_migrate_enabled() && task->sched_migrated) {
- struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
-
- perf_fetch_caller_regs(regs);
- ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
+ if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
+ task->sched_migrated) {
+ __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
task->sched_migrated = 0;
}
}
@@ -1219,7 +1211,13 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
static inline void perf_event_task_sched_out(struct task_struct *prev,
struct task_struct *next)
{
- perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
+ if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
+ __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
+
+ if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
+ (task_css_check(prev, perf_event_cgrp_id, 1)->cgroup !=
+ task_css_check(next, perf_event_cgrp_id, 1)->cgroup))
+ __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);

if (static_branch_unlikely(&perf_sched_events))
__perf_event_task_sched_out(prev, next);
@@ -1475,8 +1473,6 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { }
static inline void
-perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { }
-static inline void
perf_bp_event(struct perf_event *event, void *data) { }

static inline int perf_register_guest_info_callbacks

2020-12-02 19:32:57

by Andi Kleen

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

> + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> +
> + if (prev_cgrp != next_cgrp)
> + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);

Seems to be the perf cgroup only, not all cgroups.
That's a big difference and needs to be documented properly.

Probably would make sense to have two events for both, one for
all cgroups and one for perf only.
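
For illustration only (the second name is made up, not a concrete
proposal), the uapi enum could then grow something like:

	PERF_COUNT_SW_CGROUP_SWITCHES		= 11,	/* any cgroup change */
	PERF_COUNT_SW_PERF_CGROUP_SWITCHES	= 12,	/* perf_event cgroup only; made-up name */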



-Andi

2020-12-02 19:50:11

by Stephane Eranian

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Wed, Dec 2, 2020 at 11:28 AM Andi Kleen <[email protected]> wrote:
>
> > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > +
> > + if (prev_cgrp != next_cgrp)
> > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
>
> Seems to be the perf cgroup only, not all cgroups.
> That's a big difference and needs to be documented properly.
>
We care about the all-cgroup case.

> Probably would make sense to have two events for both, one for
> all cgroups and one for perf only.
>
>
>
> -Andi

2020-12-02 22:48:18

by Andi Kleen

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Wed, Dec 02, 2020 at 11:47:25AM -0800, Stephane Eranian wrote:
> On Wed, Dec 2, 2020 at 11:28 AM Andi Kleen <[email protected]> wrote:
> >
> > > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > > +
> > > + if (prev_cgrp != next_cgrp)
> > > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> >
> > Seems to be the perf cgroup only, not all cgroups.
> > That's a big difference and needs to be documented properly.
> >
> We care about the all-cgroup case.

Then it's not correct I think. You need a different hook point.

-Andi

2020-12-02 23:44:28

by Stephane Eranian

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Wed, Dec 2, 2020 at 2:42 PM Andi Kleen <[email protected]> wrote:
>
> On Wed, Dec 02, 2020 at 11:47:25AM -0800, Stephane Eranian wrote:
> > On Wed, Dec 2, 2020 at 11:28 AM Andi Kleen <[email protected]> wrote:
> > >
> > > > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > > > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > > > +
> > > > + if (prev_cgrp != next_cgrp)
> > > > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> > >
> > > Seems to be the perf cgroup only, not all cgroups.
> > > That's a big difference and needs to be documented properly.
> > >
> > We care about the all-cgroup case.
>
> Then it's not correct I think. You need a different hook point.
>
I realize that ;-(

2020-12-03 01:08:27

by Namhyung Kim

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

Hi Stephane and Andi,

On Thu, Dec 3, 2020 at 8:40 AM Stephane Eranian <[email protected]> wrote:
>
> On Wed, Dec 2, 2020 at 2:42 PM Andi Kleen <[email protected]> wrote:
> >
> > On Wed, Dec 02, 2020 at 11:47:25AM -0800, Stephane Eranian wrote:
> > > On Wed, Dec 2, 2020 at 11:28 AM Andi Kleen <[email protected]> wrote:
> > > >
> > > > > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > > > > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > > > > +
> > > > > + if (prev_cgrp != next_cgrp)
> > > > > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> > > >
> > > > Seems to be the perf cgroup only, not all cgroups.
> > > > That's a big difference and needs to be documented properly.
> > > >
> > > We care about the all-cgroup case.
> >
> > Then it's not correct I think. You need a different hook point.
> >
> I realize that ;-(

If we want to count any cgroup changes, I think we can compare
task->cgroups (css_set) here instead.
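
Something like this, maybe (untested sketch, not a real patch):

	/* compare the whole css_set rather than the perf_event cgroup,
	 * so that any cgroup change is counted */
	if (rcu_access_pointer(prev->cgroups) !=
	    rcu_access_pointer(next->cgroups))
		perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);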

Thanks,
Namhyung

2020-12-03 01:10:43

by Namhyung Kim

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

Hi Peter,

On Thu, Dec 3, 2020 at 1:19 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Dec 03, 2020 at 12:02:04AM +0900, Namhyung Kim wrote:
>
> > +#ifdef CONFIG_CGROUP_PERF
> > +static inline void
> > +perf_sw_event_cgroup_switch(struct task_struct *prev, struct task_struct *next)
> > +{
> > + struct cgroup *prev_cgrp, *next_cgrp;
> > +
> > + rcu_read_lock();
> > +
> > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > +
> > + if (prev_cgrp != next_cgrp)
> > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> > +
> > + rcu_read_unlock();
> > +}
> > +#else
> > +static inline void perf_sw_event_cgroup_switch(struct task_struct *prev,
> > + struct task_struct *next) {}
> > +#endif /* CONFIG_CGROUP_PERF */
> > +
> > extern struct static_key_false perf_sched_events;
> >
> > static __always_inline bool
> > @@ -1220,6 +1241,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
> > struct task_struct *next)
> > {
> > perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> > + perf_sw_event_cgroup_switch(prev, next);
> >
> > if (static_branch_unlikely(&perf_sched_events))
> > __perf_event_task_sched_out(prev, next);
>
> Urgh.. that's horrible, try something like this.
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 9a38f579bc76..5eb284819ee5 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1174,25 +1174,19 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
> * which is guaranteed by us not actually scheduling inside other swevents
> * because those disable preemption.
> */
> -static __always_inline void
> -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> +static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> {
> - if (static_key_false(&perf_swevent_enabled[event_id])) {
> - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
>
> - perf_fetch_caller_regs(regs);
> - ___perf_sw_event(event_id, nr, regs, addr);
> - }
> + perf_fetch_caller_regs(regs);
> + ___perf_sw_event(event_id, nr, regs, addr);
> }
>
> extern struct static_key_false perf_sched_events;
>
> -static __always_inline bool
> -perf_sw_migrate_enabled(void)
> +static __always_inline bool __perf_sw_enabled(int swevt)
> {
> - if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
> - return true;
> - return false;
> + return static_key_false(&perf_swevent_enabled[swevt]);
> }
>
> static inline void perf_event_task_migrate(struct task_struct *task)
> @@ -1207,11 +1201,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> if (static_branch_unlikely(&perf_sched_events))
> __perf_event_task_sched_in(prev, task);
>
> - if (perf_sw_migrate_enabled() && task->sched_migrated) {
> - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> -
> - perf_fetch_caller_regs(regs);
> - ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
> + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
> + task->sched_migrated) {
> + __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
> task->sched_migrated = 0;
> }
> }
> @@ -1219,7 +1211,13 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> static inline void perf_event_task_sched_out(struct task_struct *prev,
> struct task_struct *next)
> {
> - perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> + if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
> + __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> +
> + if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
> + (task_css_check(prev, perf_event_cgrp_id, 1)->cgroup !=
> + task_css_check(next, perf_event_cgrp_id, 1)->cgroup))
> + __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);

Right, it needs to check only when the event is enabled.

Thanks,
Namhyung


>
> if (static_branch_unlikely(&perf_sched_events))
> __perf_event_task_sched_out(prev, next);
> @@ -1475,8 +1473,6 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
> static inline void
> perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { }
> static inline void
> -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { }
> -static inline void
> perf_bp_event(struct perf_event *event, void *data) { }
>
> static inline int perf_register_guest_info_callbacks

2020-12-03 02:13:16

by Namhyung Kim

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Thu, Dec 3, 2020 at 1:19 AM Peter Zijlstra <[email protected]> wrote:

> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 9a38f579bc76..5eb284819ee5 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1174,25 +1174,19 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
> * which is guaranteed by us not actually scheduling inside other swevents
> * because those disable preemption.
> */
> -static __always_inline void
> -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> +static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)

It'd be nice to avoid the __ prefix if possible.


> {
> - if (static_key_false(&perf_swevent_enabled[event_id])) {
> - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
>
> - perf_fetch_caller_regs(regs);
> - ___perf_sw_event(event_id, nr, regs, addr);
> - }
> + perf_fetch_caller_regs(regs);
> + ___perf_sw_event(event_id, nr, regs, addr);
> }
>
> extern struct static_key_false perf_sched_events;
>
> -static __always_inline bool
> -perf_sw_migrate_enabled(void)
> +static __always_inline bool __perf_sw_enabled(int swevt)

Ditto.


> {
> - if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
> - return true;
> - return false;
> + return static_key_false(&perf_swevent_enabled[swevt]);
> }
>
> static inline void perf_event_task_migrate(struct task_struct *task)
> @@ -1207,11 +1201,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> if (static_branch_unlikely(&perf_sched_events))
> __perf_event_task_sched_in(prev, task);
>
> - if (perf_sw_migrate_enabled() && task->sched_migrated) {
> - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> -
> - perf_fetch_caller_regs(regs);
> - ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
> + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
> + task->sched_migrated) {

It seems task->sched_migrated is set only if the event is enabled,
so can we just check the value here?


> + __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
> task->sched_migrated = 0;
> }
> }
> @@ -1219,7 +1211,13 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> static inline void perf_event_task_sched_out(struct task_struct *prev,
> struct task_struct *next)
> {
> - perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> + if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
> + __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> +
> + if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
> + (task_css_check(prev, perf_event_cgrp_id, 1)->cgroup !=
> + task_css_check(next, perf_event_cgrp_id, 1)->cgroup))
> + __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);

I was not clear about the RCU protection here. Is it ok to access
the task's css_set directly?

Thanks,
Namhyung

>
> if (static_branch_unlikely(&perf_sched_events))
> __perf_event_task_sched_out(prev, next);
> @@ -1475,8 +1473,6 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
> static inline void
> perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { }
> static inline void
> -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { }
> -static inline void
> perf_bp_event(struct perf_event *event, void *data) { }
>
> static inline int perf_register_guest_info_callbacks

2020-12-03 07:48:16

by Peter Zijlstra

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Thu, Dec 03, 2020 at 11:10:30AM +0900, Namhyung Kim wrote:
> On Thu, Dec 3, 2020 at 1:19 AM Peter Zijlstra <[email protected]> wrote:
>
> > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > index 9a38f579bc76..5eb284819ee5 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -1174,25 +1174,19 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
> > * which is guaranteed by us not actually scheduling inside other swevents
> > * because those disable preemption.
> > */
> > -static __always_inline void
> > -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> > +static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
>
> It'd be nice to avoid the __ prefix if possible.

Not having __ would seem to suggest it's a function of generic utility.
Still, *shrug* ;-)

> > {
> > - if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
> > - return true;
> > - return false;
> > + return static_key_false(&perf_swevent_enabled[swevt]);
> > }
> >
> > static inline void perf_event_task_migrate(struct task_struct *task)
> > @@ -1207,11 +1201,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> > if (static_branch_unlikely(&perf_sched_events))
> > __perf_event_task_sched_in(prev, task);
> >
> > - if (perf_sw_migrate_enabled() && task->sched_migrated) {
> > - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> > -
> > - perf_fetch_caller_regs(regs);
> > - ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
> > + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
> > + task->sched_migrated) {
>
> It seems task->sched_migrated is set only if the event is enabled,
> so can we just check the value here?

Why suffer the unconditional load and test? Your L1 too big?

> > + __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
> > task->sched_migrated = 0;
> > }
> > }
> > @@ -1219,7 +1211,13 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> > static inline void perf_event_task_sched_out(struct task_struct *prev,
> > struct task_struct *next)
> > {
> > - perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> > + if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
> > + __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> > +
> > + if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
> > + (task_css_check(prev, perf_event_cgrp_id, 1)->cgroup !=
> > + task_css_check(next, perf_event_cgrp_id, 1)->cgroup))
> > + __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
>
> I was not clear about the RCU protection here. Is it ok to access
> the task's css_set directly?

We're here with preemption and IRQs disabled, good luck trying to get
RCU to consider that not a critical section and spirit things away under
us.

2020-12-03 12:25:05

by Peter Zijlstra

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Wed, Dec 02, 2020 at 11:28:28AM -0800, Andi Kleen wrote:
> > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > +
> > + if (prev_cgrp != next_cgrp)
> > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
>
> Seems to be the perf cgroup only, not all cgroups.
> That's a big difference and needs to be documented properly.

With cgroup-v2 that's all the same, no?

2020-12-04 07:30:59

by Namhyung Kim

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Thu, Dec 3, 2020 at 4:45 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Dec 03, 2020 at 11:10:30AM +0900, Namhyung Kim wrote:
> > On Thu, Dec 3, 2020 at 1:19 AM Peter Zijlstra <[email protected]> wrote:
> >
> > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> > > index 9a38f579bc76..5eb284819ee5 100644
> > > --- a/include/linux/perf_event.h
> > > +++ b/include/linux/perf_event.h
> > > @@ -1174,25 +1174,19 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
> > > * which is guaranteed by us not actually scheduling inside other swevents
> > > * because those disable preemption.
> > > */
> > > -static __always_inline void
> > > -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> > > +static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
> >
> > It'd be nice to avoid the __ prefix if possible.
>
> Not having __ would seem to suggest it's a function of generic utility.
> Still, *shrug* ;-)

Ok, noted.

>
> > > {
> > > - if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
> > > - return true;
> > > - return false;
> > > + return static_key_false(&perf_swevent_enabled[swevt]);
> > > }
> > >
> > > static inline void perf_event_task_migrate(struct task_struct *task)
> > > @@ -1207,11 +1201,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> > > if (static_branch_unlikely(&perf_sched_events))
> > > __perf_event_task_sched_in(prev, task);
> > >
> > > - if (perf_sw_migrate_enabled() && task->sched_migrated) {
> > > - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
> > > -
> > > - perf_fetch_caller_regs(regs);
> > > - ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
> > > + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
> > > + task->sched_migrated) {
> >
> > It seems task->sched_migrated is set only if the event is enabled,
> > so can we just check the value here?
>
> Why suffer the unconditional load and test? Your L1 too big?

I just wanted to avoid typing long lines.. ;-p

>
> > > + __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
> > > task->sched_migrated = 0;
> > > }
> > > }
> > > @@ -1219,7 +1211,13 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
> > > static inline void perf_event_task_sched_out(struct task_struct *prev,
> > > struct task_struct *next)
> > > {
> > > - perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> > > + if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
> > > + __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
> > > +
> > > + if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
> > > + (task_css_check(prev, perf_event_cgrp_id, 1)->cgroup !=
> > > + task_css_check(next, perf_event_cgrp_id, 1)->cgroup))
> > > + __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> >
> > I was not clear about the RCU protection here. Is it ok to access
> > the task's css_set directly?
>
> We're here with preemption and IRQs disabled, good luck trying to get
> RCU to consider that not a critical section and spirit things away under
> us.

Ok, someday I'll go read the RCU code.. :)

Thanks,
Namhyung

2020-12-04 07:32:15

by Namhyung Kim

Subject: Re: [RFC 1/2] perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

On Thu, Dec 3, 2020 at 9:20 PM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Dec 02, 2020 at 11:28:28AM -0800, Andi Kleen wrote:
> > > + prev_cgrp = task_css_check(prev, perf_event_cgrp_id, 1)->cgroup;
> > > + next_cgrp = task_css_check(next, perf_event_cgrp_id, 1)->cgroup;
> > > +
> > > + if (prev_cgrp != next_cgrp)
> > > + perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
> >
> > Seems to be the perf cgroup only, not all cgroups.
> > That's a big difference and needs to be documented properly.
>
> With cgroup-v2 that's all the same, no?

Right, but unfortunately it seems cgroup v1 is still widely used.

Thanks,
Namhyung