There are already two events for context switches, namely
the tracepoint sched:sched_switch and the software event
context_switches. Unfortunately neither is suitable for
use by non-privileged users for the purpose of synchronizing
hardware trace data (e.g. Intel PT) to the context switch.
Tracepoints are no good at all for non-privileged users
because they need either CAP_SYS_ADMIN or
/proc/sys/kernel/perf_event_paranoid <= -1.
On the other hand, kernel software events need either
CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.
Many distributions now default perf_event_paranoid to 1,
which makes context_switches a contender, except it has
another problem (shared with sched:sched_switch): it is
recorded before perf schedules events out rather than after
perf schedules events in. A privileged user can see all
the events anyway, but a non-privileged user only sees
events for their own processes; in other words they see
when their process was scheduled out, not when it was
scheduled in. That presents two problems for using the
event: 1. the information comes too late, so tools have to
look ahead in the event stream to find out what the current
state is; 2. if they are unlucky, tracing might have
stopped before the context-switches event is recorded.
This new PERF_RECORD_SWITCH event does not have those problems
and it also has a couple of other small advantages. It is
easier to use because it is an auxiliary event (like mmap,
comm and task events) which can be enabled by setting a single
bit. It is smaller than sched:sched_switch and easier to parse.
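For illustration, requesting the new record from a tool could look
roughly like the sketch below. This is untested and assumes the uapi
header from this patch (i.e. that perf_event_attr has the proposed
context_switch bit); the dummy software event is just a convenient
carrier for the sideband data:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Sketch only: assumes perf_event_attr has the context_switch bit
 * proposed below.  Opens a dummy event whose only purpose is to make
 * the kernel emit the proposed PERF_RECORD_SWITCH records for the
 * given pid into the event's ring buffer. */
static int open_switch_tracking(pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_DUMMY;
        attr.sample_id_all = 1;
        attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CPU;
        attr.context_switch = 1;        /* the single bit discussed above */

        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}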
One quirk of this implementation is that, where possible,
it scrounges the event data from the ID sample instead of
deriving it twice.
Also I haven't tested this patch yet, so it is RFC at the
moment.
Signed-off-by: Adrian Hunter <[email protected]>
---
include/uapi/linux/perf_event.h | 15 +++++-
kernel/events/core.c | 108 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 31b10b02db75..f5403660c9f8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -332,7 +332,8 @@ struct perf_event_attr {
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
use_clockid : 1, /* use @clockid for time fields */
- __reserved_1 : 38;
+ context_switch : 1, /* context switch data */
+ __reserved_1 : 37;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -812,6 +813,18 @@ enum perf_event_type {
*/
PERF_RECORD_ITRACE_START = 12,
+ /*
+ *
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u32 pid, tid;
+ * u64 time;
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_SWITCH = 13,
+
PERF_RECORD_MAX, /* non-ABI */
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30c7374bd263..eda26c464f7c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -161,6 +161,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -2822,6 +2823,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
perf_ctx_unlock(cpuctx, ctx);
}
+static void perf_event_switch(struct task_struct *task);
+
/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
@@ -2856,6 +2859,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
+
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3479,6 +3485,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
+ if (event->attr.context_switch) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
@@ -6205,6 +6215,100 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
}
/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 time;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+ struct perf_switch_event *switch_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct task_struct *task = switch_event->task;
+ int size = switch_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ sample.tid_entry.pid = -1;
+ sample.tid_entry.tid = -1;
+ sample.time = -1;
+
+ perf_event_header__init_id(&switch_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ switch_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ if (sample.tid_entry.pid == -1)
+ switch_event->event_id.pid = perf_event_pid(event, task);
+ else
+ switch_event->event_id.pid = sample.tid_entry.pid;
+
+ if (sample.tid_entry.tid == -1)
+ switch_event->event_id.tid = perf_event_tid(event, task);
+ else
+ switch_event->event_id.tid = sample.tid_entry.tid;
+
+ if (sample.time == (u64)-1)
+ switch_event->event_id.time = perf_event_clock(event);
+ else
+ switch_event->event_id.time = sample.time;
+
+ perf_output_put(&handle, switch_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ switch_event->event_id.header.size = size;
+}
+
+static void perf_event_switch(struct task_struct *task)
+{
+ struct perf_switch_event switch_event;
+
+ if (!atomic_read(&nr_switch_events))
+ return;
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_SWITCH,
+ .misc = 0,
+ .size = sizeof(switch_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ /* .time */
+ },
+ };
+
+ perf_event_aux(perf_event_switch_output,
+ &switch_event,
+ NULL);
+}
+
+/*
* IRQ throttle logging
*/
@@ -7707,6 +7811,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ static_key_slow_inc(&perf_sched_events.key);
+ }
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
--
1.9.1
On Tue, Jun 09, 2015 at 05:21:10PM +0300, Adrian Hunter wrote:
> Tracepoints are no good at all for non-privileged users
> because they need either CAP_SYS_ADMIN or
> /proc/sys/kernel/perf_event_paranoid <= -1.
>
> On the other hand, kernel software events need either
> CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.
So while I think it makes sense to allow some tracepoint outside of that
priv level, IOW have a per tracepoint priv level filter thingy, I don't
think sched_switch() is one of those because it explicitly exposes
timing information on other tasks.
> This new PERF_RECORD_SWITCH event does not have those problems
> and it also has a couple of other small advantages. It is
> easier to use because it is an auxiliary event (like mmap,
> comm and task events) which can be enabled by setting a single
> bit. It is smaller than sched:sched_switch and easier to parse.
Right, so the one wee problem I have is that this only provides sched_in
data, I imagine people might be interested in sched_out as well.
Typically the switch event provides prev and next and thereby is
complete, but since we're limiting it to the one specific task, we'll
not have the sched_out data.
> @@ -812,6 +813,18 @@ enum perf_event_type {
> */
> PERF_RECORD_ITRACE_START = 12,
>
> + /*
> + *
> + *
> + * struct {
> + * struct perf_event_header header;
> + * u32 pid, tid;
> + * u64 time;
all 3 are already part of sample_id.
> + * struct sample_id sample_id;
> + * };
> + */
> + PERF_RECORD_SWITCH = 13,
On Thu, Jun 11, 2015 at 04:15:48PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 09, 2015 at 05:21:10PM +0300, Adrian Hunter wrote:
> > Tracepoints are no good at all for non-privileged users
> > because they need either CAP_SYS_ADMIN or
> > /proc/sys/kernel/perf_event_paranoid <= -1.
> >
> > On the other hand, kernel software events need either
> > CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.
>
> So while I think it makes sense to allow some tracepoint outside of that
> priv level, IOW have a per tracepoint priv level filter thingy, I don't
> think sched_switch() is one of those because it explicitly exposes
> timing information on other tasks.
It's trivial for a running program to measure when it gets context switched
by looking at timing. I don't think this event provides anything new over that.
-Andi
On Thu, Jun 11, 2015 at 09:34:30AM -0700, Andi Kleen wrote:
> On Thu, Jun 11, 2015 at 04:15:48PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 09, 2015 at 05:21:10PM +0300, Adrian Hunter wrote:
> > > Tracepoints are no good at all for non-privileged users
> > > because they need either CAP_SYS_ADMIN or
> > > /proc/sys/kernel/perf_event_paranoid <= -1.
> > >
> > > On the other hand, kernel software events need either
> > > CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.
> >
> > So while I think it makes sense to allow some tracepoint outside of that
> > priv level, IOW have a per tracepoint priv level filter thingy, I don't
> > think sched_switch() is one of those because it explicitly exposes
> > timing information on other tasks.
>
> It's trivial for a running program to measure when it gets context switched
> by looking at timing. I don't think this event provides anything new over that.
The _existing_ ones expose timing of _other_ tasks, the proposed one does
not.
On 6/11/15 8:15 AM, Peter Zijlstra wrote:
>> This new PERF_RECORD_SWITCH event does not have those problems
>> and it also has a couple of other small advantages. It is
>> easier to use because it is an auxiliary event (like mmap,
>> comm and task events) which can be enabled by setting a single
>> bit. It is smaller than sched:sched_switch and easier to parse.
>
> Right, so the one wee problem I have is that this only provides sched_in
> data, I imagine people might be interested in sched_out as well.
Yes and with the option of collecting callchains for sched_out.
David
On 12/06/15 03:47, David Ahern wrote:
> On 6/11/15 8:15 AM, Peter Zijlstra wrote:
>>> This new PERF_RECORD_SWITCH event does not have those problems
>>> and it also has a couple of other small advantages. It is
>>> easier to use because it is an auxiliary event (like mmap,
>>> comm and task events) which can be enabled by setting a single
>>> bit. It is smaller than sched:sched_switch and easier to parse.
>>
>> Right, so the one wee problem I have is that this only provides sched_in
>> data, I imagine people might be interested in sched_out as well.
>
> Yes and with the option of collecting callchains for sched_out.
So what do you want that is different from
PERF_COUNT_SW_CONTEXT_SWITCHES? And why?
On 11/06/15 17:15, Peter Zijlstra wrote:
> On Tue, Jun 09, 2015 at 05:21:10PM +0300, Adrian Hunter wrote:
>> Tracepoints are no good at all for non-privileged users
>> because they need either CAP_SYS_ADMIN or
>> /proc/sys/kernel/perf_event_paranoid <= -1.
>>
>> On the other hand, kernel software events need either
>> CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1.
>
> So while I think it makes sense to allow some tracepoint outside of that
> priv level, IOW have a per tracepoint priv level filter thingy, I don't
> think sched_switch() is one of those because it explicitly exposes
> timing information on other tasks.
>
>> This new PERF_RECORD_SWITCH event does not have those problems
>> and it also has a couple of other small advantages. It is
>> easier to use because it is an auxiliary event (like mmap,
>> comm and task events) which can be enabled by setting a single
>> bit. It is smaller than sched:sched_switch and easier to parse.
>
> Right, so the one wee problem I have is that this only provides sched_in
> data, I imagine people might be interested in sched_out as well.
That is not a problem although it would be interesting to know the use-case.
To me it seemed unreasonable to expect to analyze scheduler behaviour
without admin-level privileges since it is inherently an administrative
activity.
>
> Typically the switch event provides prev and next and thereby is
> complete, but since we're limiting it to the one specific task, we'll
> not have the sched_out data.
That makes sense for completeness, but as I wrote, it would be interesting
to know what someone might actually use that for.
>
>> @@ -812,6 +813,18 @@ enum perf_event_type {
>> */
>> PERF_RECORD_ITRACE_START = 12,
>>
>> + /*
>> + *
>> + *
>> + * struct {
>> + * struct perf_event_header header;
>> + * u32 pid, tid;
>> + * u64 time;
>
> all 3 are already part of sample_id.
You have to decide whether you expect to be able to use an event without
sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
in sample_id etc. So it currently looks like we expect to be able to use an
event without requiring sample_id.
It doesn't affect my case either way because perf tools always sets
sample_id_all if it can.
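To make that concrete, here is roughly what a consumer would find in the
ring buffer in each case (sketch only; the exact sample_id contents
depend on the event's sample_type, and the structs assume the patched
<linux/perf_event.h>):

#include <linux/perf_event.h>

/* As proposed in the patch: the body duplicates pid/tid/time, so the
 * record is usable even without sample_id_all. */
struct record_switch_proposed {
        struct perf_event_header header;   /* type = PERF_RECORD_SWITCH */
        __u32 pid, tid;
        __u64 time;
        /* struct sample_id sample_id;  -- present if sample_id_all is set */
};

/* Without the duplication: just the header, with pid/tid/time/cpu
 * recovered from the trailing sample_id, which then becomes mandatory. */
struct record_switch_minimal {
        struct perf_event_header header;
        /* struct sample_id sample_id; */
};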
>
>> + * struct sample_id sample_id;
>> + * };
>> + */
>> + PERF_RECORD_SWITCH = 13,
>
>
On Fri, Jun 12, 2015 at 02:12:11PM +0300, Adrian Hunter wrote:
> On 11/06/15 17:15, Peter Zijlstra wrote:
> > Right, so the one wee problem I have is that this only provides sched_in
> > data, I imagine people might be interested in sched_out as well.
>
> That is not a problem although it would be interesting to know the use-case.
> To me it seemed unreasonable to expect to analyze scheduler behaviour
> without admin-level privileges since it is inherently an administrative
> activity.
I was more thinking about it being used to track event duration inside a
task. Say you want to measure the time between event A and event B but got
scheduled out in between.
---- A ----] .... [---- B -----
If you do not have the sched_out time, you cannot correct for that.
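In other words, the correction needs both timestamps; a trivial sketch
(hypothetical helper, all timestamps taken from the same clock):

#include <stdint.h>

/* t_a, t_b: timestamps of events A and B; t_out, t_in: the sched_out
 * and sched_in timestamps of the gap between them.  Without t_out the
 * off-CPU gap cannot be subtracted. */
static uint64_t ab_on_cpu_time(uint64_t t_a, uint64_t t_b,
                               uint64_t t_out, uint64_t t_in)
{
        return (t_b - t_a) - (t_in - t_out);
}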
> > all 3 are already part of sample_id.
>
> You have to decide whether you expect to be able to use an event without
> sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
> has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
> in sample_id etc. So it currently looks like we expect to be able to use an
> event without requiring sample_id.
I think we recently had this discussion:
lkml.kernel.org/r/[email protected]
The patch we ended up merging:
f38b0dbb491a ("perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES")
Does indeed require sample_id.
Em Fri, Jun 12, 2015 at 02:09:38PM +0200, Peter Zijlstra escreveu:
> On Fri, Jun 12, 2015 at 02:12:11PM +0300, Adrian Hunter wrote:
> > On 11/06/15 17:15, Peter Zijlstra wrote:
>
> > > Right, so the one wee problem I have is that this only provides sched_in
> > > data, I imagine people might be interested in sched_out as well.
> >
> > That is not a problem although it would be interesting to know the use-case.
> > To me it seemed unreasonable to expect to analyze scheduler behaviour
> > without admin-level privileges since it is inherently an administrative
> > activity.
>
> I was more thinking about it being used to track event duration inside a
> task. Say you want to measure the time between event A and event B but got
> scheduled out in between.
>
> ---- A ----] .... [---- B -----
>
> If you do not have the sched_out time, you cannot correct for that.
>
> > > all 3 are already part of sample_id.
> >
> > You have to decide whether you expect to be able to use an event without
> > sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
> > has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
> > in sample_id etc. So it currently looks like we expect to be able to use an
> > event without requiring sample_id.
The fact that there is this duplication is because sample_id_all came
after those events, but this new one being proposed doesn't have to do
it :-)
> I think we recently had this discussion:
>
> lkml.kernel.org/r/[email protected]
>
> The patch we ended up merging:
>
> f38b0dbb491a ("perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES")
>
> Does indeed require sample_id.
On 12/06/15 15:36, Arnaldo Carvalho de Melo wrote:
> Em Fri, Jun 12, 2015 at 02:09:38PM +0200, Peter Zijlstra escreveu:
>> On Fri, Jun 12, 2015 at 02:12:11PM +0300, Adrian Hunter wrote:
>>> On 11/06/15 17:15, Peter Zijlstra wrote:
>>
>>>> Right, so the one wee problem I have is that this only provides sched_in
>>>> data, I imagine people might be interested in sched_out as well.
>>>
>>> That is not a problem although it would be interesting to know the use-case.
>>> To me it seemed unreasonable to expect to analyze scheduler behaviour
>>> without admin-level privileges since it is inherently an administrative
>>> activity.
>>
>> I was more thinking about it being used to track event duration inside a
>> task. Say you want to measure the time between event A and event B but got
>> scheduled out in between.
>>
>> ---- A ----] .... [---- B -----
>>
>> If you do not have the sched_out time, you cannot correct for that.
Thanks for the example.
>>
>>>> all 3 are already part of sample_id.
>>>
>>> You have to decide whether you expect to be able to use an event without
>>> sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
>>> has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
>>> in sample_id etc. So it currently looks like we expect to be able to use an
>>> event without requiring sample_id.
>
> The fact that there is this duplication is because sample_id_all came
> after those events, but this new one being proposed doesn't have to do
> it :-)
Thanks, that's clear then. There will just need to be a flag to indicate
whether it is scheduling in or out.
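For example (hypothetical name and encoding, not part of the posted
patch), the in/out flag could live in header.misc so the record body
stays unchanged:

/* Hypothetical: a misc bit set on switch-out records; switch-in is
 * implied by its absence.  Assumes the patched header's
 * PERF_RECORD_SWITCH. */
#define PERF_RECORD_MISC_SWITCH_OUT     (1 << 13)

static int is_switch_out(const struct perf_event_header *header)
{
        return header->type == PERF_RECORD_SWITCH &&
               (header->misc & PERF_RECORD_MISC_SWITCH_OUT);
}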
>
>> I think we recently had this discussion:
>>
>> lkml.kernel.org/r/[email protected]
>>
>> The patch we ended up merging:
>>
>> f38b0dbb491a ("perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES")
>>
>> Does indeed require sample_id.
>
>
On Fri, 2015-06-12 at 14:15 +0100, Adrian Hunter wrote:
> >>>> all 3 are already part of sample_id.
> >>>
> >>> You have to decide whether you expect to be able to use an event without
> >>> sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
> >>> has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
> >>> in sample_id etc. So it currently looks like we expect to be able to use an
> >>> event without requiring sample_id.
> >
> > The fact that there is this duplication is because sample_id_all came
> > after those events, but this new one being proposed doesn't have to do
> > it :-)
>
> Thanks, that's clear then. There will just need to be a flag to indicate
> whether it is scheduling in or out.
Just a thought: wouldn't it be good to know which CPU we have been
scheduled from/to? This kind of information would be especially valuable
in heterogeneous systems.
Pawel
On Fri, 2015-06-12 at 14:28 +0100, Pawel Moll wrote:
> On Fri, 2015-06-12 at 14:15 +0100, Adrian Hunter wrote:
> > >>>> all 3 are already part of sample_id.
> > >>>
> > >>> You have to decide whether you expect to be able to use an event without
> > >>> sample_id. MMAP and MMAP2 both have pid, tid which are in sample_id, LOST
> > >>> has id, EXIT and FORK have time, all of the THROTTLE/UNTHROTTLE members are
> > >>> in sample_id etc. So it currently looks like we expect to be able to use an
> > >>> event without requiring sample_id.
> > >
> > > The fact that there is this duplication is because sample_id_all came
> > > after those events, but this new one being proposed doesn't have to do
> > > it :-)
> >
> > Thanks, that's clear then. There will just need to be a flag to indicate
> > whether it is scheduling in or out.
>
> Just a thought: wouldn't it be good to know which CPU we have been
> scheduled from/to? This kind of information would be especially valuable
> in heterogeneous systems.
Of course, cpu is a part of sample_id as well. I'm sorted out then :-)
Pawel
On 6/12/15 4:34 AM, Adrian Hunter wrote:
> On 12/06/15 03:47, David Ahern wrote:
>> On 6/11/15 8:15 AM, Peter Zijlstra wrote:
>>>> This new PERF_RECORD_SWITCH event does not have those problems
>>>> and it also has a couple of other small advantages. It is
>>>> easier to use because it is an auxiliary event (like mmap,
>>>> comm and task events) which can be enabled by setting a single
>>>> bit. It is smaller than sched:sched_switch and easier to parse.
>>>
>>> Right, so the one wee problem I have is that this only provides sched_in
>>> data, I imagine people might be interested in sched_out as well.
>>
>> Yes and with the option of collecting callchains for sched_out.
>
> So what do you want that is different from
> PERF_COUNT_SW_CONTEXT_SWITCHES? And why?
>
My 'perf sched timehist' command can use either sched_switch or context
switches. As you have pointed out, you need to collect them system wide
even if you only care about a subset of tasks. If you are going to add a
sched_in event, there is good symmetry in also having a sched_out that is
used and generated in a consistent manner.
On 6/12/15 5:12 AM, Adrian Hunter wrote:
>>> This new PERF_RECORD_SWITCH event does not have those problems
>>> and it also has a couple of other small advantages. It is
>>> easier to use because it is an auxiliary event (like mmap,
>>> comm and task events) which can be enabled by setting a single
>>> bit. It is smaller than sched:sched_switch and easier to parse.
>>
>> Right, so the one wee problem I have is that this only provides sched_in
>> data, I imagine people might be interested in sched_out as well.
>
> That is not a problem although it would be interesting to know the use-case.
> To me it seemed unreasonable to expect to analyze scheduler behaviour
> without admin-level privileges since it is inherently an administrative
> activity.
One use case is wanting to analyze a set of processes -- how long they
run when scheduled, where they are when scheduled out, scheduling delay
on wakeups, time between sched ins. I wrote the timehist tool in 2010 and
it has been really helpful for understanding what is happening on each cpu
and the characteristics of a set of processes (e.g. ping-ponging between
tasks sending messages to each other).
In this case it is not necessarily scheduler behavior (though that does
enter the picture to a degree), but rather the behavior of a process or
set of tasks.
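As a rough sketch of the first of those metrics (untested; assumes
matched in/out timestamps for a task, all taken from the same clock):

#include <stdint.h>

/* Per-task accumulator a timehist-style tool might keep. */
struct task_sched_stats {
        uint64_t last_in;       /* timestamp of the most recent sched in */
        uint64_t on_cpu;        /* total time spent running */
};

static void on_sched_in(struct task_sched_stats *s, uint64_t time)
{
        s->last_in = time;
}

static void on_sched_out(struct task_sched_stats *s, uint64_t time)
{
        if (s->last_in)
                s->on_cpu += time - s->last_in;
        s->last_in = 0;
}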
On 6/12/15 7:28 AM, Pawel Moll wrote:
>> Thanks, that's clear then. There will just need to be a flag to indicate
>> whether it is scheduling in or out.
>
> Just a thought: wouldn't it be good to know which CPU we have been
> scheduled from/to? This kind of information would be especially valuable
> in heterogeneous systems.
yes that is essential.
On 12/06/2015 5:21 p.m., David Ahern wrote:
> On 6/12/15 4:34 AM, Adrian Hunter wrote:
>> On 12/06/15 03:47, David Ahern wrote:
>>> On 6/11/15 8:15 AM, Peter Zijlstra wrote:
>>>>> This new PERF_RECORD_SWITCH event does not have those problems
>>>>> and it also has a couple of other small advantages. It is
>>>>> easier to use because it is an auxiliary event (like mmap,
>>>>> comm and task events) which can be enabled by setting a single
>>>>> bit. It is smaller than sched:sched_switch and easier to parse.
>>>>
>>>> Right, so the one wee problem I have is that this only provides sched_in
>>>> data, I imagine people might be interested in sched_out as well.
>>>
>>> Yes and with the option of collecting callchains for sched_out.
>>
>> So what do you want that is different from
>> PERF_COUNT_SW_CONTEXT_SWITCHES? And why?
>>
>
> My 'perf sched timehist' command can use either sched_switch or context
> switches. As you have pointed out, you need to collect them system wide even if
> you only care about a subset of tasks. If you are going to add a sched_in event,
> there is good symmetry in also having a sched_out that is used and generated
> in a consistent manner.
Yes, I will certainly add sched out, but the new thing is not a sample so it
does not have callchains. But presumably you could use
PERF_COUNT_SW_CONTEXT_SWITCHES as well?
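For reference, a sketch of doing that with the existing software event
(untested; still subject to the perf_event_paranoid restrictions
discussed at the top of the thread):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Sketch only: open PERF_COUNT_SW_CONTEXT_SWITCHES so that every
 * context switch of 'pid' produces a sample carrying a callchain. */
static int open_cs_with_callchain(pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
        attr.sample_period = 1;         /* one sample per switch */
        attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
                           PERF_SAMPLE_CPU | PERF_SAMPLE_CALLCHAIN;
        attr.sample_id_all = 1;

        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}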