Internal representations of task state are likely to be changed or ordered,
and reporting them to userspace without exporting them as part of API is not
a good choice, which can easily break a userspace observability tool as kernel
evolves. For example, perf suffers from this and still reports wrong states
by this patch.
OTOH, some masqueraded state like TASK_REPORT_IDLE and TASK_REPORT_MAX are
also reported inadvertently, which confuses things even more.
So report task state in symbolic char instead, which is self-explaining and
no further translation is needed.
Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
conventions for the rest.
Signed-off-by: Ze Gao <[email protected]>
---
include/trace/events/sched.h | 41 +++++++++++++-----------------------
1 file changed, 15 insertions(+), 26 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..51102a7c0c2d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
#define _TRACE_SCHED_H
#include <linux/kthread.h>
+#include <linux/sched.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
@@ -187,7 +188,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt,
+static inline const char __trace_sched_switch_state(bool preempt,
unsigned int prev_state,
struct task_struct *p)
{
@@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */
- /*
- * Preemption ignores task state, therefore preempted tasks are always
- * RUNNING (we will not have dequeued if state != RUNNING).
- */
- if (preempt)
- return TASK_REPORT_MAX;
-
/*
* task_state_index() uses fls() and returns a value from 0-8 range.
* Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
@@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
*/
state = __task_state_index(prev_state, p->exit_state);
- return state ? (1 << (state - 1)) : state;
+ /*
+ * Preemption ignores task state, therefore preempted tasks are always
+ * RUNNING (we will not have dequeued if state != RUNNING).
+ * Here, we use 'p' to denote this case and only for this case.
+ */
+ if (preempt)
+ return 'p';
+
+
+ return task_index_to_char(state);
}
#endif /* CREATE_TRACE_POINTS */
@@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( int, prev_prio )
- __field( long, prev_state )
+ __field( char, prev_state )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
__field( int, next_prio )
@@ -249,22 +252,8 @@ TRACE_EVENT(sched_switch,
/* XXX SCHED_DEADLINE */
),
- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
- __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
- (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
- __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
- { TASK_INTERRUPTIBLE, "S" },
- { TASK_UNINTERRUPTIBLE, "D" },
- { __TASK_STOPPED, "T" },
- { __TASK_TRACED, "t" },
- { EXIT_DEAD, "X" },
- { EXIT_ZOMBIE, "Z" },
- { TASK_PARKED, "P" },
- { TASK_DEAD, "I" }) :
- "R",
-
- __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);
--
2.40.1
On Tue, Jul 25, 2023 at 03:22:52PM +0800, Ze Gao wrote:
> @@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
> BUG_ON(p != current);
> #endif /* CONFIG_SCHED_DEBUG */
>
> - /*
> - * Preemption ignores task state, therefore preempted tasks are always
> - * RUNNING (we will not have dequeued if state != RUNNING).
> - */
> - if (preempt)
> - return TASK_REPORT_MAX;
> -
> /*
> * task_state_index() uses fls() and returns a value from 0-8 range.
> * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
> @@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
> */
> state = __task_state_index(prev_state, p->exit_state);
>
> - return state ? (1 << (state - 1)) : state;
> + /*
> + * Preemption ignores task state, therefore preempted tasks are always
> + * RUNNING (we will not have dequeued if state != RUNNING).
> + * Here, we use 'p' to denote this case and only for this case.
> + */
> + if (preempt)
> + return 'p';
> +
I don't get this move, why compute state before this return?
> +
> + return task_index_to_char(state);
> }
> #endif /* CREATE_TRACE_POINTS */
>
> @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> __array( char, prev_comm, TASK_COMM_LEN )
> __field( pid_t, prev_pid )
> __field( int, prev_prio )
> - __field( long, prev_state )
> + __field( char, prev_state )
> __array( char, next_comm, TASK_COMM_LEN )
> __field( pid_t, next_pid )
> __field( int, next_prio )
This is a format change and will likely break a ton of programs :/
On Tue, Jul 25, 2023 at 4:34 PM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Jul 25, 2023 at 03:22:52PM +0800, Ze Gao wrote:
>
> > @@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
> > BUG_ON(p != current);
> > #endif /* CONFIG_SCHED_DEBUG */
> >
> > - /*
> > - * Preemption ignores task state, therefore preempted tasks are always
> > - * RUNNING (we will not have dequeued if state != RUNNING).
> > - */
> > - if (preempt)
> > - return TASK_REPORT_MAX;
> > -
> > /*
> > * task_state_index() uses fls() and returns a value from 0-8 range.
> > * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
> > @@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
> > */
> > state = __task_state_index(prev_state, p->exit_state);
> >
> > - return state ? (1 << (state - 1)) : state;
> > + /*
> > + * Preemption ignores task state, therefore preempted tasks are always
> > + * RUNNING (we will not have dequeued if state != RUNNING).
> > + * Here, we use 'p' to denote this case and only for this case.
> > + */
> > + if (preempt)
> > + return 'p';
> > +
>
> I don't get this move, why compute state before this return?
Oops, I was going to ignore the PREEMP_ACTIVE in the first attempt
and changed it to 'state = 0'
which I decided to introduce 'p' to denote this after second thoughts.
Will fix it and revert this move.
> > +
> > + return task_index_to_char(state);
> > }
> > #endif /* CREATE_TRACE_POINTS */
> >
> > @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> > __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > __field( int, prev_prio )
> > - __field( long, prev_state )
> > + __field( char, prev_state )
> > __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > __field( int, next_prio )
>
> This is a format change and will likely break a ton of programs :/
Yeah, I admit that. And I believe this kind of break happens each
time the internal
task state constant mapping is rearranged, it's of no big difference
here, since the
most renowned perf itself is still broken at this time after. And
IMHO it's time to fix
this and do things correctly. That is why I propose this and mark it as RFC.
BTW, could you help to point to any possible tools/programs that would
break other
than perf/libtraceevent, because these two are the only users I run
into so far.
Regards,
Ze
On Tue, Jul 25, 2023 at 06:53:07PM +0800, Ze Gao wrote:
> > > @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> > > __array( char, prev_comm, TASK_COMM_LEN )
> > > __field( pid_t, prev_pid )
> > > __field( int, prev_prio )
> > > - __field( long, prev_state )
> > > + __field( char, prev_state )
> > > __array( char, next_comm, TASK_COMM_LEN )
> > > __field( pid_t, next_pid )
> > > __field( int, next_prio )
> >
> > This is a format change and will likely break a ton of programs :/
>
> BTW, could you help to point to any possible tools/programs that would
> break other than perf/libtraceevent, because these two are the only
> users I run into so far.
Latencytop was the one breaking a few years ago, but there's a metric
ton of sched_switch users out there, this is bound to generate pain.
Steve, you remember what the status of all this was? at the time
breaking this was considered on par with ABI breakage and we reverted or
something. Is this still so?
On Tue, 25 Jul 2023 15:22:52 +0800
Ze Gao <[email protected]> wrote:
> #ifdef CREATE_TRACE_POINTS
> -static inline long __trace_sched_switch_state(bool preempt,
> +static inline const char __trace_sched_switch_state(bool preempt,
Does it really need to be "const"?
> unsigned int prev_state,
> struct task_struct *p)
> {
> @@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
> BUG_ON(p != current);
> #endif /* CONFIG_SCHED_DEBUG */
>
> - /*
> - * Preemption ignores task state, therefore preempted tasks are always
> - * RUNNING (we will not have dequeued if state != RUNNING).
> - */
> - if (preempt)
> - return TASK_REPORT_MAX;
> -
> /*
> * task_state_index() uses fls() and returns a value from 0-8 range.
> * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
> @@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
> */
> state = __task_state_index(prev_state, p->exit_state);
>
> - return state ? (1 << (state - 1)) : state;
> + /*
> + * Preemption ignores task state, therefore preempted tasks are always
> + * RUNNING (we will not have dequeued if state != RUNNING).
> + * Here, we use 'p' to denote this case and only for this case.
> + */
> + if (preempt)
> + return 'p';
> +
> +
> + return task_index_to_char(state);
> }
> #endif /* CREATE_TRACE_POINTS */
>
> @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> __array( char, prev_comm, TASK_COMM_LEN )
> __field( pid_t, prev_pid )
> __field( int, prev_prio )
> - __field( long, prev_state )
> + __field( char, prev_state )
> __array( char, next_comm, TASK_COMM_LEN )
> __field( pid_t, next_pid )
> __field( int, next_prio )
This will break userspace. Just because you update libtraceevent
doesn't mean that it will get to all the users of it. There's still
tools that have the old method hard coded and doesn't use the library.
Now, because the old tools still do the parsing of this format, we can
add a new field called prev_state_char that will give you this. Now to
save space, we should change prev_state to int (can't make it short as
there's that test for "+" which does sometimes happen). I believe we
can make prev_prio and next prio into shorts (and possibly chars!).
-- Steve
> @@ -249,22 +252,8 @@ TRACE_EVENT(sched_switch,
> /* XXX SCHED_DEADLINE */
> ),
>
> - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
> - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> -
> - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> - { TASK_INTERRUPTIBLE, "S" },
> - { TASK_UNINTERRUPTIBLE, "D" },
> - { __TASK_STOPPED, "T" },
> - { __TASK_TRACED, "t" },
> - { EXIT_DEAD, "X" },
> - { EXIT_ZOMBIE, "Z" },
> - { TASK_PARKED, "P" },
> - { TASK_DEAD, "I" }) :
> - "R",
> -
> - __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
> + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
> __entry->next_comm, __entry->next_pid, __entry->next_prio)
On Tue, 25 Jul 2023 15:31:00 +0200
Peter Zijlstra <[email protected]> wrote:
> > > This is a format change and will likely break a ton of programs :/
> >
>
> > BTW, could you help to point to any possible tools/programs that would
> > break other than perf/libtraceevent, because these two are the only
> > users I run into so far.
>
> Latencytop was the one breaking a few years ago, but there's a metric
> ton of sched_switch users out there, this is bound to generate pain.
>
> Steve, you remember what the status of all this was? at the time
> breaking this was considered on par with ABI breakage and we reverted or
> something. Is this still so?
I did reply to the patch (before I switched to thread mode, and notice
that there were already replies ;-)
Pretty much all the tools have been switched over to libtraceevent, but
some just copied the code internally. Although, newer code (like
rasdaemon) are moving over to the newer shared library. Anyway, I did
post a possibly solution in my other email.
-- Steve
On Tue, Jul 25, 2023 at 01:59:38PM -0400, Steven Rostedt wrote:
> > @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> > __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > __field( int, prev_prio )
> > - __field( long, prev_state )
> > + __field( char, prev_state )
> > __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > __field( int, next_prio )
>
> This will break userspace. Just because you update libtraceevent
> doesn't mean that it will get to all the users of it. There's still
> tools that have the old method hard coded and doesn't use the library.
>
> Now, because the old tools still do the parsing of this format, we can
> add a new field called prev_state_char that will give you this. Now to
> save space, we should change prev_state to int (can't make it short as
> there's that test for "+" which does sometimes happen). I believe we
> can make prev_prio and next prio into shorts (and possibly chars!).
Or just leave the thing alone?
On Tue, 25 Jul 2023 22:16:03 +0200
Peter Zijlstra <[email protected]> wrote:
> > Now, because the old tools still do the parsing of this format, we can
> > add a new field called prev_state_char that will give you this. Now to
> > save space, we should change prev_state to int (can't make it short as
> > there's that test for "+" which does sometimes happen). I believe we
> > can make prev_prio and next prio into shorts (and possibly chars!).
>
> Or just leave the thing alone?
Sure, but I would like to change the fields to smaller ones just so
that the event wastes less space on the buffer.
-- Steve
I'd like to try this one, and request for another RFC.
Regards,
Ze
On Wed, Jul 26, 2023 at 5:55 AM Steven Rostedt <[email protected]> wrote:
>
> On Tue, 25 Jul 2023 22:16:03 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > > Now, because the old tools still do the parsing of this format, we can
> > > add a new field called prev_state_char that will give you this. Now to
> > > save space, we should change prev_state to int (can't make it short as
> > > there's that test for "+" which does sometimes happen). I believe we
> > > can make prev_prio and next prio into shorts (and possibly chars!).
> >
> > Or just leave the thing alone?
>
> Sure, but I would like to change the fields to smaller ones just so
> that the event wastes less space on the buffer.
>
> -- Steve
On Wed, Jul 26, 2023 at 1:59 AM Steven Rostedt <[email protected]> wrote:
>
> On Tue, 25 Jul 2023 15:22:52 +0800
> Ze Gao <[email protected]> wrote:
>
> > #ifdef CREATE_TRACE_POINTS
> > -static inline long __trace_sched_switch_state(bool preempt,
> > +static inline const char __trace_sched_switch_state(bool preempt,
>
> Does it really need to be "const"?
Not really since it's rvalue after all, will remove this.
> > unsigned int prev_state,
> > struct task_struct *p)
> > {
> > @@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
> > BUG_ON(p != current);
> > #endif /* CONFIG_SCHED_DEBUG */
> >
> > - /*
> > - * Preemption ignores task state, therefore preempted tasks are always
> > - * RUNNING (we will not have dequeued if state != RUNNING).
> > - */
> > - if (preempt)
> > - return TASK_REPORT_MAX;
> > -
> > /*
> > * task_state_index() uses fls() and returns a value from 0-8 range.
> > * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
> > @@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
> > */
> > state = __task_state_index(prev_state, p->exit_state);
> >
> > - return state ? (1 << (state - 1)) : state;
> > + /*
> > + * Preemption ignores task state, therefore preempted tasks are always
> > + * RUNNING (we will not have dequeued if state != RUNNING).
> > + * Here, we use 'p' to denote this case and only for this case.
> > + */
> > + if (preempt)
> > + return 'p';
> > +
> > +
> > + return task_index_to_char(state);
> > }
> > #endif /* CREATE_TRACE_POINTS */
> >
> > @@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
> > __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > __field( int, prev_prio )
> > - __field( long, prev_state )
> > + __field( char, prev_state )
> > __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > __field( int, next_prio )
>
> This will break userspace. Just because you update libtraceevent
> doesn't mean that it will get to all the users of it. There's still
> tools that have the old method hard coded and doesn't use the library.
>
> Now, because the old tools still do the parsing of this format, we can
> add a new field called prev_state_char that will give you this. Now to
> save space, we should change prev_state to int (can't make it short as
> there's that test for "+" which does sometimes happen). I believe we
> can make prev_prio and next prio into shorts (and possibly chars!).
>
> -- Steve
>
Agreed, and my first attempt was to not break anything here by
introducing a new "char", which is kinda redundant, but is a good way
for smooth transition to decouple userspace from kernel on this weird
design(of ABI).
However, given the fact the in-tree perf still suffers this, I doubt many
other tools could spare from it. And the best guess is that they are
still partially broken silently and reports wrong and confusing states.
And this is basically why I came up with this RFC and deliberately
let them fail loudly to do a painful but quick recovery.
Anyway, if we want to keep status quo, I can send over another two
patches to fix perf and libtraceevent specifically and leave the tracepoint
alone. But seriously, imho we might have no chance to correct this.
and without a formal abi, the raw value (let alone the masqueraded ones)
is just meaningless for userspace and that leaves it intended for kernel
developers only.
Thanks,
Ze