Internal representations of task state are likely to be changed or
reordered, and reporting them to userspace without exporting them as
part of the API is not a good choice: it can easily break userspace
observability tools as the kernel evolves. For example, perf suffers
from this and still reports wrong states as of this writing.

OTOH, some masqueraded states like TASK_REPORT_IDLE and TASK_REPORT_MAX
are also reported inadvertently, which confuses things even more.

So add a new field alongside the old raw value to report the task
state as a symbolic char, which is self-explanatory and needs no
further translation, and also report priorities as 'short' to save
some buffer space. Of course this does not break any existing
userspace tool.

Note: for PREEMPT_ACTIVE, we introduce 'p' to report it and use the
old conventions for the rest.
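
For reference, the symbolic characters follow task_index_to_char(),
abridged here from include/linux/sched.h (modulo kernel version):

static inline char task_index_to_char(unsigned int state)
{
	static const char state_char[] = "RSDTtXZPI";

	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);

	return state_char[state];
}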
Signed-off-by: Ze Gao <[email protected]>
---
include/trace/events/sched.h | 60 +++++++++++++++++++++---------------
1 file changed, 35 insertions(+), 25 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..0fcf68f49e45 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
#define _TRACE_SCHED_H
#include <linux/kthread.h>
+#include <linux/sched.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
@@ -214,6 +215,27 @@ static inline long __trace_sched_switch_state(bool preempt,
return state ? (1 << (state - 1)) : state;
}
+
+static inline char __trace_sched_switch_state_char(bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p)
+{
+ long state;
+
+#ifdef CONFIG_SCHED_DEBUG
+ BUG_ON(p != current);
+#endif /* CONFIG_SCHED_DEBUG */
+
+ /*
+ * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
+ * conventions for the rest.
+ */
+ if (preempt)
+ return 'p';
+
+ state = __task_state_index(prev_state, p->exit_state);
+ return task_index_to_char(state);
+}
#endif /* CREATE_TRACE_POINTS */
/*
@@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch,
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
- __field( int, prev_prio )
- __field( long, prev_state )
+ __field( short, prev_prio )
+ __field( int, prev_state )
+ __field( char, prev_state_char )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
- __field( int, next_prio )
+ __field( short, next_prio )
),
TP_fast_assign(
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
- __entry->prev_pid = prev->pid;
- __entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
+ __entry->prev_pid = prev->pid;
+ __entry->prev_prio = (short) prev->prio;
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
+ __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
- __entry->next_pid = next->pid;
- __entry->next_prio = next->prio;
+ __entry->next_pid = next->pid;
+ __entry->next_prio = (short) next->prio;
/* XXX SCHED_DEADLINE */
),
- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
- __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
- (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
- __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
- { TASK_INTERRUPTIBLE, "S" },
- { TASK_UNINTERRUPTIBLE, "D" },
- { __TASK_STOPPED, "T" },
- { __TASK_TRACED, "t" },
- { EXIT_DEAD, "X" },
- { EXIT_ZOMBIE, "Z" },
- { TASK_PARKED, "P" },
- { TASK_DEAD, "I" }) :
- "R",
-
- __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
- __entry->next_comm, __entry->next_pid, __entry->next_prio)
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
+ __entry->next_pid, __entry->next_prio)
);
/*
--
2.40.1
On Wed, Jul 26, 2023 at 08:16:16PM +0800, Ze Gao wrote:
> Internal representations of task state are likely to be changed or
> reordered, and reporting them to userspace without exporting them as
> part of the API is not a good choice: it can easily break userspace
> observability tools as the kernel evolves. For example, perf suffers
> from this and still reports wrong states as of this writing.
>
> OTOH, some masqueraded states like TASK_REPORT_IDLE and TASK_REPORT_MAX
> are also reported inadvertently, which confuses things even more.
>
> So add a new field alongside the old raw value to report the task
> state as a symbolic char, which is self-explanatory and needs no
> further translation, and also report priorities as 'short' to save
> some buffer space. Of course this does not break any existing
> userspace tool.
>
> Note: for PREEMPT_ACTIVE, we introduce 'p' to report it and use the
> old conventions for the rest.
So I really don't much like this. This loses the ability to see the
actual wait state flags; there could be multiple. E.g., things like
TASK_FREEZABLE get lost completely.

And this is on top of my reluctance to touch any of this at all, for
fear of further regressions.
On Mon, Jul 31, 2023 at 5:37 PM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Jul 26, 2023 at 08:16:16PM +0800, Ze Gao wrote:
> > Internal representations of task state are likely to be changed or
> > reordered, and reporting them to userspace without exporting them as
> > part of the API is not a good choice: it can easily break userspace
> > observability tools as the kernel evolves. For example, perf suffers
> > from this and still reports wrong states as of this writing.
> >
> > OTOH, some masqueraded states like TASK_REPORT_IDLE and TASK_REPORT_MAX
> > are also reported inadvertently, which confuses things even more.
> >
> > So add a new field alongside the old raw value to report the task
> > state as a symbolic char, which is self-explanatory and needs no
> > further translation, and also report priorities as 'short' to save
> > some buffer space. Of course this does not break any existing
> > userspace tool.
> >
> > Note: for PREEMPT_ACTIVE, we introduce 'p' to report it and use the
> > old conventions for the rest.
>
> So I really don't much like this. This loses the ability to see the
> actual wait state flags; there could be multiple. E.g., things like
> TASK_FREEZABLE get lost completely.
How about adding a new char for each distinct state you want to
report, e.g., 'F' for this one for debug purposes? I don't think we
have to lose the multiple flags just because we choose to report in
chars. People can still use the old raw value, since we supplement it
here rather than replace it.
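
Something like this, say (an untested sketch on top of this patch; the
'F' mapping for TASK_FREEZABLE is just an arbitrary example, not an
established convention):

static inline char __trace_sched_switch_state_char(bool preempt,
						   unsigned int prev_state,
						   struct task_struct *p)
{
	long state;

	if (preempt)
		return 'p';

	/*
	 * Hypothetical: give flags that __task_state_index() masks out
	 * their own characters, e.g. 'F' for TASK_FREEZABLE sleepers.
	 */
	if (prev_state & TASK_FREEZABLE)
		return 'F';

	state = __task_state_index(prev_state, p->exit_state);
	return task_index_to_char(state);
}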
Regards,
Ze
On Mon, Jul 31, 2023 at 5:37 PM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Jul 26, 2023 at 08:16:16PM +0800, Ze Gao wrote:
> > Internal representations of task state are likely to be changed or
> > reordered, and reporting them to userspace without exporting them as
> > part of the API is not a good choice: it can easily break userspace
> > observability tools as the kernel evolves. For example, perf suffers
> > from this and still reports wrong states as of this writing.
> >
> > OTOH, some masqueraded states like TASK_REPORT_IDLE and TASK_REPORT_MAX
> > are also reported inadvertently, which confuses things even more.
> >
> > So add a new field alongside the old raw value to report the task
> > state as a symbolic char, which is self-explanatory and needs no
> > further translation, and also report priorities as 'short' to save
> > some buffer space. Of course this does not break any existing
> > userspace tool.
> >
> > Note: for PREEMPT_ACTIVE, we introduce 'p' to report it and use the
> > old conventions for the rest.
>
> So I really don't much like this. This loses the ability to see the
> actual wait state flags; there could be multiple. E.g., things like
> TASK_FREEZABLE get lost completely.
Also, IIRC, TASK_FREEZABLE, which is defined as 0x2000, is already lost
in the current implementation of __trace_sched_switch_state(), which
only lets states at or below TASK_REPORT_IDLE (plus PREEMPT_ACTIVE) be
reported. So I do not believe you can achieve this by just leaving
things alone.
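
For reference, that masking happens in __task_state_index(), roughly
(abridged from include/linux/sched.h; details vary across versions):

static inline unsigned int __task_state_index(unsigned int tsk_state,
					      unsigned int tsk_exit_state)
{
	/* TASK_REPORT is 0x7f, so TASK_FREEZABLE (0x2000) is dropped here. */
	unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;

	if ((tsk_state & TASK_IDLE) == TASK_IDLE)
		state = TASK_REPORT_IDLE;

	return fls(state);
}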
Regards,
Ze
On Wed, 26 Jul 2023 20:16:16 +0800
Ze Gao <[email protected]> wrote:
> @@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch,
> TP_STRUCT__entry(
> __array( char, prev_comm, TASK_COMM_LEN )
> __field( pid_t, prev_pid )
> - __field( int, prev_prio )
> - __field( long, prev_state )
> + __field( short, prev_prio )
> + __field( int, prev_state )
> + __field( char, prev_state_char )
> __array( char, next_comm, TASK_COMM_LEN )
> __field( pid_t, next_pid )
> - __field( int, next_prio )
> + __field( short, next_prio )
> ),
The above adds a bunch of holes. This needs to be reordered to condense
the event; we don't want to increase it. libtraceevent will handle the
reordering.
The above produces:
struct {
char prev_comm[16];
pid_t prev_pid;
short prev_prio; <-- 2 character padding
int prev_state;
char prev_state_char;
char next_comm[16]; <-- 3 character padding
pid_t next_pid;
short next_prio; <-- 2 character padding
};
(all events are at least 4-byte aligned, and are a multiple of 4 bytes
in size, thus that last short for next_prio did nothing)

The above is a total of 56 bytes (note: that is the same as the current
sched_switch event size).
What the above should be:
TP_STRUCT__entry(
__field( pid_t, prev_pid )
__field( pid_t, next_pid )
__field( short, prev_prio )
__field( short, next_prio )
__field( int, prev_state )
__array( char, prev_comm, TASK_COMM_LEN )
__array( char, next_comm, TASK_COMM_LEN )
__field( char, prev_state_char )
),
Which would be:
struct {
pid_t prev_pid;
pid_t next_pid;
short prev_prio;
short next_prio;
int prev_state;
char prev_comm[16];
char next_comm[16];
char prev_state_char; <-- 3 characters of padding
}
which would be 52 bytes. Saving us 4 bytes per event. Which is a big deal!
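
If anyone wants to sanity-check those numbers outside the tracing
macros, here is a stand-alone userspace sketch with plain C structs
mirroring the two field orders (the struct names are made up for the
example; the real event record also carries a common trace header in
front, which shifts offsets but not these per-event sizes):

#include <stdio.h>
#include <sys/types.h>

#define TASK_COMM_LEN 16

struct v2_order {			/* field order as posted in v2 */
	char prev_comm[TASK_COMM_LEN];
	pid_t prev_pid;
	short prev_prio;		/* 2 bytes of padding follow */
	int prev_state;
	char prev_state_char;
	char next_comm[TASK_COMM_LEN];	/* 3 bytes of padding follow */
	pid_t next_pid;
	short next_prio;		/* 2 bytes of tail padding */
};

struct reordered {			/* suggested field order */
	pid_t prev_pid;
	pid_t next_pid;
	short prev_prio;
	short next_prio;
	int prev_state;
	char prev_comm[TASK_COMM_LEN];
	char next_comm[TASK_COMM_LEN];
	char prev_state_char;		/* 3 bytes of tail padding */
};

int main(void)
{
	/* Prints 56 and 52 on common Linux targets where pid_t is int. */
	printf("v2 order:  %zu bytes\n", sizeof(struct v2_order));
	printf("reordered: %zu bytes\n", sizeof(struct reordered));
	return 0;
}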
-- Steve
Thanks Steven,

Indeed, this did not take struct alignment into account. I left the
field order intact because I was not sure whether the order matters
for reporting or parsing.

I will fix this and send a v3; thanks again for pointing it out.
Thanks,
Ze
On Mon, Jul 31, 2023 at 11:38 PM Steven Rostedt <[email protected]> wrote:
>
> On Wed, 26 Jul 2023 20:16:16 +0800
> Ze Gao <[email protected]> wrote:
>
> > @@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch,
> > TP_STRUCT__entry(
> > __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > - __field( int, prev_prio )
> > - __field( long, prev_state )
> > + __field( short, prev_prio )
> > + __field( int, prev_state )
> > + __field( char, prev_state_char )
> > __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > - __field( int, next_prio )
> > + __field( short, next_prio )
> > ),
>
> The above adds a bunch of holes. This needs to be reordered to condense
> the event; we don't want to increase it. libtraceevent will handle the
> reordering.
>
> The above produces:
>
> struct {
> char prev_comm[16];
> pid_t prev_pid;
> short prev_prio; <-- 2 character padding
> int prev_state;
> char prev_state_char;
> char next_comm[16]; <-- 3 character padding
> pid_t next_pid;
> short next_prio; <-- 2 character padding
> };
>
> (all events are at least 4-byte aligned, and are a multiple of 4 bytes
> in size, thus that last short for next_prio did nothing)
>
> The above is a total of 56 bytes (note: that is the same as the current
> sched_switch event size).
>
> What the above should be:
>
> TP_STRUCT__entry(
> __field( pid_t, prev_pid )
> __field( pid_t, next_pid )
> __field( short, prev_prio )
> __field( short, next_prio )
> __field( int, prev_state )
> __array( char, prev_comm, TASK_COMM_LEN )
> __array( char, next_comm, TASK_COMM_LEN )
> __field( char, prev_state_char )
> ),
>
>
> Which would be:
>
> struct {
> pid_t prev_pid;
> pid_t next_pid;
> short prev_prio;
> short next_prio;
> int prev_state;
> char prev_comm[16];
> char next_comm[16];
> char prev_state_char; <-- 3 characters of padding
> }
>
> which would be 52 bytes. Saving us 4 bytes per event. Which is a big deal!
>
> -- Steve
>