Report priority and prev_state as 'short' to save some buffer
space. Also reorder the fields with struct alignment in mind
to make the record more compact.
Suggested-by: Steven Rostedt (Google) <[email protected]>
Signed-off-by: Ze Gao <[email protected]>
---
include/trace/events/sched.h | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..43492daefa6f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -187,14 +187,14 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt,
+static inline short __trace_sched_switch_state(bool preempt,
unsigned int prev_state,
struct task_struct *p)
{
unsigned int state;
#ifdef CONFIG_SCHED_DEBUG
- BUG_ON(p != current);
+ WARN_ON_ONCE(p != current);
#endif /* CONFIG_SCHED_DEBUG */
/*
@@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
TP_ARGS(preempt, prev, next, prev_state),
TP_STRUCT__entry(
- __array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
- __field( int, prev_prio )
- __field( long, prev_state )
- __array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
- __field( int, next_prio )
+ __field( short, prev_prio )
+ __field( short, next_prio )
+ __array( char, prev_comm, TASK_COMM_LEN )
+ __array( char, next_comm, TASK_COMM_LEN )
+ __field( short, prev_state )
),
TP_fast_assign(
- memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
- __entry->prev_pid = prev->pid;
- __entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
+ __entry->prev_pid = prev->pid;
+ __entry->next_pid = next->pid;
+ __entry->prev_prio = (short) prev->prio;
+ __entry->next_prio = (short) next->prio;
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
- __entry->next_pid = next->pid;
- __entry->next_prio = next->prio;
+ memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
/* XXX SCHED_DEADLINE */
),
--
2.41.0
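
For reference, a minimal userspace sketch (not part of the patch) that mirrors
the old and new field orders with plain structs and compares their sizes. The
real record is laid out by the TRACE_EVENT() machinery and is preceded by the
common trace_entry header, so the absolute numbers differ, but the saving from
the narrower types and the reordering shows up the same way:

#include <stdio.h>
#include <sys/types.h>

#define TASK_COMM_LEN 16

struct sched_switch_old {		/* field order before this patch */
	char	prev_comm[TASK_COMM_LEN];
	pid_t	prev_pid;
	int	prev_prio;
	long	prev_state;		/* 8 bytes on a 64-bit kernel */
	char	next_comm[TASK_COMM_LEN];
	pid_t	next_pid;
	int	next_prio;
};

struct sched_switch_new {		/* field order after this patch */
	pid_t	prev_pid;
	pid_t	next_pid;
	short	prev_prio;
	short	next_prio;
	char	prev_comm[TASK_COMM_LEN];
	char	next_comm[TASK_COMM_LEN];
	short	prev_state;
};

int main(void)
{
	printf("old payload: %zu bytes\n", sizeof(struct sched_switch_old));
	printf("new payload: %zu bytes\n", sizeof(struct sched_switch_new));
	return 0;
}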
On Thu, 3 Aug 2023 04:33:50 -0400
Ze Gao <[email protected]> wrote:
> Report priority and prev_state as 'short' to save some buffer
> space. Also reorder the fields with struct alignment in mind
> to make the record more compact.
If I were to write this, I would have written:
The prev_state field in the sched_switch event is assigned by
__trace_sched_switch_state(). The largest number that function will return
is TASK_REPORT_MAX which is just 0x100. There's no reason that the
prev_state field is a full 32 bits when it is using just 9 bits max. In
order to save space on the ring buffer, shrink the prev_state to 16 bits
(short).
Also, change the positions of the other fields to accommodate the short
value of prev_state to eliminate any holes that were created in the
structure.
See the difference?
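
To make that bound concrete, here is a tiny standalone check (a userspace
sketch only; the 0x100 value is taken from the explanation above, and the real
TASK_REPORT_MAX definition lives in include/linux/sched.h):

#include <assert.h>
#include <limits.h>

/*
 * Sketch only: TASK_REPORT_MAX is the largest value that
 * __trace_sched_switch_state() reports.
 */
#define TASK_REPORT_MAX	0x100

int main(void)
{
	/* 0x100 needs 9 bits, so a 16-bit field is more than enough. */
	static_assert(TASK_REPORT_MAX <= SHRT_MAX,
		      "prev_state fits comfortably in a 16-bit field");
	return 0;
}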
>
> #ifdef CREATE_TRACE_POINTS
> -static inline long __trace_sched_switch_state(bool preempt,
> +static inline short __trace_sched_switch_state(bool preempt,
> unsigned int prev_state,
> struct task_struct *p)
> {
> unsigned int state;
>
> #ifdef CONFIG_SCHED_DEBUG
> - BUG_ON(p != current);
> + WARN_ON_ONCE(p != current);
> #endif /* CONFIG_SCHED_DEBUG */
The above needs to be a separate patch.
-- Steve
>
> /*
> @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
> TP_ARGS(preempt, prev, next, prev_state),
>
> TP_STRUCT__entry(
> - __array( char, prev_comm, TASK_COMM_LEN )
> __field( pid_t, prev_pid )
> - __field( int, prev_prio )
> - __field( long, prev_state )
> - __array( char, next_comm, TASK_COMM_LEN )
> __field( pid_t, next_pid )
> - __field( int, next_prio )
> + __field( short, prev_prio )
> + __field( short, next_prio )
> + __array( char, prev_comm, TASK_COMM_LEN )
> + __array( char, next_comm, TASK_COMM_LEN )
> + __field( short, prev_state )
> ),
>
> TP_fast_assign(
> - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> - __entry->prev_pid = prev->pid;
> - __entry->prev_prio = prev->prio;
> - __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> + __entry->prev_pid = prev->pid;
> + __entry->next_pid = next->pid;
> + __entry->prev_prio = (short) prev->prio;
> + __entry->next_prio = (short) next->prio;
> memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> - __entry->next_pid = next->pid;
> - __entry->next_prio = next->prio;
> + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> /* XXX SCHED_DEADLINE */
> ),
>
On Thu, Aug 03, 2023 at 04:33:50AM -0400, Ze Gao wrote:
> Report priority and prev_state as 'short' to save some buffer
> space. Also reorder the fields with struct alignment in mind
> to make the record more compact.
>
> Suggested-by: Steven Rostedt (Google) <[email protected]>
> Signed-off-by: Ze Gao <[email protected]>
I am really getting fed up with this. This again doesn't list any
reasons on why this is a sane thing to do.
Please review past discussions and collate the various things mentioned
into this Changelog.
On Thu, Aug 03, 2023 at 05:18:26AM -0400, Steven Rostedt wrote:
> On Thu, 3 Aug 2023 04:33:50 -0400
> Ze Gao <[email protected]> wrote:
>
> > Report priority and prev_state as 'short' to save some buffer
> > space. Also reorder the fields with struct alignment in mind
> > to make the record more compact.
>
> If I were to write this, I would have written:
>
> The prev_state field in the sched_switch event is assigned by
> __trace_sched_switch_state(). The largest number that function will return
> is TASK_REPORT_MAX which is just 0x100. There's no reason that the
> prev_state field is a full 32 bits when it is using just 9 bits max. In
> order to save space on the ring buffer, shrink the prev_state to 16 bits
> (short).
>
> Also, change the positions of the other fields to accommodate the short
> value of prev_state to eliminate any holes that were created in the
> structure.
>
> See the difference?
This also doesn't mention you broke the data format for all trace events
a while back to ensure people are using libtracefs and are thus
confident this won't break things.
On Thu, Aug 3, 2023 at 4:54 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Aug 03, 2023 at 04:33:50AM -0400, Ze Gao wrote:
> > Report priority and prev_state as 'short' to save some buffer
> > space. Also reorder the fields with struct alignment in mind
> > to make the record more compact.
> >
> > Suggested-by: Steven Rostedt (Google) <[email protected]>
> > Signed-off-by: Ze Gao <[email protected]>
>
> I am really getting fed up with this. This again doesn't list any
> reasons on why this is a sane thing to do.
Lesson learned from Steven's reply; I will reorganize the changelog.
Thanks for pointing this out.
Regards,
Ze
On Thu, Aug 3, 2023 at 5:18 PM Steven Rostedt <[email protected]> wrote:
>
> On Thu, 3 Aug 2023 04:33:50 -0400
> Ze Gao <[email protected]> wrote:
>
> > Report priority and prev_state as 'short' to save some buffer
> > space. Also reorder the fields with struct alignment in mind
> > to make the record more compact.
>
> If I were to write this, I would have written:
>
> The prev_state field in the sched_switch event is assigned by
> __trace_sched_switch_state(). The largest number that function will return
> is TASK_REPORT_MAX which is just 0x100. There's no reason that the
> prev_state field is a full 32 bits when it is using just 9 bits max. In
> order to save space on the ring buffer, shrink the prev_state to 16 bits
> (short).
>
> Also, change the positions of the other fields to accommodate the short
> value of prev_state to eliminate any holes that were created in the
> structure.
>
> See the difference?
>
> >
> > #ifdef CREATE_TRACE_POINTS
> > -static inline long __trace_sched_switch_state(bool preempt,
> > +static inline short __trace_sched_switch_state(bool preempt,
> > unsigned int prev_state,
> > struct task_struct *p)
> > {
> > unsigned int state;
> >
> > #ifdef CONFIG_SCHED_DEBUG
> > - BUG_ON(p != current);
> > + WARN_ON_ONCE(p != current);
> > #endif /* CONFIG_SCHED_DEBUG */
>
> The above needs to be a separate patch.
I've moved this to a new patch, and this is the changelog:
sched, tracing: change BUG_ON to WARN_ON_ONCE in __trace_sched_switch_state
The BUG_ON() was introduced back in 2014 and is rather old. Switch it
to WARN_ON_ONCE() so that we do not crash the kernel when the task
being scheduled out is unexpectedly not the current task, as suggested
by Steven.
Signed-off-by: Ze Gao <[email protected]>
Regards,
Ze
> >
> > /*
> > @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
> > TP_ARGS(preempt, prev, next, prev_state),
> >
> > TP_STRUCT__entry(
> > - __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > - __field( int, prev_prio )
> > - __field( long, prev_state )
> > - __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > - __field( int, next_prio )
> > + __field( short, prev_prio )
> > + __field( short, next_prio )
> > + __array( char, prev_comm, TASK_COMM_LEN )
> > + __array( char, next_comm, TASK_COMM_LEN )
> > + __field( short, prev_state )
> > ),
> >
> > TP_fast_assign(
> > - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > - __entry->prev_pid = prev->pid;
> > - __entry->prev_prio = prev->prio;
> > - __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> > + __entry->prev_pid = prev->pid;
> > + __entry->next_pid = next->pid;
> > + __entry->prev_prio = (short) prev->prio;
> > + __entry->next_prio = (short) next->prio;
> > memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> > - __entry->next_pid = next->pid;
> > - __entry->next_prio = next->prio;
> > + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> > /* XXX SCHED_DEADLINE */
> > ),
> >
>
On Thu, Aug 3, 2023 at 5:18 PM Steven Rostedt <[email protected]> wrote:
>
> On Thu, 3 Aug 2023 04:33:50 -0400
> Ze Gao <[email protected]> wrote:
>
> > Report priority and prev_state as 'short' to save some buffer
> > space. Also reorder the fields with struct alignment in mind
> > to make the record more compact.
>
> If I were to write this, I would have written:
>
> The prev_state field in the sched_switch event is assigned by
> __trace_sched_switch_state(). The largest number that function will return
> is TASK_REPORT_MAX which is just 0x100. There's no reason that the
> prev_state field is a full 32 bits when it is using just 9 bits max. In
> order to save space on the ring buffer, shrink the prev_state to 16 bits
> (short).
>
> Also, change the positions of the other fields to accommodate the short
> value of prev_state to eliminate any holes that were created in the
> structure.
Thanks for this elaboration! But I see Peter has comments on this; I'll wait
for those to be resolved and then send my new changelog.
Thanks,
Ze
> >
> > #ifdef CREATE_TRACE_POINTS
> > -static inline long __trace_sched_switch_state(bool preempt,
> > +static inline short __trace_sched_switch_state(bool preempt,
> > unsigned int prev_state,
> > struct task_struct *p)
> > {
> > unsigned int state;
> >
> > #ifdef CONFIG_SCHED_DEBUG
> > - BUG_ON(p != current);
> > + WARN_ON_ONCE(p != current);
> > #endif /* CONFIG_SCHED_DEBUG */
>
> The above needs to be a separate patch.
>
> -- Steve
>
> >
> > /*
> > @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
> > TP_ARGS(preempt, prev, next, prev_state),
> >
> > TP_STRUCT__entry(
> > - __array( char, prev_comm, TASK_COMM_LEN )
> > __field( pid_t, prev_pid )
> > - __field( int, prev_prio )
> > - __field( long, prev_state )
> > - __array( char, next_comm, TASK_COMM_LEN )
> > __field( pid_t, next_pid )
> > - __field( int, next_prio )
> > + __field( short, prev_prio )
> > + __field( short, next_prio )
> > + __array( char, prev_comm, TASK_COMM_LEN )
> > + __array( char, next_comm, TASK_COMM_LEN )
> > + __field( short, prev_state )
> > ),
> >
> > TP_fast_assign(
> > - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > - __entry->prev_pid = prev->pid;
> > - __entry->prev_prio = prev->prio;
> > - __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> > + __entry->prev_pid = prev->pid;
> > + __entry->next_pid = next->pid;
> > + __entry->prev_prio = (short) prev->prio;
> > + __entry->next_prio = (short) next->prio;
> > memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> > - __entry->next_pid = next->pid;
> > - __entry->next_prio = next->prio;
> > + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> > /* XXX SCHED_DEADLINE */
> > ),
> >
>
On Thu, 3 Aug 2023 11:51:32 +0200
Peter Zijlstra <[email protected]> wrote:
> >
> > See the difference?
>
> This also doesn't mention you broke the data format for all trace events
> a while back to ensure people are using libtracefs and are thus
> confident this won't break things.
It was the meta data change that happened a while ago. As other events change
all the time under this restriction (keeping the field names around), I didn't
realize it was required for this changelog. It doesn't hurt to add it.
More details on that:
commit b000c8065 ("tracing: Remove the extra 4 bytes of padding in events")
This was to get rid of the padding that powertop relied on.
Nit: it's libtraceevent, not libtracefs, as libtracefs gets you to the
format files, but it's libtraceevent that parses them.
-- Steve
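
For anyone wondering why resizing and reordering the fields is safe for such
consumers: the record layout is self-describing via the event's format file,
which libtraceevent parses. A rough sketch of reading it, assuming tracefs is
mounted at /sys/kernel/tracing (it may also appear under
/sys/kernel/debug/tracing):

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/tracing/events/sched/sched_switch/format";
	FILE *f = fopen(path, "r");
	char line[512];

	if (!f) {
		perror(path);
		return 1;
	}
	/*
	 * Each field line lists its type, offset and size, so parsers
	 * pick up the new layout automatically.
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}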