2022-01-21 22:24:26

by Valentin Schneider

[permalink] [raw]
Subject: [PATCH v3 1/2] sched/tracing: Don't re-read p->state when emitting sched_switch event

As of commit

c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

p->__state = TASK_INTERRUPTIBLE;
__schedule()
deactivate_task(p);
ttwu()
READ !p->on_rq
p->__state=TASK_WAKING
trace_sched_switch()
__trace_sched_switch_state()
task_state_index()
return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read from __schedule() down the trace
event.

Reported-by: Abhijeet Dharmapurikar <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
---
include/linux/sched.h | 11 ++++++++---
include/trace/events/sched.h | 11 +++++++----
kernel/sched/core.c | 4 ++--
kernel/trace/fgraph.c | 4 +++-
kernel/trace/ftrace.c | 4 +++-
kernel/trace/trace_events.c | 8 ++++++--
kernel/trace/trace_osnoise.c | 4 +++-
kernel/trace/trace_sched_switch.c | 1 +
kernel/trace/trace_sched_wakeup.c | 1 +
9 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2e261adb8ea..d00837d12b9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1616,10 +1616,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)

-static inline unsigned int task_state_index(struct task_struct *tsk)
+static inline unsigned int __task_state_index(unsigned int tsk_state,
+ unsigned int tsk_exit_state)
{
- unsigned int tsk_state = READ_ONCE(tsk->__state);
- unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
+ unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;

BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

@@ -1629,6 +1629,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
return fls(state);
}

+static inline unsigned int task_state_index(struct task_struct *tsk)
+{
+ return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
+}
+
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 94640482cfe7..65e786756321 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p)
{
unsigned int state;

@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
* it for left shift operation to get the correct task->state
* mapping.
*/
- state = task_state_index(p);
+ state = __task_state_index(prev_state, p->exit_state);

return state ? (1 << (state - 1)) : state;
}
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
TRACE_EVENT(sched_switch,

TP_PROTO(bool preempt,
+ unsigned int prev_state,
struct task_struct *prev,
struct task_struct *next),

- TP_ARGS(preempt, prev, next),
+ TP_ARGS(preempt, prev_state, prev, next),

TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev);
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe53e510e711..a8799a2d8546 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4822,7 +4822,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;

/*
* The previous task will have left us with a preempt_count of 2
@@ -6287,7 +6287,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));

- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);

/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)

static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 30bc880c3849..e296ddeec99f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7311,7 +7311,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)

static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4021b9a79f93..6ddc6cc0d5d5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)

static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,

static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 7520d43aed55..a8a2d17f858c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1168,7 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..993b0ed10d8c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);

static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,

static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
--
2.25.1


Subject: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

The following commit has been merged into the sched/core branch of tip:

Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
Author: Valentin Schneider <[email protected]>
AuthorDate: Thu, 20 Jan 2022 16:25:19
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00

sched/tracing: Don't re-read p->state when emitting sched_switch event

As of commit

c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

p->__state = TASK_INTERRUPTIBLE;
__schedule()
deactivate_task(p);
ttwu()
READ !p->on_rq
p->__state=TASK_WAKING
trace_sched_switch()
__trace_sched_switch_state()
task_state_index()
return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read from __schedule() down the trace
event.

Reported-by: Abhijeet Dharmapurikar <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Steven Rostedt (Google) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
include/linux/sched.h | 11 ++++++++---
include/trace/events/sched.h | 11 +++++++----
kernel/sched/core.c | 4 ++--
kernel/trace/fgraph.c | 4 +++-
kernel/trace/ftrace.c | 4 +++-
kernel/trace/trace_events.c | 8 ++++++--
kernel/trace/trace_osnoise.c | 4 +++-
kernel/trace/trace_sched_switch.c | 1 +
kernel/trace/trace_sched_wakeup.c | 1 +
9 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f00132e..457c8a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)

-static inline unsigned int task_state_index(struct task_struct *tsk)
+static inline unsigned int __task_state_index(unsigned int tsk_state,
+ unsigned int tsk_exit_state)
{
- unsigned int tsk_state = READ_ONCE(tsk->__state);
- unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
+ unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;

BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

@@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
return fls(state);
}

+static inline unsigned int task_state_index(struct task_struct *tsk)
+{
+ return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
+}
+
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9464048..65e7867 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p)
{
unsigned int state;

@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
* it for left shift operation to get the correct task->state
* mapping.
*/
- state = task_state_index(p);
+ state = __task_state_index(prev_state, p->exit_state);

return state ? (1 << (state - 1)) : state;
}
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
TRACE_EVENT(sched_switch,

TP_PROTO(bool preempt,
+ unsigned int prev_state,
struct task_struct *prev,
struct task_struct *next),

- TP_ARGS(preempt, prev, next),
+ TP_ARGS(preempt, prev_state, prev, next),

TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev);
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef94612..3aafc15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;

/*
* The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));

- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);

/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d3..19028e0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ free:

static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9feb19..6762ae0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)

static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614..2a19ea7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)

static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,

static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 870a08d..1829b4c 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196..993b0ed 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);

static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de5..46429f9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,

static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;

2022-03-04 20:33:03

by Valentin Schneider

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 01/03/22 15:24, tip-bot2 for Valentin Schneider wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> Author: Valentin Schneider <[email protected]>
> AuthorDate: Thu, 20 Jan 2022 16:25:19
> Committer: Peter Zijlstra <[email protected]>
> CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
>
> sched/tracing: Don't re-read p->state when emitting sched_switch event
>
> As of commit
>
> c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>
> the following sequence becomes possible:
>
> p->__state = TASK_INTERRUPTIBLE;
> __schedule()
> deactivate_task(p);
> ttwu()
> READ !p->on_rq
> p->__state=TASK_WAKING
> trace_sched_switch()
> __trace_sched_switch_state()
> task_state_index()
> return 0;
>
> TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> the trace event.
>
> Prevent this by pushing the value read from __schedule() down the trace
> event.
>
> Reported-by: Abhijeet Dharmapurikar <[email protected]>
> Signed-off-by: Valentin Schneider <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Reviewed-by: Steven Rostedt (Google) <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]

So I wasn't sure if that would be considered a "bug", but folks are asking
for this to be backported; do we want to slap

Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

to it?

2022-03-08 22:26:08

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> +CC stable
>
> On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > The following commit has been merged into the sched/core branch of tip:
> >
> > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > Author: Valentin Schneider <[email protected]>
> > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > Committer: Peter Zijlstra <[email protected]>
> > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> >
> > sched/tracing: Don't re-read p->state when emitting sched_switch event
> >
> > As of commit
> >
> > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> >
> > the following sequence becomes possible:
> >
> > p->__state = TASK_INTERRUPTIBLE;
> > __schedule()
> > deactivate_task(p);
> > ttwu()
> > READ !p->on_rq
> > p->__state=TASK_WAKING
> > trace_sched_switch()
> > __trace_sched_switch_state()
> > task_state_index()
> > return 0;
> >
> > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > the trace event.
> >
> > Prevent this by pushing the value read from __schedule() down the trace
> > event.
> >
> > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > Signed-off-by: Valentin Schneider <[email protected]>
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > Link: https://lore.kernel.org/r/[email protected]
>
> Any objection to picking this for stable? I'm interested in this one for some
> Android users but prefer if it can be taken by stable rather than backport it
> individually.
>
> I think it makes sense to pick the next one in the series too.

What commit does this fix in Linus's tree?

Once it hits Linus's tree, let [email protected] know what the git
commit id is in Linus's tree.

thanks,

greg k-h

2022-03-08 23:48:10

by Qais Yousef

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

+CC stable

On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> Author: Valentin Schneider <[email protected]>
> AuthorDate: Thu, 20 Jan 2022 16:25:19
> Committer: Peter Zijlstra <[email protected]>
> CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
>
> sched/tracing: Don't re-read p->state when emitting sched_switch event
>
> As of commit
>
> c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>
> the following sequence becomes possible:
>
> p->__state = TASK_INTERRUPTIBLE;
> __schedule()
> deactivate_task(p);
> ttwu()
> READ !p->on_rq
> p->__state=TASK_WAKING
> trace_sched_switch()
> __trace_sched_switch_state()
> task_state_index()
> return 0;
>
> TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> the trace event.
>
> Prevent this by pushing the value read from __schedule() down the trace
> event.
>
> Reported-by: Abhijeet Dharmapurikar <[email protected]>
> Signed-off-by: Valentin Schneider <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Reviewed-by: Steven Rostedt (Google) <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]

Any objection to picking this for stable? I'm interested in this one for some
Android users but prefer if it can be taken by stable rather than backport it
individually.

I think it makes sense to pick the next one in the series too.

Thanks!

--
Qais Yousef

2022-03-08 23:48:23

by Qais Yousef

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 03/08/22 19:10, Greg KH wrote:
> On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > +CC stable
> >
> > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > The following commit has been merged into the sched/core branch of tip:
> > >
> > > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > Author: Valentin Schneider <[email protected]>
> > > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > > Committer: Peter Zijlstra <[email protected]>
> > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > >
> > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > >
> > > As of commit
> > >
> > > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > >
> > > the following sequence becomes possible:
> > >
> > > p->__state = TASK_INTERRUPTIBLE;
> > > __schedule()
> > > deactivate_task(p);
> > > ttwu()
> > > READ !p->on_rq
> > > p->__state=TASK_WAKING
> > > trace_sched_switch()
> > > __trace_sched_switch_state()
> > > task_state_index()
> > > return 0;
> > >
> > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > the trace event.
> > >
> > > Prevent this by pushing the value read from __schedule() down the trace
> > > event.
> > >
> > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > Signed-off-by: Valentin Schneider <[email protected]>
> > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > Link: https://lore.kernel.org/r/[email protected]
> >
> > Any objection to picking this for stable? I'm interested in this one for some
> > Android users but prefer if it can be taken by stable rather than backport it
> > individually.
> >
> > I think it makes sense to pick the next one in the series too.
>
> What commit does this fix in Linus's tree?

It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

Not sure about the 2nd patch, but I can try to figure it out.

> Once it hits Linus's tree, let [email protected] know what the git
> commit id is in Linus's tree.

Sure. My bad if I rushed the request. I just wanted to know whether it's okay
for this to go into stable. If no one shouts, I'll ping once it lands in
Linus'.


Thanks!

--
Qais Yousef

2022-04-10 21:51:12

by Qais Yousef

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 03/08/22 18:51, Qais Yousef wrote:
> On 03/08/22 19:10, Greg KH wrote:
> > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > +CC stable
> > >
> > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > The following commit has been merged into the sched/core branch of tip:
> > > >
> > > > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > Author: Valentin Schneider <[email protected]>
> > > > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > > > Committer: Peter Zijlstra <[email protected]>
> > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > >
> > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > >
> > > > As of commit
> > > >
> > > > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > >
> > > > the following sequence becomes possible:
> > > >
> > > > p->__state = TASK_INTERRUPTIBLE;
> > > > __schedule()
> > > > deactivate_task(p);
> > > > ttwu()
> > > > READ !p->on_rq
> > > > p->__state=TASK_WAKING
> > > > trace_sched_switch()
> > > > __trace_sched_switch_state()
> > > > task_state_index()
> > > > return 0;
> > > >
> > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > the trace event.
> > > >
> > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > event.
> > > >
> > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > Link: https://lore.kernel.org/r/[email protected]
> > >
> > > Any objection to picking this for stable? I'm interested in this one for some
> > > Android users but prefer if it can be taken by stable rather than backport it
> > > individually.
> > >
> > > I think it makes sense to pick the next one in the series too.
> >
> > What commit does this fix in Linus's tree?
>
> It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
it up for v5.15+, but it impacts v5.10 too.

Thanks!

--
Qais Yousef

2022-04-12 05:43:35

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On Mon, Apr 11, 2022 at 09:18:19AM +0200, Holger Hoffst?tte wrote:
> On 2022-04-11 01:22, Holger Hoffst?tte wrote:
> > On 2022-04-11 00:06, Qais Yousef wrote:
> > > On 04/10/22 00:38, Qais Yousef wrote:
> > > > On 03/08/22 18:51, Qais Yousef wrote:
> > > > > On 03/08/22 19:10, Greg KH wrote:
> > > > > > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > > > > > +CC stable
> > > > > > >
> > > > > > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > > > > > The following commit has been merged into the sched/core branch of tip:
> > > > > > > >
> > > > > > > > Commit-ID:???? fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > > > Gitweb:??????? https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > > > Author:??????? Valentin Schneider <[email protected]>
> > > > > > > > AuthorDate:??? Thu, 20 Jan 2022 16:25:19
> > > > > > > > Committer:???? Peter Zijlstra <[email protected]>
> > > > > > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > > > > > >
> > > > > > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > > > > >
> > > > > > > > As of commit
> > > > > > > >
> > > > > > > > ?? c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > > > > >
> > > > > > > > the following sequence becomes possible:
> > > > > > > >
> > > > > > > > ????????????? p->__state = TASK_INTERRUPTIBLE;
> > > > > > > > ????????????? __schedule()
> > > > > > > > ??????????? deactivate_task(p);
> > > > > > > > ?? ttwu()
> > > > > > > > ???? READ !p->on_rq
> > > > > > > > ???? p->__state=TASK_WAKING
> > > > > > > > ??????????? trace_sched_switch()
> > > > > > > > ????????????? __trace_sched_switch_state()
> > > > > > > > ??????????????? task_state_index()
> > > > > > > > ????????????????? return 0;
> > > > > > > >
> > > > > > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > > > > > the trace event.
> > > > > > > >
> > > > > > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > > > > > event.
> > > > > > > >
> > > > > > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > > > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > > > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > > > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > > > > > Link: https://lore.kernel.org/r/[email protected]
> > > > > > >
> > > > > > > Any objection to picking this for stable? I'm interested in this one for some
> > > > > > > Android users but prefer if it can be taken by stable rather than backport it
> > > > > > > individually.
> > > > > > >
> > > > > > > I think it makes sense to pick the next one in the series too.
> > > > > >
> > > > > > What commit does this fix in Linus's tree?
> > > > >
> > > > > It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > >
> > > > Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
> > > > it up for v5.15+, but it impacts v5.10 too.
> > >
> > > commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > subject: sched/tracing: Don't re-read p->state when emitting sched_switch event
> > >
> > > This patch has an impact on Android 5.10 users who experience tooling breakage.
> > > Is it possible to include in 5.10 LTS please?
> > >
> > > It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.
> > >
> >
> > https://lore.kernel.org/stable/[email protected]/
> >
> > However, since then further investigation (still in progress) has shown that this
> > may have been the fault of the tool in question, so if you can verify that tracing
> > sched still works for you with this patch in 5.15.x then by all means
> > let's merge it.
>
> So it turns out the lockup is indeed the fault of the tool, which contains multiple
> kernel-version dependent tracepoint definitions and now fails with this
> patch.

What tools is this?

> Greg, please re-enqueue this patch where necessary (5.10, 5.15+)

If I queue it up again, will the tools keep breaking?

thanks,

greg k-hj

2022-04-12 21:01:33

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On Mon, Apr 11, 2022 at 10:05:19AM +0200, Holger Hoffst?tte wrote:
> On 2022-04-11 09:28, Greg KH wrote:
> > On Mon, Apr 11, 2022 at 09:18:19AM +0200, Holger Hoffst?tte wrote:
> > > On 2022-04-11 01:22, Holger Hoffst?tte wrote:
> > > > On 2022-04-11 00:06, Qais Yousef wrote:
> > > > > On 04/10/22 00:38, Qais Yousef wrote:
> > > > > > On 03/08/22 18:51, Qais Yousef wrote:
> > > > > > > On 03/08/22 19:10, Greg KH wrote:
> > > > > > > > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > > > > > > > +CC stable
> > > > > > > > >
> > > > > > > > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > > > > > > > The following commit has been merged into the sched/core branch of tip:
> > > > > > > > > >
> > > > > > > > > > Commit-ID:???? fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > > > > > Gitweb:??????? https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > > > > > Author:??????? Valentin Schneider <[email protected]>
> > > > > > > > > > AuthorDate:??? Thu, 20 Jan 2022 16:25:19
> > > > > > > > > > Committer:???? Peter Zijlstra <[email protected]>
> > > > > > > > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > > > > > > > >
> > > > > > > > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > > > > > > >
> > > > > > > > > > As of commit
> > > > > > > > > >
> > > > > > > > > > ?? c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > > > > > > >
> > > > > > > > > > the following sequence becomes possible:
> > > > > > > > > >
> > > > > > > > > > ????????????? p->__state = TASK_INTERRUPTIBLE;
> > > > > > > > > > ????????????? __schedule()
> > > > > > > > > > ??????????? deactivate_task(p);
> > > > > > > > > > ?? ttwu()
> > > > > > > > > > ???? READ !p->on_rq
> > > > > > > > > > ???? p->__state=TASK_WAKING
> > > > > > > > > > ??????????? trace_sched_switch()
> > > > > > > > > > ????????????? __trace_sched_switch_state()
> > > > > > > > > > ??????????????? task_state_index()
> > > > > > > > > > ????????????????? return 0;
> > > > > > > > > >
> > > > > > > > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > > > > > > > the trace event.
> > > > > > > > > >
> > > > > > > > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > > > > > > > event.
> > > > > > > > > >
> > > > > > > > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > > > > > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > > > > > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > > > > > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > > > > > > > Link: https://lore.kernel.org/r/[email protected]
> > > > > > > > >
> > > > > > > > > Any objection to picking this for stable? I'm interested in this one for some
> > > > > > > > > Android users but prefer if it can be taken by stable rather than backport it
> > > > > > > > > individually.
> > > > > > > > >
> > > > > > > > > I think it makes sense to pick the next one in the series too.
> > > > > > > >
> > > > > > > > What commit does this fix in Linus's tree?
> > > > > > >
> > > > > > > It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > > >
> > > > > > Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
> > > > > > it up for v5.15+, but it impacts v5.10 too.
> > > > >
> > > > > commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > subject: sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > >
> > > > > This patch has an impact on Android 5.10 users who experience tooling breakage.
> > > > > Is it possible to include in 5.10 LTS please?
> > > > >
> > > > > It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.
> > > > >
> > > >
> > > > https://lore.kernel.org/stable/[email protected]/
> > > >
> > > > However, since then further investigation (still in progress) has shown that this
> > > > may have been the fault of the tool in question, so if you can verify that tracing
> > > > sched still works for you with this patch in 5.15.x then by all means
> > > > let's merge it.
> > >
> > > So it turns out the lockup is indeed the fault of the tool, which contains multiple
> > > kernel-version dependent tracepoint definitions and now fails with this
> > > patch.
> >
> > What tools is this?
>
> sysdig - which uses a helper kernel module which accesses tracepoints, but of course
> (as I just found) with copypasta'd TP definitions, which broke with this patch due to
> the additional parameter in the function signature. It's been prone to breakage forever
> because of a lack of a stable kernel ABI.
>
> Took me a while to find/figure out, but IMHO better safe than sorry. We've had
> autoselected scheduler patches before that looked fine but really were not.

Thanks for the info, sysdig does do some pretty intrusive things but I
thought it had moved over to being almost all bpf code.

Anyway, I'll wait for Qais to submit a properly backported and tested
set of patches before picking this up again due to all of the problems
I've had with this patch.

thanks,

greg k-h

2022-04-12 21:19:56

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On Sun, Apr 10, 2022 at 12:38:29AM +0100, Qais Yousef wrote:
> On 03/08/22 18:51, Qais Yousef wrote:
> > On 03/08/22 19:10, Greg KH wrote:
> > > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > > +CC stable
> > > >
> > > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > > The following commit has been merged into the sched/core branch of tip:
> > > > >
> > > > > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > Author: Valentin Schneider <[email protected]>
> > > > > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > > > > Committer: Peter Zijlstra <[email protected]>
> > > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > > >
> > > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > >
> > > > > As of commit
> > > > >
> > > > > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > >
> > > > > the following sequence becomes possible:
> > > > >
> > > > > p->__state = TASK_INTERRUPTIBLE;
> > > > > __schedule()
> > > > > deactivate_task(p);
> > > > > ttwu()
> > > > > READ !p->on_rq
> > > > > p->__state=TASK_WAKING
> > > > > trace_sched_switch()
> > > > > __trace_sched_switch_state()
> > > > > task_state_index()
> > > > > return 0;
> > > > >
> > > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > > the trace event.
> > > > >
> > > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > > event.
> > > > >
> > > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > > Link: https://lore.kernel.org/r/[email protected]
> > > >
> > > > Any objection to picking this for stable? I'm interested in this one for some
> > > > Android users but prefer if it can be taken by stable rather than backport it
> > > > individually.
> > > >
> > > > I think it makes sense to pick the next one in the series too.
> > >
> > > What commit does this fix in Linus's tree?
> >
> > It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>
> Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
> it up for v5.15+, but it impacts v5.10 too.

It does not apply to 5.10 at all, how did you test this?

{sigh}

Again, if you want this applied to any stable trees, please test that it
works and send the properly backported patches.

thanks,

greg k-h

2022-04-12 21:31:30

by Holger Hoffstätte

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 2022-04-11 01:22, Holger Hoffstätte wrote:
> On 2022-04-11 00:06, Qais Yousef wrote:
>> On 04/10/22 00:38, Qais Yousef wrote:
>>> On 03/08/22 18:51, Qais Yousef wrote:
>>>> On 03/08/22 19:10, Greg KH wrote:
>>>>> On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
>>>>>> +CC stable
>>>>>>
>>>>>> On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
>>>>>>> The following commit has been merged into the sched/core branch of tip:
>>>>>>>
>>>>>>> Commit-ID:     fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>>> Gitweb:        https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>>> Author:        Valentin Schneider <[email protected]>
>>>>>>> AuthorDate:    Thu, 20 Jan 2022 16:25:19
>>>>>>> Committer:     Peter Zijlstra <[email protected]>
>>>>>>> CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
>>>>>>>
>>>>>>> sched/tracing: Don't re-read p->state when emitting sched_switch event
>>>>>>>
>>>>>>> As of commit
>>>>>>>
>>>>>>>    c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>>>>>>
>>>>>>> the following sequence becomes possible:
>>>>>>>
>>>>>>>               p->__state = TASK_INTERRUPTIBLE;
>>>>>>>               __schedule()
>>>>>>>             deactivate_task(p);
>>>>>>>    ttwu()
>>>>>>>      READ !p->on_rq
>>>>>>>      p->__state=TASK_WAKING
>>>>>>>             trace_sched_switch()
>>>>>>>               __trace_sched_switch_state()
>>>>>>>                 task_state_index()
>>>>>>>                   return 0;
>>>>>>>
>>>>>>> TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
>>>>>>> the trace event.
>>>>>>>
>>>>>>> Prevent this by pushing the value read from __schedule() down the trace
>>>>>>> event.
>>>>>>>
>>>>>>> Reported-by: Abhijeet Dharmapurikar <[email protected]>
>>>>>>> Signed-off-by: Valentin Schneider <[email protected]>
>>>>>>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>>>>>>> Reviewed-by: Steven Rostedt (Google) <[email protected]>
>>>>>>> Link: https://lore.kernel.org/r/[email protected]
>>>>>>
>>>>>> Any objection to picking this for stable? I'm interested in this one for some
>>>>>> Android users but prefer if it can be taken by stable rather than backport it
>>>>>> individually.
>>>>>>
>>>>>> I think it makes sense to pick the next one in the series too.
>>>>>
>>>>> What commit does this fix in Linus's tree?
>>>>
>>>> It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>>
>>> Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
>>> it up for v5.15+, but it impacts v5.10 too.
>>
>> commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>> subject: sched/tracing: Don't re-read p->state when emitting sched_switch event
>>
>> This patch has an impact on Android 5.10 users who experience tooling breakage.
>> Is it possible to include in 5.10 LTS please?
>>
>> It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.
>>
>
> https://lore.kernel.org/stable/[email protected]/
>
> However, since then further investigation (still in progress) has shown that this
> may have been the fault of the tool in question, so if you can verify that tracing
> sched still works for you with this patch in 5.15.x then by all means
> let's merge it.

So it turns out the lockup is indeed the fault of the tool, which contains multiple
kernel-version dependent tracepoint definitions and now fails with this
patch.

Greg, please re-enqueue this patch where necessary (5.10, 5.15+)

-h

2022-04-12 22:28:28

by Holger Hoffstätte

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 2022-04-11 00:06, Qais Yousef wrote:
> On 04/10/22 00:38, Qais Yousef wrote:
>> On 03/08/22 18:51, Qais Yousef wrote:
>>> On 03/08/22 19:10, Greg KH wrote:
>>>> On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
>>>>> +CC stable
>>>>>
>>>>> On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
>>>>>> The following commit has been merged into the sched/core branch of tip:
>>>>>>
>>>>>> Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>> Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>> Author: Valentin Schneider <[email protected]>
>>>>>> AuthorDate: Thu, 20 Jan 2022 16:25:19
>>>>>> Committer: Peter Zijlstra <[email protected]>
>>>>>> CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
>>>>>>
>>>>>> sched/tracing: Don't re-read p->state when emitting sched_switch event
>>>>>>
>>>>>> As of commit
>>>>>>
>>>>>> c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>>>>>
>>>>>> the following sequence becomes possible:
>>>>>>
>>>>>> p->__state = TASK_INTERRUPTIBLE;
>>>>>> __schedule()
>>>>>> deactivate_task(p);
>>>>>> ttwu()
>>>>>> READ !p->on_rq
>>>>>> p->__state=TASK_WAKING
>>>>>> trace_sched_switch()
>>>>>> __trace_sched_switch_state()
>>>>>> task_state_index()
>>>>>> return 0;
>>>>>>
>>>>>> TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
>>>>>> the trace event.
>>>>>>
>>>>>> Prevent this by pushing the value read from __schedule() down the trace
>>>>>> event.
>>>>>>
>>>>>> Reported-by: Abhijeet Dharmapurikar <[email protected]>
>>>>>> Signed-off-by: Valentin Schneider <[email protected]>
>>>>>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>>>>>> Reviewed-by: Steven Rostedt (Google) <[email protected]>
>>>>>> Link: https://lore.kernel.org/r/[email protected]
>>>>>
>>>>> Any objection to picking this for stable? I'm interested in this one for some
>>>>> Android users but prefer if it can be taken by stable rather than backport it
>>>>> individually.
>>>>>
>>>>> I think it makes sense to pick the next one in the series too.
>>>>
>>>> What commit does this fix in Linus's tree?
>>>
>>> It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>
>> Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
>> it up for v5.15+, but it impacts v5.10 too.
>
> commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> subject: sched/tracing: Don't re-read p->state when emitting sched_switch event
>
> This patch has an impact on Android 5.10 users who experience tooling breakage.
> Is it possible to include in 5.10 LTS please?
>
> It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.
>

https://lore.kernel.org/stable/[email protected]/

However, since then further investigation (still in progress) has shown that this
may have been the fault of the tool in question, so if you can verify that tracing
sched still works for you with this patch in 5.15.x then by all means
let's merge it.

-h

2022-04-12 22:30:32

by Qais Yousef

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 04/10/22 00:38, Qais Yousef wrote:
> On 03/08/22 18:51, Qais Yousef wrote:
> > On 03/08/22 19:10, Greg KH wrote:
> > > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > > +CC stable
> > > >
> > > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > > The following commit has been merged into the sched/core branch of tip:
> > > > >
> > > > > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > Author: Valentin Schneider <[email protected]>
> > > > > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > > > > Committer: Peter Zijlstra <[email protected]>
> > > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > > >
> > > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > >
> > > > > As of commit
> > > > >
> > > > > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > >
> > > > > the following sequence becomes possible:
> > > > >
> > > > > p->__state = TASK_INTERRUPTIBLE;
> > > > > __schedule()
> > > > > deactivate_task(p);
> > > > > ttwu()
> > > > > READ !p->on_rq
> > > > > p->__state=TASK_WAKING
> > > > > trace_sched_switch()
> > > > > __trace_sched_switch_state()
> > > > > task_state_index()
> > > > > return 0;
> > > > >
> > > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > > the trace event.
> > > > >
> > > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > > event.
> > > > >
> > > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > > Link: https://lore.kernel.org/r/[email protected]
> > > >
> > > > Any objection to picking this for stable? I'm interested in this one for some
> > > > Android users but prefer if it can be taken by stable rather than backport it
> > > > individually.
> > > >
> > > > I think it makes sense to pick the next one in the series too.
> > >
> > > What commit does this fix in Linus's tree?
> >
> > It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>
> Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
> it up for v5.15+, but it impacts v5.10 too.

commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
subject: sched/tracing: Don't re-read p->state when emitting sched_switch event

This patch has an impact on Android 5.10 users who experience tooling breakage.
Is it possible to include in 5.10 LTS please?

It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.

Thanks

--
Qais Yousef

2022-04-12 22:55:35

by Holger Hoffstätte

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 2022-04-11 09:28, Greg KH wrote:
> On Mon, Apr 11, 2022 at 09:18:19AM +0200, Holger Hoffstätte wrote:
>> On 2022-04-11 01:22, Holger Hoffstätte wrote:
>>> On 2022-04-11 00:06, Qais Yousef wrote:
>>>> On 04/10/22 00:38, Qais Yousef wrote:
>>>>> On 03/08/22 18:51, Qais Yousef wrote:
>>>>>> On 03/08/22 19:10, Greg KH wrote:
>>>>>>> On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
>>>>>>>> +CC stable
>>>>>>>>
>>>>>>>> On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
>>>>>>>>> The following commit has been merged into the sched/core branch of tip:
>>>>>>>>>
>>>>>>>>> Commit-ID:     fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>>>>> Gitweb:        https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>>>>>>> Author:        Valentin Schneider <[email protected]>
>>>>>>>>> AuthorDate:    Thu, 20 Jan 2022 16:25:19
>>>>>>>>> Committer:     Peter Zijlstra <[email protected]>
>>>>>>>>> CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
>>>>>>>>>
>>>>>>>>> sched/tracing: Don't re-read p->state when emitting sched_switch event
>>>>>>>>>
>>>>>>>>> As of commit
>>>>>>>>>
>>>>>>>>>    c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>>>>>>>>
>>>>>>>>> the following sequence becomes possible:
>>>>>>>>>
>>>>>>>>>               p->__state = TASK_INTERRUPTIBLE;
>>>>>>>>>               __schedule()
>>>>>>>>>             deactivate_task(p);
>>>>>>>>>    ttwu()
>>>>>>>>>      READ !p->on_rq
>>>>>>>>>      p->__state=TASK_WAKING
>>>>>>>>>             trace_sched_switch()
>>>>>>>>>               __trace_sched_switch_state()
>>>>>>>>>                 task_state_index()
>>>>>>>>>                   return 0;
>>>>>>>>>
>>>>>>>>> TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
>>>>>>>>> the trace event.
>>>>>>>>>
>>>>>>>>> Prevent this by pushing the value read from __schedule() down the trace
>>>>>>>>> event.
>>>>>>>>>
>>>>>>>>> Reported-by: Abhijeet Dharmapurikar <[email protected]>
>>>>>>>>> Signed-off-by: Valentin Schneider <[email protected]>
>>>>>>>>> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>>>>>>>>> Reviewed-by: Steven Rostedt (Google) <[email protected]>
>>>>>>>>> Link: https://lore.kernel.org/r/[email protected]
>>>>>>>>
>>>>>>>> Any objection to picking this for stable? I'm interested in this one for some
>>>>>>>> Android users but prefer if it can be taken by stable rather than backport it
>>>>>>>> individually.
>>>>>>>>
>>>>>>>> I think it makes sense to pick the next one in the series too.
>>>>>>>
>>>>>>> What commit does this fix in Linus's tree?
>>>>>>
>>>>>> It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
>>>>>
>>>>> Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
>>>>> it up for v5.15+, but it impacts v5.10 too.
>>>>
>>>> commit: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
>>>> subject: sched/tracing: Don't re-read p->state when emitting sched_switch event
>>>>
>>>> This patch has an impact on Android 5.10 users who experience tooling breakage.
>>>> Is it possible to include in 5.10 LTS please?
>>>>
>>>> It was already picked up for 5.15+ by AUTOSEL and only 5.10 is missing.
>>>>
>>>
>>> https://lore.kernel.org/stable/[email protected]/
>>>
>>> However, since then further investigation (still in progress) has shown that this
>>> may have been the fault of the tool in question, so if you can verify that tracing
>>> sched still works for you with this patch in 5.15.x then by all means
>>> let's merge it.
>>
>> So it turns out the lockup is indeed the fault of the tool, which contains multiple
>> kernel-version dependent tracepoint definitions and now fails with this
>> patch.
>
> What tools is this?

sysdig - which uses a helper kernel module which accesses tracepoints, but of course
(as I just found) with copypasta'd TP definitions, which broke with this patch due to
the additional parameter in the function signature. It's been prone to breakage forever
because of a lack of a stable kernel ABI.

Took me a while to find/figure out, but IMHO better safe than sorry. We've had
autoselected scheduler patches before that looked fine but really were not.

>
>> Greg, please re-enqueue this patch where necessary (5.10, 5.15+)
>
> If I queue it up again, will the tools keep breaking?

Yes, but that's their problem with an out-of-tree module; a few more #ifdefs
are not going to make a big difference.

thanks
Holger

2022-04-12 23:22:25

by Qais Yousef

[permalink] [raw]
Subject: Re: [tip: sched/core] sched/tracing: Don't re-read p->state when emitting sched_switch event

On 04/11/22 15:22, Greg KH wrote:
> On Sun, Apr 10, 2022 at 12:38:29AM +0100, Qais Yousef wrote:
> > On 03/08/22 18:51, Qais Yousef wrote:
> > > On 03/08/22 19:10, Greg KH wrote:
> > > > On Tue, Mar 08, 2022 at 06:02:40PM +0000, Qais Yousef wrote:
> > > > > +CC stable
> > > > >
> > > > > On 03/01/22 15:24, tip-bot2 for Valentin Schneider wrote:
> > > > > > The following commit has been merged into the sched/core branch of tip:
> > > > > >
> > > > > > Commit-ID: fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > Gitweb: https://git.kernel.org/tip/fa2c3254d7cfff5f7a916ab928a562d1165f17bb
> > > > > > Author: Valentin Schneider <[email protected]>
> > > > > > AuthorDate: Thu, 20 Jan 2022 16:25:19
> > > > > > Committer: Peter Zijlstra <[email protected]>
> > > > > > CommitterDate: Tue, 01 Mar 2022 16:18:39 +01:00
> > > > > >
> > > > > > sched/tracing: Don't re-read p->state when emitting sched_switch event
> > > > > >
> > > > > > As of commit
> > > > > >
> > > > > > c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> > > > > >
> > > > > > the following sequence becomes possible:
> > > > > >
> > > > > > p->__state = TASK_INTERRUPTIBLE;
> > > > > > __schedule()
> > > > > > deactivate_task(p);
> > > > > > ttwu()
> > > > > > READ !p->on_rq
> > > > > > p->__state=TASK_WAKING
> > > > > > trace_sched_switch()
> > > > > > __trace_sched_switch_state()
> > > > > > task_state_index()
> > > > > > return 0;
> > > > > >
> > > > > > TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
> > > > > > the trace event.
> > > > > >
> > > > > > Prevent this by pushing the value read from __schedule() down the trace
> > > > > > event.
> > > > > >
> > > > > > Reported-by: Abhijeet Dharmapurikar <[email protected]>
> > > > > > Signed-off-by: Valentin Schneider <[email protected]>
> > > > > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > > > > Reviewed-by: Steven Rostedt (Google) <[email protected]>
> > > > > > Link: https://lore.kernel.org/r/[email protected]
> > > > >
> > > > > Any objection to picking this for stable? I'm interested in this one for some
> > > > > Android users but prefer if it can be taken by stable rather than backport it
> > > > > individually.
> > > > >
> > > > > I think it makes sense to pick the next one in the series too.
> > > >
> > > > What commit does this fix in Linus's tree?
> > >
> > > It should be this one: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
> >
> > Should this be okay to be picked up by stable now? I can see AUTOSEL has picked
> > it up for v5.15+, but it impacts v5.10 too.
>
> It does not apply to 5.10 at all, how did you test this?
>
> {sigh}
>
> Again, if you want this applied to any stable trees, please test that it
> works and send the properly backported patches.

Sorry. I'm picking up pieces after a colleague left and was under the
impression that 5.10 Android users already use as-is, but due to GKI just need
it formally in stable. Few things got lost in translation, I am very sorry!

Let me sort all of this out properly including the testing part for 5.15 too.

Thanks!

--
Qais Yousef