2022-05-06 06:34:27

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

From: Peter Zijlstra <[email protected]>

Currently ptrace_stop() / do_signal_stop() rely on the special states
TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
state exists only in task->__state and nowhere else.

There's two spots of bother with this:

- PREEMPT_RT has task->saved_state which complicates matters,
meaning task_is_{traced,stopped}() needs to check an additional
variable.

- An alternative freezer implementation that itself relies on a
special TASK state would loose TASK_TRACED/TASK_STOPPED and will
result in misbehaviour.

As such, add additional state to task->jobctl to track this state
outside of task->__state.

NOTE: this doesn't actually fix anything yet, just adds extra state.

--EWB
* didn't add a unnecessary newline in signal.h
* Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
instead of in signal_wake_up_state. This prevents the clearing
of TASK_STOPPED and TASK_TRACED from getting lost.
* Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/sched.h | 8 +++-----
include/linux/sched/jobctl.h | 6 ++++++
include/linux/sched/signal.h | 19 +++++++++++++++----
kernel/ptrace.c | 16 +++++++++++++---
kernel/signal.c | 10 ++++++++--
5 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 610f2fdb1e2c..cbe5c899599c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;

#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)

-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
+#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
* Special states are those that do not use the normal wait-loop pattern. See
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index d556c3425963..68876d0a7ef9 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -21,6 +21,9 @@ struct task_struct;
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */

+#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */
+#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */
+
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
@@ -31,6 +34,9 @@ struct task_struct;
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT)

+#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
+
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e66948abbee4..07ba3404fcde 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void)
static inline void kernel_signal_stop(void)
{
spin_lock_irq(&current->sighand->siglock);
- if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+ if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
+ }
spin_unlock_irq(&current->sighand->siglock);

schedule();
@@ -437,12 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
- fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN);
- signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0);
+ unsigned int state = 0;
+ if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) {
+ t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+ state = TASK_WAKEKILL | __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
- signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+ unsigned int state = 0;
+ if (resume) {
+ t->jobctl &= ~JOBCTL_TRACED;
+ state = __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}

void task_join_group_stop(struct task_struct *task);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 36a5b7a00d2f..328a34a99124 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
return true;
}

-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -216,8 +221,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
*/
if (lock_task_sighand(task, &flags)) {
task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
- if (__fatal_signal_pending(task))
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~TASK_TRACED;
wake_up_state(task, __TASK_TRACED);
+ }
unlock_task_sighand(task, &flags);
}
}
@@ -462,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request,
* in and out of STOPPED are protected by siglock.
*/
if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
signal_wake_up_state(task, __TASK_STOPPED);
+ }

spin_unlock(&task->sighand->siglock);

@@ -875,6 +884,7 @@ static int ptrace_resume(struct task_struct *child, long request,
*/
spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
spin_unlock_irq(&child->sighand->siglock);

diff --git a/kernel/signal.c b/kernel/signal.c
index a58b68a2d3c6..e782c2611b64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -762,7 +762,10 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info)
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+ lockdep_assert_held(&t->sighand->siglock);
+
set_tsk_thread_flag(t, TIF_SIGPENDING);
+
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
@@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
for_each_thread(p, t) {
flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- if (likely(!(t->ptrace & PT_SEIZED)))
+ if (likely(!(t->ptrace & PT_SEIZED))) {
+ t->jobctl &= ~JOBCTL_STOPPED;
wake_up_state(t, __TASK_STOPPED);
- else
+ } else
ptrace_trap_notify(t);
}

@@ -2218,6 +2222,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
return exit_code;

set_special_state(TASK_TRACED);
+ current->jobctl |= JOBCTL_TRACED;

/*
* We're committing to trapping. TRACED should be visible before
@@ -2436,6 +2441,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;

+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);

--
2.35.3



2022-06-21 13:17:36

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Thu, May 05, 2022 at 01:26:45PM -0500, Eric W. Biederman wrote:
> From: Peter Zijlstra <[email protected]>
>
> Currently ptrace_stop() / do_signal_stop() rely on the special states
> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
> state exists only in task->__state and nowhere else.
>
> There's two spots of bother with this:
>
> - PREEMPT_RT has task->saved_state which complicates matters,
> meaning task_is_{traced,stopped}() needs to check an additional
> variable.
>
> - An alternative freezer implementation that itself relies on a
> special TASK state would loose TASK_TRACED/TASK_STOPPED and will
> result in misbehaviour.
>
> As such, add additional state to task->jobctl to track this state
> outside of task->__state.
>
> NOTE: this doesn't actually fix anything yet, just adds extra state.
>
> --EWB
> * didn't add a unnecessary newline in signal.h
> * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
> instead of in signal_wake_up_state. This prevents the clearing
> of TASK_STOPPED and TASK_TRACED from getting lost.
> * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared

Hi Eric, Peter,

On s390 this patch triggers warning at kernel/ptrace.c:272 when
kill_child testcase from strace tool is repeatedly used (the source
is attached for reference):

while :; do
strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child
done

It normally takes few minutes to cause the warning in -rc3, but FWIW
it hits almost immediately for ptrace_stop-cleanup-for-v5.19 tag of
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.

Commit 7b0fe1367ef2 ("ptrace: Document that wait_task_inactive can't
fail") suggests this WARN_ON_ONCE() is not really expected, yet we
observe a child in __TASK_TRACED state. Could you please comment here?

Thanks!


Attachments:
(No filename) (1.88 kB)
kill_child.c (1.34 kB)
Download all attachments

2022-06-21 14:13:43

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Alexander Gordeev <[email protected]> writes:

> On Thu, May 05, 2022 at 01:26:45PM -0500, Eric W. Biederman wrote:
>> From: Peter Zijlstra <[email protected]>
>>
>> Currently ptrace_stop() / do_signal_stop() rely on the special states
>> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
>> state exists only in task->__state and nowhere else.
>>
>> There's two spots of bother with this:
>>
>> - PREEMPT_RT has task->saved_state which complicates matters,
>> meaning task_is_{traced,stopped}() needs to check an additional
>> variable.
>>
>> - An alternative freezer implementation that itself relies on a
>> special TASK state would loose TASK_TRACED/TASK_STOPPED and will
>> result in misbehaviour.
>>
>> As such, add additional state to task->jobctl to track this state
>> outside of task->__state.
>>
>> NOTE: this doesn't actually fix anything yet, just adds extra state.
>>
>> --EWB
>> * didn't add a unnecessary newline in signal.h
>> * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
>> instead of in signal_wake_up_state. This prevents the clearing
>> of TASK_STOPPED and TASK_TRACED from getting lost.
>> * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared
>
> Hi Eric, Peter,
>
> On s390 this patch triggers warning at kernel/ptrace.c:272 when
> kill_child testcase from strace tool is repeatedly used (the source
> is attached for reference):
>
> while :; do
> strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child
> done
>
> It normally takes few minutes to cause the warning in -rc3, but FWIW
> it hits almost immediately for ptrace_stop-cleanup-for-v5.19 tag of
> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.
>
> Commit 7b0fe1367ef2 ("ptrace: Document that wait_task_inactive can't
> fail") suggests this WARN_ON_ONCE() is not really expected, yet we
> observe a child in __TASK_TRACED state. Could you please comment here?
>

For clarity the warning is that the child is not in __TASK_TRACED state.

The code is waiting for the code to stop in the scheduler in the
__TASK_TRACED state so that it can safely read and change the
processes state. Some of that state is not even saved until the
process is scheduled out so we have to wait until the process
is stopped in the scheduler.

At least on s390 it looks like there is a race between SIGKILL and
ptrace_check_attach. That isn't good.

Reading the code below there is something missing because I don't see
anything making ptrace calls, and ptrace_check_attach (which contains
the warning) only happens in the ptrace syscall.

Eric



> Thanks!
>
> /*
> * Check for the corner case that previously lead to segfault
> * due to an attempt to access unitialised tcp->s_ent.
> *
> * 13994 ????( <unfinished ...>
> * ...
> * 13994 <... ???? resumed>) = ?
> *
> * Copyright (c) 2019 The strace developers.
> * All rights reserved.
> *
> * SPDX-License-Identifier: GPL-2.0-or-later
> */
>
> #include "tests.h"
>
> #include <sched.h>
> #include <signal.h>
> #include <unistd.h>
> #include <sys/mman.h>
> #include <sys/wait.h>
>
> #define ITERS 10000
> #define SC_ITERS 10000
>
> int
> main(void)
> {
> volatile sig_atomic_t *const mem =
> mmap(NULL, get_page_size(), PROT_READ | PROT_WRITE,
> MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> if (mem == MAP_FAILED)
> perror_msg_and_fail("mmap");
>
> for (unsigned int i = 0; i < ITERS; ++i) {
> mem[0] = mem[1] = 0;
>
> const pid_t pid = fork();
> if (pid < 0)
> perror_msg_and_fail("fork");
>
> if (!pid) {
> /* wait for the parent */
> while (!mem[0])
> ;
> /* let the parent know we are running */
> mem[1] = 1;
>
> for (unsigned int j = 0; j < SC_ITERS; j++)
> sched_yield();
>
> pause();
> return 0;
> }
>
> /* let the child know we are running */
> mem[0] = 1;
> /* wait for the child */
> while (!mem[1])
> ;
>
> if (kill(pid, SIGKILL))
> perror_msg_and_fail("kill");
> if (wait(NULL) != pid)
> perror_msg_and_fail("wait");
> }
>
> return 0;
> }

2022-06-21 15:26:11

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jun 21, 2022 at 09:02:05AM -0500, Eric W. Biederman wrote:
> Alexander Gordeev <[email protected]> writes:
>
> > On Thu, May 05, 2022 at 01:26:45PM -0500, Eric W. Biederman wrote:
> >> From: Peter Zijlstra <[email protected]>
> >>
> >> Currently ptrace_stop() / do_signal_stop() rely on the special states
> >> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
> >> state exists only in task->__state and nowhere else.
> >>
> >> There's two spots of bother with this:
> >>
> >> - PREEMPT_RT has task->saved_state which complicates matters,
> >> meaning task_is_{traced,stopped}() needs to check an additional
> >> variable.
> >>
> >> - An alternative freezer implementation that itself relies on a
> >> special TASK state would loose TASK_TRACED/TASK_STOPPED and will
> >> result in misbehaviour.
> >>
> >> As such, add additional state to task->jobctl to track this state
> >> outside of task->__state.
> >>
> >> NOTE: this doesn't actually fix anything yet, just adds extra state.
> >>
> >> --EWB
> >> * didn't add a unnecessary newline in signal.h
> >> * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
> >> instead of in signal_wake_up_state. This prevents the clearing
> >> of TASK_STOPPED and TASK_TRACED from getting lost.
> >> * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared
> >
> > Hi Eric, Peter,
> >
> > On s390 this patch triggers warning at kernel/ptrace.c:272 when
> > kill_child testcase from strace tool is repeatedly used (the source
> > is attached for reference):
> >
> > while :; do
> > strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child
> > done
> >
> > It normally takes few minutes to cause the warning in -rc3, but FWIW
> > it hits almost immediately for ptrace_stop-cleanup-for-v5.19 tag of
> > git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.
> >
> > Commit 7b0fe1367ef2 ("ptrace: Document that wait_task_inactive can't
> > fail") suggests this WARN_ON_ONCE() is not really expected, yet we
> > observe a child in __TASK_TRACED state. Could you please comment here?
> >
>
> For clarity the warning is that the child is not in __TASK_TRACED state.
>
> The code is waiting for the code to stop in the scheduler in the
> __TASK_TRACED state so that it can safely read and change the
> processes state. Some of that state is not even saved until the
> process is scheduled out so we have to wait until the process
> is stopped in the scheduler.

So I assume (checked actually) the return 0 below from kernel/sched/core.c:
wait_task_inactive() is where it bails out:

3303 while (task_running(rq, p)) {
3304 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3305 return 0;
3306 cpu_relax();
3307 }

Yet, the child task is always found in __TASK_TRACED state (as seen
in crash dumps):

> 101447 11342 13 ce3a8100 RU 0.0 10040 4412 strace
101450 101447 0 bb04b200 TR 0.0 2272 1136 kill_child
108261 101447 2 d0b10100 TR 0.0 2272 532 kill_child
crash> task bb04b200 __state
PID: 101450 TASK: bb04b200 CPU: 0 COMMAND: "kill_child"
__state = 8,

crash> task d0b10100 __state
PID: 108261 TASK: d0b10100 CPU: 2 COMMAND: "kill_child"
__state = 8,

> At least on s390 it looks like there is a race between SIGKILL and
> ptrace_check_attach. That isn't good.
>
> Reading the code below there is something missing because I don't see
> anything making ptrace calls, and ptrace_check_attach (which contains
> the warning) only happens in the ptrace syscall.

That is what I believe strace does when calling that code:

strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child

> Eric
>
>
>
> > Thanks!
> >
> > /*
> > * Check for the corner case that previously lead to segfault
> > * due to an attempt to access unitialised tcp->s_ent.
> > *
> > * 13994 ????( <unfinished ...>
> > * ...
> > * 13994 <... ???? resumed>) = ?
> > *
> > * Copyright (c) 2019 The strace developers.
> > * All rights reserved.
> > *
> > * SPDX-License-Identifier: GPL-2.0-or-later
> > */
> >
> > #include "tests.h"
> >
> > #include <sched.h>
> > #include <signal.h>
> > #include <unistd.h>
> > #include <sys/mman.h>
> > #include <sys/wait.h>
> >
> > #define ITERS 10000
> > #define SC_ITERS 10000
> >
> > int
> > main(void)
> > {
> > volatile sig_atomic_t *const mem =
> > mmap(NULL, get_page_size(), PROT_READ | PROT_WRITE,
> > MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> > if (mem == MAP_FAILED)
> > perror_msg_and_fail("mmap");
> >
> > for (unsigned int i = 0; i < ITERS; ++i) {
> > mem[0] = mem[1] = 0;
> >
> > const pid_t pid = fork();
> > if (pid < 0)
> > perror_msg_and_fail("fork");
> >
> > if (!pid) {
> > /* wait for the parent */
> > while (!mem[0])
> > ;
> > /* let the parent know we are running */
> > mem[1] = 1;
> >
> > for (unsigned int j = 0; j < SC_ITERS; j++)
> > sched_yield();
> >
> > pause();
> > return 0;
> > }
> >
> > /* let the child know we are running */
> > mem[0] = 1;
> > /* wait for the child */
> > while (!mem[1])
> > ;
> >
> > if (kill(pid, SIGKILL))
> > perror_msg_and_fail("kill");
> > if (wait(NULL) != pid)
> > perror_msg_and_fail("wait");
> > }
> >
> > return 0;
> > }

2022-06-21 18:26:20

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Alexander Gordeev <[email protected]> writes:

> On Tue, Jun 21, 2022 at 09:02:05AM -0500, Eric W. Biederman wrote:
>> Alexander Gordeev <[email protected]> writes:
>>
>> > On Thu, May 05, 2022 at 01:26:45PM -0500, Eric W. Biederman wrote:
>> >> From: Peter Zijlstra <[email protected]>
>> >>
>> >> Currently ptrace_stop() / do_signal_stop() rely on the special states
>> >> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
>> >> state exists only in task->__state and nowhere else.
>> >>
>> >> There's two spots of bother with this:
>> >>
>> >> - PREEMPT_RT has task->saved_state which complicates matters,
>> >> meaning task_is_{traced,stopped}() needs to check an additional
>> >> variable.
>> >>
>> >> - An alternative freezer implementation that itself relies on a
>> >> special TASK state would loose TASK_TRACED/TASK_STOPPED and will
>> >> result in misbehaviour.
>> >>
>> >> As such, add additional state to task->jobctl to track this state
>> >> outside of task->__state.
>> >>
>> >> NOTE: this doesn't actually fix anything yet, just adds extra state.
>> >>
>> >> --EWB
>> >> * didn't add a unnecessary newline in signal.h
>> >> * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
>> >> instead of in signal_wake_up_state. This prevents the clearing
>> >> of TASK_STOPPED and TASK_TRACED from getting lost.
>> >> * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared
>> >
>> > Hi Eric, Peter,
>> >
>> > On s390 this patch triggers warning at kernel/ptrace.c:272 when
>> > kill_child testcase from strace tool is repeatedly used (the source
>> > is attached for reference):
>> >
>> > while :; do
>> > strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child
>> > done
>> >
>> > It normally takes few minutes to cause the warning in -rc3, but FWIW
>> > it hits almost immediately for ptrace_stop-cleanup-for-v5.19 tag of
>> > git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.
>> >
>> > Commit 7b0fe1367ef2 ("ptrace: Document that wait_task_inactive can't
>> > fail") suggests this WARN_ON_ONCE() is not really expected, yet we
>> > observe a child in __TASK_TRACED state. Could you please comment here?
>> >
>>
>> For clarity the warning is that the child is not in __TASK_TRACED state.
>>
>> The code is waiting for the code to stop in the scheduler in the
>> __TASK_TRACED state so that it can safely read and change the
>> processes state. Some of that state is not even saved until the
>> process is scheduled out so we have to wait until the process
>> is stopped in the scheduler.
>
> So I assume (checked actually) the return 0 below from kernel/sched/core.c:
> wait_task_inactive() is where it bails out:
>
> 3303 while (task_running(rq, p)) {
> 3304 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> 3305 return 0;
> 3306 cpu_relax();
> 3307 }
>
> Yet, the child task is always found in __TASK_TRACED state (as seen
> in crash dumps):
>
>> 101447 11342 13 ce3a8100 RU 0.0 10040 4412 strace
> 101450 101447 0 bb04b200 TR 0.0 2272 1136 kill_child
> 108261 101447 2 d0b10100 TR 0.0 2272 532 kill_child
> crash> task bb04b200 __state
> PID: 101450 TASK: bb04b200 CPU: 0 COMMAND: "kill_child"
> __state = 8,
>
> crash> task d0b10100 __state
> PID: 108261 TASK: d0b10100 CPU: 2 COMMAND: "kill_child"
> __state = 8,

That is weird.

>> At least on s390 it looks like there is a race between SIGKILL and
>> ptrace_check_attach. That isn't good.
>>
>> Reading the code below there is something missing because I don't see
>> anything making ptrace calls, and ptrace_check_attach (which contains
>> the warning) only happens in the ptrace syscall.
>
> That is what I believe strace does when calling that code:
>
> strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child

Thank you. That was my braino.

I will have to see if it reproduces for me on x86 (I don't have an
s390). Perhaps if I can reproduce it I can guess what is going wrong.

So far it appears WARN_ON_ONCE has nothing to warn about yet it is
warning.

Eric

2022-06-25 16:40:48

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Alexander Gordeev <[email protected]> writes:

> On Tue, Jun 21, 2022 at 09:02:05AM -0500, Eric W. Biederman wrote:
>> Alexander Gordeev <[email protected]> writes:
>>
>> > On Thu, May 05, 2022 at 01:26:45PM -0500, Eric W. Biederman wrote:
>> >> From: Peter Zijlstra <[email protected]>
>> >>
>> >> Currently ptrace_stop() / do_signal_stop() rely on the special states
>> >> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
>> >> state exists only in task->__state and nowhere else.
>> >>
>> >> There's two spots of bother with this:
>> >>
>> >> - PREEMPT_RT has task->saved_state which complicates matters,
>> >> meaning task_is_{traced,stopped}() needs to check an additional
>> >> variable.
>> >>
>> >> - An alternative freezer implementation that itself relies on a
>> >> special TASK state would loose TASK_TRACED/TASK_STOPPED and will
>> >> result in misbehaviour.
>> >>
>> >> As such, add additional state to task->jobctl to track this state
>> >> outside of task->__state.
>> >>
>> >> NOTE: this doesn't actually fix anything yet, just adds extra state.
>> >>
>> >> --EWB
>> >> * didn't add a unnecessary newline in signal.h
>> >> * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
>> >> instead of in signal_wake_up_state. This prevents the clearing
>> >> of TASK_STOPPED and TASK_TRACED from getting lost.
>> >> * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared
>> >
>> > Hi Eric, Peter,
>> >
>> > On s390 this patch triggers warning at kernel/ptrace.c:272 when
>> > kill_child testcase from strace tool is repeatedly used (the source
>> > is attached for reference):
>> >
>> > while :; do
>> > strace -f -qq -e signal=none -e trace=sched_yield,/kill ./kill_child
>> > done
>> >
>> > It normally takes few minutes to cause the warning in -rc3, but FWIW
>> > it hits almost immediately for ptrace_stop-cleanup-for-v5.19 tag of
>> > git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.
>> >
>> > Commit 7b0fe1367ef2 ("ptrace: Document that wait_task_inactive can't
>> > fail") suggests this WARN_ON_ONCE() is not really expected, yet we
>> > observe a child in __TASK_TRACED state. Could you please comment here?
>> >
>>
>> For clarity the warning is that the child is not in __TASK_TRACED state.
>>
>> The code is waiting for the code to stop in the scheduler in the
>> __TASK_TRACED state so that it can safely read and change the
>> processes state. Some of that state is not even saved until the
>> process is scheduled out so we have to wait until the process
>> is stopped in the scheduler.
>
> So I assume (checked actually) the return 0 below from kernel/sched/core.c:
> wait_task_inactive() is where it bails out:
>
> 3303 while (task_running(rq, p)) {
> 3304 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> 3305 return 0;
> 3306 cpu_relax();
> 3307 }
>
> Yet, the child task is always found in __TASK_TRACED state (as seen
> in crash dumps):
>
>> 101447 11342 13 ce3a8100 RU 0.0 10040 4412 strace
> 101450 101447 0 bb04b200 TR 0.0 2272 1136 kill_child
> 108261 101447 2 d0b10100 TR 0.0 2272 532 kill_child
> crash> task bb04b200 __state
> PID: 101450 TASK: bb04b200 CPU: 0 COMMAND: "kill_child"
> __state = 8,
>
> crash> task d0b10100 __state
> PID: 108261 TASK: d0b10100 CPU: 2 COMMAND: "kill_child"
> __state = 8,
>

I haven't gotten as far as reproducing this but I have started giving
this issue some thought.

This entire thing smells like a memory barrier is missing somewhere.
However by definition the lock implementations in linux provide all the
needed memory barriers, and in the ptrace_stop and ptrace_check_attach
path I don't see cases where these values are sampled outside of a lock
except in wait_task_inactive. Does doing that perhaps require a
barrier?

The two things I can think of that could shed light on what is going on
is enabling lockdep, to enable the debug check in signal_wake_up_state
and verifying bits of state that should be constant while the task
is frozen for ptrace are indeed constant when task is frozen for ptrace.
Something like my patch below.

If you could test that when you have a chance that would help narrow
down what is going on.

Thank you,
Eric

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 156a99283b11..6467a2b1c3bc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -268,9 +268,13 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);

- if (!ret && !ignore_state &&
- WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
+ if (!ret && !ignore_state) {
+ WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
+ WARN_ON_ONCE(!(child->joctctl & JOBCTL_TRACED));
+ WARN_ON_ONCE(READ_ONCE(child->__state) != __TASK_TRACED);
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED));
ret = -ESRCH;
+ }

return ret;
}

2022-06-28 19:01:47

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Sat, Jun 25, 2022 at 11:34:46AM -0500, Eric W. Biederman wrote:
> I haven't gotten as far as reproducing this but I have started giving
> this issue some thought.
>
> This entire thing smells like a memory barrier is missing somewhere.
> However by definition the lock implementations in linux provide all the
> needed memory barriers, and in the ptrace_stop and ptrace_check_attach
> path I don't see cases where these values are sampled outside of a lock
> except in wait_task_inactive. Does doing that perhaps require a
> barrier?
>
> The two things I can think of that could shed light on what is going on
> is enabling lockdep, to enable the debug check in signal_wake_up_state
> and verifying bits of state that should be constant while the task
> is frozen for ptrace are indeed constant when task is frozen for ptrace.
> Something like my patch below.
>
> If you could test that when you have a chance that would help narrow
> down what is going on.
>
> Thank you,
> Eric
>
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 156a99283b11..6467a2b1c3bc 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -268,9 +268,13 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
> }
> read_unlock(&tasklist_lock);
>
> - if (!ret && !ignore_state &&
> - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
> + if (!ret && !ignore_state) {
> + WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
> + WARN_ON_ONCE(!(child->joctctl & JOBCTL_TRACED));
> + WARN_ON_ONCE(READ_ONCE(child->__state) != __TASK_TRACED);
> + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED));
> ret = -ESRCH;
> + }
>
> return ret;
> }

I modified your chunk a bit - hope that is what you had in mind:

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 156a99283b11..f0e9a9a4d63c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -268,9 +268,19 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);

- if (!ret && !ignore_state &&
- WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
- ret = -ESRCH;
+ if (!ret && !ignore_state) {
+ unsigned int __state;
+
+ WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
+ WARN_ON_ONCE(!(child->jobctl & JOBCTL_TRACED));
+ __state = READ_ONCE(child->__state);
+ if (__state != __TASK_TRACED) {
+ pr_err("%s(%d) __state %x", __FUNCTION__, __LINE__, __state);
+ WARN_ON_ONCE(1);
+ }
+ if (WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
+ ret = -ESRCH;
+ }

return ret;
}


When WARN_ON_ONCE(1) hits the child __state is always zero/TASK_RUNNING,
as reported by the preceding pr_err(). Yet, in the resulting core dump
it is always __TASK_TRACED.

Removing WARN_ON_ONCE(1) while looping until (__state != __TASK_TRACED)
confirms the unexpected __state is always TASK_RUNNING. It never observed
more than one iteration and gets printed once in 30-60 mins.

So probably when the condition is entered __state is TASK_RUNNING more
often, but gets overwritten with __TASK_TRACED pretty quickly. Which kind
of consistent with my previous observation that kernel/sched/core.c:3305
is where return 0 makes wait_task_inactive() fail.

No other WARN_ON_ONCE() hit ever.

2022-06-28 23:12:11

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Alexander Gordeev <[email protected]> writes:

> On Sat, Jun 25, 2022 at 11:34:46AM -0500, Eric W. Biederman wrote:
>> I haven't gotten as far as reproducing this but I have started giving
>> this issue some thought.
>>
>> This entire thing smells like a memory barrier is missing somewhere.
>> However by definition the lock implementations in linux provide all the
>> needed memory barriers, and in the ptrace_stop and ptrace_check_attach
>> path I don't see cases where these values are sampled outside of a lock
>> except in wait_task_inactive. Does doing that perhaps require a
>> barrier?
>>
>> The two things I can think of that could shed light on what is going on
>> is enabling lockdep, to enable the debug check in signal_wake_up_state
>> and verifying bits of state that should be constant while the task
>> is frozen for ptrace are indeed constant when task is frozen for ptrace.
>> Something like my patch below.
>>
>> If you could test that when you have a chance that would help narrow
>> down what is going on.
>>
>> Thank you,
>> Eric
>>
>> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
>> index 156a99283b11..6467a2b1c3bc 100644
>> --- a/kernel/ptrace.c
>> +++ b/kernel/ptrace.c
>> @@ -268,9 +268,13 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
>> }
>> read_unlock(&tasklist_lock);
>>
>> - if (!ret && !ignore_state &&
>> - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
>> + if (!ret && !ignore_state) {
>> + WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
>> + WARN_ON_ONCE(!(child->joctctl & JOBCTL_TRACED));
>> + WARN_ON_ONCE(READ_ONCE(child->__state) != __TASK_TRACED);
>> + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED));
>> ret = -ESRCH;
>> + }
>>
>> return ret;
>> }
>
> I modified your chunk a bit - hope that is what you had in mind:

Yes.

> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 156a99283b11..f0e9a9a4d63c 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -268,9 +268,19 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
> }
> read_unlock(&tasklist_lock);
>
> - if (!ret && !ignore_state &&
> - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
> - ret = -ESRCH;
> + if (!ret && !ignore_state) {
> + unsigned int __state;
> +
> + WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
> + WARN_ON_ONCE(!(child->jobctl & JOBCTL_TRACED));
> + __state = READ_ONCE(child->__state);
> + if (__state != __TASK_TRACED) {
> + pr_err("%s(%d) __state %x", __FUNCTION__, __LINE__, __state);
> + WARN_ON_ONCE(1);
> + }
> + if (WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
> + ret = -ESRCH;
> + }
>
> return ret;
> }
>
>
> When WARN_ON_ONCE(1) hits the child __state is always zero/TASK_RUNNING,
> as reported by the preceding pr_err(). Yet, in the resulting core dump
> it is always __TASK_TRACED.

Did you enable CONFIG_LOCKDEP? I am just wanting to ensure
that every caller of signal_wake_up_state was holding siglock.

> Removing WARN_ON_ONCE(1) while looping until (__state != __TASK_TRACED)
> confirms the unexpected __state is always TASK_RUNNING. It never observed
> more than one iteration and gets printed once in 30-60 mins.

Hmm. This does smell lock a missing barrier.

> So probably when the condition is entered __state is TASK_RUNNING more
> often, but gets overwritten with __TASK_TRACED pretty quickly. Which kind
> of consistent with my previous observation that kernel/sched/core.c:3305
> is where return 0 makes wait_task_inactive() fail.
>
> No other WARN_ON_ONCE() hit ever.

Yes. This smells like something is missing.

I am completely rusty at rolling barriers by hand but does something
like the below clear up those mysterious warnings?

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 156a99283b11..cb85bcf84640 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
+ smp_rmb();
task->jobctl |= JOBCTL_PTRACE_FROZEN;
ret = true;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index edb1dc9b00dc..bcd576e9de66 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
return exit_code;

set_special_state(TASK_TRACED);
+ smp_wmb();
current->jobctl |= JOBCTL_TRACED;

/*

Eric

2022-06-28 23:14:17

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, 28 Jun 2022 17:42:22 -0500
"Eric W. Biederman" <[email protected]> wrote:

> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 156a99283b11..cb85bcf84640 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
> spin_lock_irq(&task->sighand->siglock);
> if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
> !__fatal_signal_pending(task)) {
> + smp_rmb();
> task->jobctl |= JOBCTL_PTRACE_FROZEN;
> ret = true;
> }
> diff --git a/kernel/signal.c b/kernel/signal.c
> index edb1dc9b00dc..bcd576e9de66 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
> return exit_code;
>
> set_special_state(TASK_TRACED);
> + smp_wmb();
> current->jobctl |= JOBCTL_TRACED;
>

Are not these both done under the sighand->siglock spinlock?

That is, the two paths should already be synchronized, and the memory
barriers will not help anything inside the locks. The locking should (and
must) handle all that.

-- Steve

2022-06-28 23:38:09

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, 21 Jun 2022 17:15:47 +0200
Alexander Gordeev <[email protected]> wrote:

> So I assume (checked actually) the return 0 below from kernel/sched/core.c:
> wait_task_inactive() is where it bails out:
>
> 3303 while (task_running(rq, p)) {
> 3304 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
> 3305 return 0;
> 3306 cpu_relax();
> 3307 }
>
> Yet, the child task is always found in __TASK_TRACED state (as seen
> in crash dumps):
>
> > 101447 11342 13 ce3a8100 RU 0.0 10040 4412 strace
> 101450 101447 0 bb04b200 TR 0.0 2272 1136 kill_child
> 108261 101447 2 d0b10100 TR 0.0 2272 532 kill_child
> crash> task bb04b200 __state
> PID: 101450 TASK: bb04b200 CPU: 0 COMMAND: "kill_child"
> __state = 8,
>
> crash> task d0b10100 __state
> PID: 108261 TASK: d0b10100 CPU: 2 COMMAND: "kill_child"
> __state = 8,

If you are using crash, can you enable all trace events?

Then you should be able to extract the ftrace ring buffer from crash using
the trace.so extend (https://github.com/fujitsu/crash-trace)

I guess it should still work with s390.

Then you can see the events that lead up to the crash.

-- Steve

2022-06-29 03:46:18

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Steven Rostedt <[email protected]> writes:

> On Tue, 28 Jun 2022 17:42:22 -0500
> "Eric W. Biederman" <[email protected]> wrote:
>
>> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
>> index 156a99283b11..cb85bcf84640 100644
>> --- a/kernel/ptrace.c
>> +++ b/kernel/ptrace.c
>> @@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
>> spin_lock_irq(&task->sighand->siglock);
>> if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
>> !__fatal_signal_pending(task)) {
>> + smp_rmb();
>> task->jobctl |= JOBCTL_PTRACE_FROZEN;
>> ret = true;
>> }
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index edb1dc9b00dc..bcd576e9de66 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
>> return exit_code;
>>
>> set_special_state(TASK_TRACED);
>> + smp_wmb();
>> current->jobctl |= JOBCTL_TRACED;
>>
>
> Are not these both done under the sighand->siglock spinlock?
>
> That is, the two paths should already be synchronized, and the memory
> barriers will not help anything inside the locks. The locking should (and
> must) handle all that.

I would presume so to. However the READ_ONCE that is going astray
does not look like it is honoring that.

So perhaps there is a bug in the s390 spin_lock barriers? Perhaps there
is a subtle detail in the barriers that spin locks provide that we are
overlooking?

I just know the observed behavior is:

- reading tsk->jobctl and seeing JOBCTL_TRACED set.
- reading tsk->__state and seeing TASK_RUNNING.

So unless PREEMPT_RT is enabled on s390. It looks like there is a
barrier problem.

Alexander do you have PREEMPT_RT enabled on s390? I have been assuming
you don't but I figure I should ask and make certain as PREEMPT_RT can
cause this kind of failure.

Eric

2022-06-29 20:36:59

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jun 28, 2022 at 10:39:59PM -0500, Eric W. Biederman wrote:
> Steven Rostedt <[email protected]> writes:
>
> > On Tue, 28 Jun 2022 17:42:22 -0500
> > "Eric W. Biederman" <[email protected]> wrote:
> >
> >> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> >> index 156a99283b11..cb85bcf84640 100644
> >> --- a/kernel/ptrace.c
> >> +++ b/kernel/ptrace.c
> >> @@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
> >> spin_lock_irq(&task->sighand->siglock);
> >> if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
> >> !__fatal_signal_pending(task)) {
> >> + smp_rmb();
> >> task->jobctl |= JOBCTL_PTRACE_FROZEN;
> >> ret = true;
> >> }
> >> diff --git a/kernel/signal.c b/kernel/signal.c
> >> index edb1dc9b00dc..bcd576e9de66 100644
> >> --- a/kernel/signal.c
> >> +++ b/kernel/signal.c
> >> @@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
> >> return exit_code;
> >>
> >> set_special_state(TASK_TRACED);
> >> + smp_wmb();
> >> current->jobctl |= JOBCTL_TRACED;
> >>
> >
> > Are not these both done under the sighand->siglock spinlock?
> >
> > That is, the two paths should already be synchronized, and the memory
> > barriers will not help anything inside the locks. The locking should (and
> > must) handle all that.
>
> I would presume so to. However the READ_ONCE that is going astray
> does not look like it is honoring that.
>
> So perhaps there is a bug in the s390 spin_lock barriers? Perhaps there
> is a subtle detail in the barriers that spin locks provide that we are
> overlooking?
>
> I just know the observed behavior is:
>
> - reading tsk->jobctl and seeing JOBCTL_TRACED set.
> - reading tsk->__state and seeing TASK_RUNNING.
>
> So unless PREEMPT_RT is enabled on s390. It looks like there is a
> barrier problem.
>
> Alexander do you have PREEMPT_RT enabled on s390? I have been assuming
> you don't but I figure I should ask and make certain as PREEMPT_RT can
> cause this kind of failure.

There is no change with the barriers added.

CONFIG_PREEMPT_RT is disabled and CONFIG_LOCKDEP is enabled (in attach).
FWIW, I also added a full barrier:

@@ -271,6 +272,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
if (!ret && !ignore_state) {
unsigned int __state;

+ smp_mb();
WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
WARN_ON_ONCE(!(child->jobctl & JOBCTL_TRACED));
__state = READ_ONCE(child->__state);

I have not been able to extract the ftrace ring buffer yet - going to do that.

> Eric

Thanks!


Attachments:
(No filename) (2.70 kB)
config-5.19.0-rc4-08751-g2cf560748ed6 (89.00 kB)
Download all attachments

2022-07-05 14:11:08

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Hi,

Steven Rostedt <[email protected]> writes:

> On Tue, 21 Jun 2022 17:15:47 +0200
> Alexander Gordeev <[email protected]> wrote:
>
>> So I assume (checked actually) the return 0 below from kernel/sched/core.c:
>> wait_task_inactive() is where it bails out:
>>
>> 3303 while (task_running(rq, p)) {
>> 3304 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
>> 3305 return 0;
>> 3306 cpu_relax();
>> 3307 }
>>
>> Yet, the child task is always found in __TASK_TRACED state (as seen
>> in crash dumps):
>>
>> > 101447 11342 13 ce3a8100 RU 0.0 10040 4412 strace
>> 101450 101447 0 bb04b200 TR 0.0 2272 1136 kill_child
>> 108261 101447 2 d0b10100 TR 0.0 2272 532 kill_child
>> crash> task bb04b200 __state
>> PID: 101450 TASK: bb04b200 CPU: 0 COMMAND: "kill_child"
>> __state = 8,
>>
>> crash> task d0b10100 __state
>> PID: 108261 TASK: d0b10100 CPU: 2 COMMAND: "kill_child"
>> __state = 8,
>
> If you are using crash, can you enable all trace events?
>
> Then you should be able to extract the ftrace ring buffer from crash using
> the trace.so extend (https://github.com/fujitsu/crash-trace)
>
> I guess it should still work with s390.
>
> Then you can see the events that lead up to the crash.

Alexander is busy with other stuff, so I took over. I enabled the
sys,signal,sched and task tracepoints and ftrace_dump_on_oops. The last
lines from the trace buffer are:

[ 281.043459] strace-1177215 0d.... 269457070us : sched_waking: comm=kill_child pid=1178157 prio=120 target_cpu=003
[ 281.043463] kill_chi-1177218 1d.... 269457070us : signal_generate: sig=17 errno=0 code=4 comm=strace pid=1177215 grp=1 res=1
[ 281.043467] kill_chi-1177218 1d.... 269457070us : sched_stat_runtime: comm=kill_child pid=1177218 runtime=5299 [ns] vruntime=1830714210855 [ns]
[ 281.043471] kill_chi-1177218 1d.... 269457071us : sched_switch: prev_comm=kill_child prev_pid=1177218 prev_prio=120 prev_state=t ==> next_comm=swapper/1 next_pid=0 next_prio=120
[ 281.043475] strace-1177215 0..... 269457071us : sys_ptrace -> 0x50
[ 281.043478] strace-1177215 0..... 269457071us : sys_write(fd: 2, buf: 2aa15db3ad0, count: 12)
[ 281.043482] strace-1177215 0..... 269457072us : sys_write -> 0x12
[ 281.043485] <idle>-0 3dNh.. 269457072us : sched_wakeup: comm=kill_child pid=1178157 prio=120 target_cpu=003
[ 281.043489] <idle>-0 3d.... 269457073us : sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kill_child next_pid=1178157 next_prio=120
[ 281.043493] strace-1177215 0..... 269457073us : sys_write(fd: 2, buf: 2aa15db3ad0, count: 1a)
[ 281.043496] strace-1177215 0..... 269457073us : sys_write -> 0x1a
[ 281.043500] kill_chi-1178157 3..... 269457073us : sys_sched_yield -> 0xffffffffffffffda
[ 281.043504] strace-1177215 0..... 269457073us : sys_ptrace(request: 18, pid: 11fa2d, addr: 0, data: 0)
[ 281.043508] kill_chi-1178157 3d.... 269457073us : signal_deliver: sig=9 errno=0 code=0 sa_handler=0 sa_flags=0
[ 281.043511] kill_chi-1178157 3d.... 269457074us : signal_generate: sig=17 errno=0 code=4 comm=strace pid=1177215 grp=1 res=1
[ 281.043515] kill_chi-1178157 3d.... 269457074us : sched_stat_runtime: comm=kill_child pid=1178157 runtime=2408 [ns] vruntime=1983050055579 [ns]
[ 281.043519] kill_chi-1178157 3d.... 269457075us : sched_switch: prev_comm=kill_child prev_pid=1178157 prev_prio=120 prev_state=t ==> next_comm=swapper/3 next_pid=0 next_prio=120

I attached the full output to this mail. I haven't yet tried to
understand the problem, i just wanted to send you the requested
information in the hope that it will help you.

Regards
Sven


Attachments:
dmesg.xz (28.32 kB)

2022-07-05 15:49:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jun 28, 2022 at 10:39:59PM -0500, Eric W. Biederman wrote:

> > That is, the two paths should already be synchronized, and the memory
> > barriers will not help anything inside the locks. The locking should (and
> > must) handle all that.
>
> I would presume so to. However the READ_ONCE that is going astray
> does not look like it is honoring that.
>
> So perhaps there is a bug in the s390 spin_lock barriers? Perhaps there
> is a subtle detail in the barriers that spin locks provide that we are
> overlooking?

So the thing is, s390 is, like x86, a TSO architecture with SC atomics.
Or at least it used to be; I'm not entirely solid on the Z196 features.

I've been looking at this and I can't find anything obviously wrong.
arch_spin_trylock_once() has what seems a spurious barrier() but that's
not going to cause this.

Specifically, s390 is using a simple test-and-set spinlock based on
their Compare-and-Swap (CS) instruction (so no Z196 funnies around).

Except perhaps arch_spin_unlock(), I can't grok the magic there. It does
something weird before the presumably regular TSO store of 0 into the
lock word.

Ooohh.. /me finds arch_spin_lock_queued().. *urfh* because obviously a
copy of queued spinlocks is what we need.

rwlock_t OTOH is using __atomic_add_*() and that's all Z196 magic.

Sven, does all this still reproduce if you take out
CONFIG_HAVE_MARCH_Z196_FEATURES ? Also, could you please explain the
Z196 bits or point me to the relevant section in the PoO. Additionally,
what's that _niai[48] stuff?

And I'm assuming s390 has hardware fairness on competing CS ?

2022-07-05 17:33:55

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state


I'll continue debugging tomorrow, but maybe this helps already.


Attachments:
ptrace-debug.patch (2.76 kB)

2022-07-05 19:33:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jul 05, 2022 at 07:28:49PM +0200, Sven Schnelle wrote:
> Sven Schnelle <[email protected]> writes:

> I think there's a race in ptrace_check_attach(). It first calls
> ptrace_freeze_task(), which checks whether JOBCTL_TRACED is set.
> If it is (and a few other conditions match) it will set ret = 0.
>
> Later outside of siglock and tasklist_lock it will call
> wait_task_inactive, assuming the target is in TASK_TRACED, but it isn't.
>
> ptrace_stop(), which runs on another CPU, does:
>
> set_special_state(TASK_TRACED);
> current->jobctl |= JOBCTL_TRACED;
>
> which looks ok on first sight, but in this case JOBCTL is already set,
> so the reading CPU will immediately move on to wait_task_inactive(),
> before JOBCTL_TRACED is set. I don't know whether this is a valid
> combination. I never looked into JOBCTL_* semantics, but i guess now
> is a good time to do so. I added some debugging statements, and that
> gives:
>
> [ 86.218488] kill_chi-300545 2d.... 79990135us : ptrace_stop: state 8
> [ 86.218492] kill_chi-300545 2d.... 79990136us : signal_generate: sig=17 errno=0 code=4 comm=strace pid=300542 grp=1 res=1
> [ 86.218496] kill_chi-300545 2d.... 79990136us : sched_stat_runtime: comm=kill_child pid=300545 runtime=3058 [ns] vruntime=606165713178 [ns]
> [ 86.218500] kill_chi-300545 2d.... 79990136us : sched_switch: prev_comm=kill_child prev_pid=300545 prev_prio=120 prev_state=t ==> next_comm=swapper/2 next_pid=0 next_prio=120
> [ 86.218504] strace-300542 7..... 79990139us : sys_ptrace -> 0x50
> [ 86.218508] strace-300542 7..... 79990139us : sys_write(fd: 2, buf: 2aa198f7ad0, count: 12)
> [ 86.218512] strace-300542 7..... 79990140us : sys_write -> 0x12
> [ 86.218515] <idle>-0 6dNh.. 79990140us : sched_wakeup: comm=kill_child pid=343805 prio=120 target_cpu=006
> [ 86.218519] <idle>-0 6d.... 79990140us : sched_switch: prev_comm=swapper/6 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kill_child next_pid=343805 next_prio=120
> [ 86.218524] strace-300542 7..... 79990140us : sys_write(fd: 2, buf: 2aa198f7ad0, count: 19)
> [ 86.218527] strace-300542 7..... 79990141us : sys_write -> 0x19
> [ 86.218531] kill_chi-343805 6..... 79990141us : sys_sched_yield -> 0xffffffffffffffda
> [ 86.218535] strace-300542 7..... 79990141us : sys_ptrace(request: 18, pid: 53efd, addr: 0, data: 0)
> [ 86.218539] kill_chi-343805 6d.... 79990141us : signal_deliver: sig=9 errno=0 code=0 sa_handler=0 sa_flags=0
> [ 86.218543] strace-300542 7d.... 79990141us : ptrace_check_attach: task_is_traced: 1, fatal signal pending: 0
> [ 86.218547] strace-300542 7..... 79990141us : ptrace_check_attach: child->pid = 343805, child->__flags=0
> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?

Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
__TASK_TRACED for now. Set when __TASK_TRACED, cleared when
TASK_RUNNING.

Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
when they would wake a __TASK_TRACED task.

> [ 86.218554] kill_chi-343805 6d.... 79990141us : ptrace_stop: state 8
> [ 86.218558] kill_chi-343805 6d.... 79990142us : signal_generate: sig=17 errno=0 code=4 comm=strace pid=300542 grp=1 res=1
> [ 86.218562] kill_chi-343805 6d.... 79990142us : sched_stat_runtime: comm=kill_child pid=343805 runtime=2135 [ns] vruntime=556109013931 [ns]
> [ 86.218566] strace-300542 7..... 79990142us : wait_task_inactive: NO MATCH: state 0, match_state 8, pid 343805
> [ 86.218570] kill_chi-343805 6d.... 79990142us : sched_switch: prev_comm=kill_child prev_pid=343805 prev_prio=120 prev_state=t ==>next_comm=swapper/6 next_pid=0 next_prio=120
>

2022-07-06 07:27:57

by Alexander Gordeev

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jul 05, 2022 at 05:44:06PM +0200, Peter Zijlstra wrote:

Hi Peter,

> Sven, does all this still reproduce if you take out
> CONFIG_HAVE_MARCH_Z196_FEATURES ?

Yes, it hits.

2022-07-06 08:01:20

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Hi Peter,

Peter Zijlstra <[email protected]> writes:

> On Tue, Jul 05, 2022 at 07:28:49PM +0200, Sven Schnelle wrote:
>> Sven Schnelle <[email protected]> writes:
>
>> I think there's a race in ptrace_check_attach(). It first calls
>> ptrace_freeze_task(), which checks whether JOBCTL_TRACED is set.
>> If it is (and a few other conditions match) it will set ret = 0.
>>
>> Later outside of siglock and tasklist_lock it will call
>> wait_task_inactive, assuming the target is in TASK_TRACED, but it isn't.
>>
>> ptrace_stop(), which runs on another CPU, does:
>>
>> set_special_state(TASK_TRACED);
>> current->jobctl |= JOBCTL_TRACED;
>>
>> which looks ok on first sight, but in this case JOBCTL is already set,
>> so the reading CPU will immediately move on to wait_task_inactive(),
>> before JOBCTL_TRACED is set. I don't know whether this is a valid
>> combination. I never looked into JOBCTL_* semantics, but i guess now
>> is a good time to do so. I added some debugging statements, and that
>> gives:
>>
>> [ 86.218488] kill_chi-300545 2d.... 79990135us : ptrace_stop: state 8
>> [ 86.218492] kill_chi-300545 2d.... 79990136us : signal_generate: sig=17 errno=0 code=4 comm=strace pid=300542 grp=1 res=1
>> [ 86.218496] kill_chi-300545 2d.... 79990136us : sched_stat_runtime: comm=kill_child pid=300545 runtime=3058 [ns] vruntime=606165713178 [ns]
>> [ 86.218500] kill_chi-300545 2d.... 79990136us : sched_switch:
>> prev_comm=kill_child prev_pid=300545 prev_prio=120 prev_state=t ==>
>> next_comm=swapper/2 next_pid=0 next_prio=120
>> [ 86.218504] strace-300542 7..... 79990139us : sys_ptrace -> 0x50
>> [ 86.218508] strace-300542 7..... 79990139us : sys_write(fd: 2, buf: 2aa198f7ad0, count: 12)
>> [ 86.218512] strace-300542 7..... 79990140us : sys_write -> 0x12
>> [ 86.218515] <idle>-0 6dNh.. 79990140us : sched_wakeup: comm=kill_child pid=343805 prio=120 target_cpu=006
>> [ 86.218519] <idle>-0 6d.... 79990140us : sched_switch:
>> prev_comm=swapper/6 prev_pid=0 prev_prio=120 prev_state=R ==>
>> next_comm=kill_child next_pid=343805 next_prio=120
>> [ 86.218524] strace-300542 7..... 79990140us : sys_write(fd: 2, buf: 2aa198f7ad0, count: 19)
>> [ 86.218527] strace-300542 7..... 79990141us : sys_write -> 0x19
>> [ 86.218531] kill_chi-343805 6..... 79990141us : sys_sched_yield -> 0xffffffffffffffda
>> [ 86.218535] strace-300542 7..... 79990141us : sys_ptrace(request: 18, pid: 53efd, addr: 0, data: 0)
>> [ 86.218539] kill_chi-343805 6d.... 79990141us : signal_deliver: sig=9 errno=0 code=0 sa_handler=0 sa_flags=0
>> [ 86.218543] strace-300542 7d.... 79990141us : ptrace_check_attach: task_is_traced: 1, fatal signal pending: 0
>> [ 86.218547] strace-300542 7..... 79990141us : ptrace_check_attach: child->pid = 343805, child->__flags=0
>> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?
>
> Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
> __TASK_TRACED for now. Set when __TASK_TRACED, cleared when
> TASK_RUNNING.
>
> Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
> when they would wake a __TASK_TRACED task.

try_to_wake_up() clears TASK_TRACED in this case because a signal
(SIGKILL) has to be delivered. As a test I put the following change
on top, and it "fixes" the problem:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..f2e0f5e70e77 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4141,6 +4149,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* TASK_WAKING such that we can unlock p->pi_lock before doing the
* enqueue, such as ttwu_queue_wakelist().
*/
+ if (p->__state & TASK_TRACED)
+ trace_printk("clearing TASK_TRACED 2\n");
+ p->jobctl &= ~JOBCTL_TRACED;
WRITE_ONCE(p->__state, TASK_WAKING);

/*

There are several places where the state is changed from TASK_TRACED to
something else without clearing JOBCTL_TRACED.

2022-07-06 09:40:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Wed, Jul 06, 2022 at 09:58:55AM +0200, Sven Schnelle wrote:

> >> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?
> >
> > Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
> > __TASK_TRACED for now. Set when __TASK_TRACED, cleared when
> > TASK_RUNNING.
> >
> > Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
> > when they would wake a __TASK_TRACED task.
>
> try_to_wake_up() clears TASK_TRACED in this case because a signal
> (SIGKILL) has to be delivered. As a test I put the following change
> on top, and it "fixes" the problem:
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index da0bf6fe9ecd..f2e0f5e70e77 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4141,6 +4149,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> * TASK_WAKING such that we can unlock p->pi_lock before doing the
> * enqueue, such as ttwu_queue_wakelist().
> */
> + if (p->__state & TASK_TRACED)
> + trace_printk("clearing TASK_TRACED 2\n");
> + p->jobctl &= ~JOBCTL_TRACED;
> WRITE_ONCE(p->__state, TASK_WAKING);
>
> /*
>
> There are several places where the state is changed from TASK_TRACED to
> something else without clearing JOBCTL_TRACED.

I'm having difficulty spotting them; I find:

TASK_WAKEKILL: signal_wake_up()
__TASK_TRACED: ptrace_signal_wake_up(), ptrace_unfreeze_traced(), ptrace_resume()

And all those sites dutifully clear JOBCTL_TRACED.

I'd be most interested in the calstack for the 'clearing TASK_TRACED 2'
events to see where we miss a spot.

2022-07-06 09:41:16

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Peter Zijlstra <[email protected]> writes:

> On Wed, Jul 06, 2022 at 09:58:55AM +0200, Sven Schnelle wrote:
>
>> >> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?
>> >
>> > Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
>> > __TASK_TRACED for now. Set when __TASK_TRACED, cleared when
>> > TASK_RUNNING.
>> >
>> > Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
>> > when they would wake a __TASK_TRACED task.
>>
>> try_to_wake_up() clears TASK_TRACED in this case because a signal
>> (SIGKILL) has to be delivered. As a test I put the following change
>> on top, and it "fixes" the problem:
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index da0bf6fe9ecd..f2e0f5e70e77 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -4141,6 +4149,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>> * TASK_WAKING such that we can unlock p->pi_lock before doing the
>> * enqueue, such as ttwu_queue_wakelist().
>> */
>> + if (p->__state & TASK_TRACED)
>> + trace_printk("clearing TASK_TRACED 2\n");
>> + p->jobctl &= ~JOBCTL_TRACED;
>> WRITE_ONCE(p->__state, TASK_WAKING);
>>
>> /*
>>
>> There are several places where the state is changed from TASK_TRACED to
>> something else without clearing JOBCTL_TRACED.
>
> I'm having difficulty spotting them; I find:
>
> TASK_WAKEKILL: signal_wake_up()
> __TASK_TRACED: ptrace_signal_wake_up(), ptrace_unfreeze_traced(), ptrace_resume()
>
> And all those sites dutifully clear JOBCTL_TRACED.
>
> I'd be most interested in the calstack for the 'clearing TASK_TRACED 2'
> events to see where we miss a spot.

The calltrace is:
[ 9.863613] Call Trace:
[ 9.863616] [<00000000d3105f0e>] try_to_wake_up+0xae/0x620
[ 9.863620] ([<00000000d3106164>] try_to_wake_up+0x304/0x620)
[ 9.863623] [<00000000d30d1e46>] ptrace_unfreeze_traced+0x9e/0xa8
[ 9.863629] [<00000000d30d2ef0>] __s390x_sys_ptrace+0xc0/0x160
[ 9.863633] [<00000000d3c5d8f4>] __do_syscall+0x1d4/0x200
[ 9.863678] [<00000000d3c6c332>] system_call+0x82/0xb0
[ 9.863685] Last Breaking-Event-Address:
[ 9.863686] [<00000000d3106176>] try_to_wake_up+0x316/0x620
[ 9.863688] ---[ end trace 0000000000000000 ]---

ptrace_unfreeze_traced() is:

static void ptrace_unfreeze_traced(struct task_struct *task)
{
unsigned long flags;

/*
* The child may be awake and may have cleared
* JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
* not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
*/
if (lock_task_sighand(task, &flags)) {
task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
if (__fatal_signal_pending(task)) {
task->jobctl &= ~TASK_TRACED;

Looking at this, shouldn't the line above read task->jobctl &= ~JOBCTL_TRACED?

wake_up_state(task, __TASK_TRACED);
}
unlock_task_sighand(task, &flags);
}
}

2022-07-06 10:19:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Wed, Jul 06, 2022 at 11:27:05AM +0200, Sven Schnelle wrote:
> Peter Zijlstra <[email protected]> writes:
>
> > On Wed, Jul 06, 2022 at 09:58:55AM +0200, Sven Schnelle wrote:
> >
> >> >> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?
> >> >
> >> > Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
> >> > __TASK_TRACED for now. Set when __TASK_TRACED, cleared when
> >> > TASK_RUNNING.
> >> >
> >> > Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
> >> > when they would wake a __TASK_TRACED task.
> >>
> >> try_to_wake_up() clears TASK_TRACED in this case because a signal
> >> (SIGKILL) has to be delivered. As a test I put the following change
> >> on top, and it "fixes" the problem:
> >>
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index da0bf6fe9ecd..f2e0f5e70e77 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -4141,6 +4149,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> >> * TASK_WAKING such that we can unlock p->pi_lock before doing the
> >> * enqueue, such as ttwu_queue_wakelist().
> >> */
> >> + if (p->__state & TASK_TRACED)
> >> + trace_printk("clearing TASK_TRACED 2\n");
> >> + p->jobctl &= ~JOBCTL_TRACED;
> >> WRITE_ONCE(p->__state, TASK_WAKING);
> >>
> >> /*
> >>
> >> There are several places where the state is changed from TASK_TRACED to
> >> something else without clearing JOBCTL_TRACED.
> >
> > I'm having difficulty spotting them; I find:
> >
> > TASK_WAKEKILL: signal_wake_up()
> > __TASK_TRACED: ptrace_signal_wake_up(), ptrace_unfreeze_traced(), ptrace_resume()
> >
> > And all those sites dutifully clear JOBCTL_TRACED.
> >
> > I'd be most interested in the calstack for the 'clearing TASK_TRACED 2'
> > events to see where we miss a spot.
>
> The calltrace is:
> [ 9.863613] Call Trace:
> [ 9.863616] [<00000000d3105f0e>] try_to_wake_up+0xae/0x620
> [ 9.863620] ([<00000000d3106164>] try_to_wake_up+0x304/0x620)
> [ 9.863623] [<00000000d30d1e46>] ptrace_unfreeze_traced+0x9e/0xa8
> [ 9.863629] [<00000000d30d2ef0>] __s390x_sys_ptrace+0xc0/0x160
> [ 9.863633] [<00000000d3c5d8f4>] __do_syscall+0x1d4/0x200
> [ 9.863678] [<00000000d3c6c332>] system_call+0x82/0xb0
> [ 9.863685] Last Breaking-Event-Address:
> [ 9.863686] [<00000000d3106176>] try_to_wake_up+0x316/0x620
> [ 9.863688] ---[ end trace 0000000000000000 ]---
>
> ptrace_unfreeze_traced() is:
>
> static void ptrace_unfreeze_traced(struct task_struct *task)
> {
> unsigned long flags;
>
> /*
> * The child may be awake and may have cleared
> * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
> * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
> */
> if (lock_task_sighand(task, &flags)) {
> task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
> if (__fatal_signal_pending(task)) {
> task->jobctl &= ~TASK_TRACED;
>
> Looking at this, shouldn't the line above read task->jobctl &= ~JOBCTL_TRACED?

YES! Absolutely.

> wake_up_state(task, __TASK_TRACED);
> }
> unlock_task_sighand(task, &flags);
> }
> }