LinuxLists.cc - [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

2022-05-06 06:34:27

Subject: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

From: Peter Zijlstra <[email protected]>

Currently ptrace_stop() / do_signal_stop() rely on the special states
TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
state exists only in task->__state and nowhere else.

There are two spots of bother with this:

- PREEMPT_RT has task->saved_state which complicates matters,
meaning task_is_{traced,stopped}() needs to check an additional
variable.

- An alternative freezer implementation that itself relies on a
special TASK state would lose TASK_TRACED/TASK_STOPPED and would
result in misbehaviour.

As such, add additional state to task->jobctl to track this state
outside of task->__state.

NOTE: this doesn't actually fix anything yet, just adds extra state.

--EWB
* Didn't add an unnecessary newline in signal.h
* Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
instead of in signal_wake_up_state. This prevents the clearing
of TASK_STOPPED and TASK_TRACED from getting lost.
* Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/sched.h | 8 +++-----
include/linux/sched/jobctl.h | 6 ++++++
include/linux/sched/signal.h | 19 +++++++++++++++----
kernel/ptrace.c | 16 +++++++++++++---
kernel/signal.c | 10 ++++++++--
5 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 610f2fdb1e2c..cbe5c899599c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;

#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)

-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
+#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
* Special states are those that do not use the normal wait-loop pattern. See
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index d556c3425963..68876d0a7ef9 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -21,6 +21,9 @@ struct task_struct;
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */

+#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */
+#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */
+
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
@@ -31,6 +34,9 @@ struct task_struct;
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT)

+#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
+
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e66948abbee4..07ba3404fcde 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void)
static inline void kernel_signal_stop(void)
{
spin_lock_irq(&current->sighand->siglock);
- if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+ if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
+ }
spin_unlock_irq(&current->sighand->siglock);

schedule();
@@ -437,12 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
- fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN);
- signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0);
+ unsigned int state = 0;
+ if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) {
+ t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+ state = TASK_WAKEKILL | __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
- signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
+ unsigned int state = 0;
+ if (resume) {
+ t->jobctl &= ~JOBCTL_TRACED;
+ state = __TASK_TRACED;
+ }
+ signal_wake_up_state(t, state);
}

void task_join_group_stop(struct task_struct *task);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 36a5b7a00d2f..328a34a99124 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
return true;
}

-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -216,8 +221,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
*/
if (lock_task_sighand(task, &flags)) {
task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
- if (__fatal_signal_pending(task))
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~TASK_TRACED;
wake_up_state(task, __TASK_TRACED);
+ }
unlock_task_sighand(task, &flags);
}
}
@@ -462,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request,
* in and out of STOPPED are protected by siglock.
*/
if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
signal_wake_up_state(task, __TASK_STOPPED);
+ }

spin_unlock(&task->sighand->siglock);

@@ -875,6 +884,7 @@ static int ptrace_resume(struct task_struct *child, long request,
*/
spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
spin_unlock_irq(&child->sighand->siglock);

diff --git a/kernel/signal.c b/kernel/signal.c
index a58b68a2d3c6..e782c2611b64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -762,7 +762,10 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info)
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+ lockdep_assert_held(&t->sighand->siglock);
+
set_tsk_thread_flag(t, TIF_SIGPENDING);
+
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
@@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
for_each_thread(p, t) {
flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- if (likely(!(t->ptrace & PT_SEIZED)))
+ if (likely(!(t->ptrace & PT_SEIZED))) {
+ t->jobctl &= ~JOBCTL_STOPPED;
wake_up_state(t, __TASK_STOPPED);
- else
+ } else
ptrace_trap_notify(t);
}

@@ -2218,6 +2222,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
return exit_code;

set_special_state(TASK_TRACED);
+ current->jobctl |= JOBCTL_TRACED;

/*
* We're committing to trapping. TRACED should be visible before
@@ -2436,6 +2441,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;

+ current->jobctl |= JOBCTL_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);

--
2.35.3

2022-06-21 13:17:36

Steven Rostedt <[email protected]> writes:

> On Tue, 28 Jun 2022 17:42:22 -0500
> "Eric W. Biederman" <[email protected]> wrote:
>
>> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
>> index 156a99283b11..cb85bcf84640 100644
>> --- a/kernel/ptrace.c
>> +++ b/kernel/ptrace.c
>> @@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
>> spin_lock_irq(&task->sighand->siglock);
>> if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
>> !__fatal_signal_pending(task)) {
>> + smp_rmb();
>> task->jobctl |= JOBCTL_PTRACE_FROZEN;
>> ret = true;
>> }
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index edb1dc9b00dc..bcd576e9de66 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
>> return exit_code;
>>
>> set_special_state(TASK_TRACED);
>> + smp_wmb();
>> current->jobctl |= JOBCTL_TRACED;
>>
>
> Are not these both done under the sighand->siglock spinlock?
>
> That is, the two paths should already be synchronized, and the memory
> barriers will not help anything inside the locks. The locking should (and
> must) handle all that.

I would presume so too. However, the READ_ONCE that is going astray
does not look like it is honoring that.

So perhaps there is a bug in the s390 spin_lock barriers? Perhaps there
is a subtle detail in the barriers that spin locks provide that we are
overlooking?

I just know the observed behavior is:

- reading tsk->jobctl and seeing JOBCTL_TRACED set.
- reading tsk->__state and seeing TASK_RUNNING.

So unless PREEMPT_RT is enabled on s390, it looks like there is a
barrier problem.

Alexander, do you have PREEMPT_RT enabled on s390? I have been assuming
you don't, but I figure I should ask and make certain, as PREEMPT_RT can
cause this kind of failure.

Eric

2022-06-29 20:36:59

by Alexander Gordeev

[permalink] [raw]

Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Jun 28, 2022 at 10:39:59PM -0500, Eric W. Biederman wrote:
> Steven Rostedt <[email protected]> writes:
>
> > On Tue, 28 Jun 2022 17:42:22 -0500
> > "Eric W. Biederman" <[email protected]> wrote:
> >
> >> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> >> index 156a99283b11..cb85bcf84640 100644
> >> --- a/kernel/ptrace.c
> >> +++ b/kernel/ptrace.c
> >> @@ -202,6 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
> >> spin_lock_irq(&task->sighand->siglock);
> >> if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
> >> !__fatal_signal_pending(task)) {
> >> + smp_rmb();
> >> task->jobctl |= JOBCTL_PTRACE_FROZEN;
> >> ret = true;
> >> }
> >> diff --git a/kernel/signal.c b/kernel/signal.c
> >> index edb1dc9b00dc..bcd576e9de66 100644
> >> --- a/kernel/signal.c
> >> +++ b/kernel/signal.c
> >> @@ -2233,6 +2233,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
> >> return exit_code;
> >>
> >> set_special_state(TASK_TRACED);
> >> + smp_wmb();
> >> current->jobctl |= JOBCTL_TRACED;
> >>
> >
> > Are not these both done under the sighand->siglock spinlock?
> >
> > That is, the two paths should already be synchronized, and the memory
> > barriers will not help anything inside the locks. The locking should (and
> > must) handle all that.
>
> I would presume so to. However the READ_ONCE that is going astray
> does not look like it is honoring that.
>
> So perhaps there is a bug in the s390 spin_lock barriers? Perhaps there
> is a subtle detail in the barriers that spin locks provide that we are
> overlooking?
>
> I just know the observed behavior is:
>
> - reading tsk->jobctl and seeing JOBCTL_TRACED set.
> - reading tsk->__state and seeing TASK_RUNNING.
>
> So unless PREEMPT_RT is enabled on s390. It looks like there is a
> barrier problem.
>
> Alexander do you have PREEMPT_RT enabled on s390? I have been assuming
> you don't but I figure I should ask and make certain as PREEMPT_RT can
> cause this kind of failure.

There is no change with the barriers added.

CONFIG_PREEMPT_RT is disabled and CONFIG_LOCKDEP is enabled (see the attached config).
FWIW, I also added a full barrier:

@@ -271,6 +272,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
if (!ret && !ignore_state) {
unsigned int __state;

+ smp_mb();
WARN_ON_ONCE(!(child->jobctl & JOBCTL_PTRACE_FROZEN));
WARN_ON_ONCE(!(child->jobctl & JOBCTL_TRACED));
__state = READ_ONCE(child->__state);

I have not been able to extract the ftrace ring buffer yet - going to do that.

> Eric

Thanks!

Attachments:

(No filename) (2.70 kB)
config-5.19.0-rc4-08751-g2cf560748ed6 (89.00 kB)
Download all attachments

2022-07-05 14:11:08

[permalink] [raw]

Subject: Re: [PATCH v4 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Wed, Jul 06, 2022 at 11:27:05AM +0200, Sven Schnelle wrote:
> Peter Zijlstra <[email protected]> writes:
>
> > On Wed, Jul 06, 2022 at 09:58:55AM +0200, Sven Schnelle wrote:
> >
> >> >> [ 86.218551] kill_chi-343805 6d.... 79990141us : ptrace_stop: JOBCTL_TRACED already set, state=0 <------ valid combination of flags?
> >> >
> >> > Yeah, that's not supposed to be so. JOBCTL_TRACED is supposed to follow
> >> > __TASK_TRACED for now. Set when __TASK_TRACED, cleared when
> >> > TASK_RUNNING.
> >> >
> >> > Specifically {ptrace_,}signal_wake_up() in signal.h clear JOBCTL_TRACED
> >> > when they would wake a __TASK_TRACED task.
> >>
> >> try_to_wake_up() clears TASK_TRACED in this case because a signal
> >> (SIGKILL) has to be delivered. As a test I put the following change
> >> on top, and it "fixes" the problem:
> >>
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index da0bf6fe9ecd..f2e0f5e70e77 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -4141,6 +4149,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
> >> * TASK_WAKING such that we can unlock p->pi_lock before doing the
> >> * enqueue, such as ttwu_queue_wakelist().
> >> */
> >> + if (p->__state & TASK_TRACED)
> >> + trace_printk("clearing TASK_TRACED 2\n");
> >> + p->jobctl &= ~JOBCTL_TRACED;
> >> WRITE_ONCE(p->__state, TASK_WAKING);
> >>
> >> /*
> >>
> >> There are several places where the state is changed from TASK_TRACED to
> >> something else without clearing JOBCTL_TRACED.
> >
> > I'm having difficulty spotting them; I find:
> >
> > TASK_WAKEKILL: signal_wake_up()
> > __TASK_TRACED: ptrace_signal_wake_up(), ptrace_unfreeze_traced(), ptrace_resume()
> >
> > And all those sites dutifully clear JOBCTL_TRACED.
> >
> > I'd be most interested in the call stack for the 'clearing TASK_TRACED 2'
> > events to see where we miss a spot.
>
> The calltrace is:
> [ 9.863613] Call Trace:
> [ 9.863616] [<00000000d3105f0e>] try_to_wake_up+0xae/0x620
> [ 9.863620] ([<00000000d3106164>] try_to_wake_up+0x304/0x620)
> [ 9.863623] [<00000000d30d1e46>] ptrace_unfreeze_traced+0x9e/0xa8
> [ 9.863629] [<00000000d30d2ef0>] __s390x_sys_ptrace+0xc0/0x160
> [ 9.863633] [<00000000d3c5d8f4>] __do_syscall+0x1d4/0x200
> [ 9.863678] [<00000000d3c6c332>] system_call+0x82/0xb0
> [ 9.863685] Last Breaking-Event-Address:
> [ 9.863686] [<00000000d3106176>] try_to_wake_up+0x316/0x620
> [ 9.863688] ---[ end trace 0000000000000000 ]---
>
> ptrace_unfreeze_traced() is:
>
> static void ptrace_unfreeze_traced(struct task_struct *task)
> {
> unsigned long flags;
>
> /*
> * The child may be awake and may have cleared
> * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
> * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
> */
> if (lock_task_sighand(task, &flags)) {
> task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
> if (__fatal_signal_pending(task)) {
> task->jobctl &= ~TASK_TRACED;
>
> Looking at this, shouldn't the line above read task->jobctl &= ~JOBCTL_TRACED?

YES! Absolutely.

> wake_up_state(task, __TASK_TRACED);
> }
> unlock_task_sighand(task, &flags);
> }
> }