2022-04-22 20:04:25

by Peter Zijlstra

Subject: [PATCH v2 1/5] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Currently ptrace_stop() / do_signal_stop() rely on the special states
TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
state exists only in task->__state and nowhere else.

There's two spots of bother with this:

- PREEMPT_RT has task->saved_state which complicates matters,
  meaning task_is_{traced,stopped}() needs to check an additional
  variable.

- An alternative freezer implementation that itself relies on a
  special TASK state would lose TASK_TRACED/TASK_STOPPED and will
  result in misbehaviour.

As such, add additional state to task->jobctl to track this state
outside of task->__state.

NOTE: this doesn't actually fix anything yet, just adds extra state.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 8 +++-----
include/linux/sched/jobctl.h | 6 ++++++
include/linux/sched/signal.h | 5 ++++-
kernel/ptrace.c | 26 +++++++++++++++-----------
kernel/signal.c | 16 ++++++++++++----
5 files changed, 40 insertions(+), 21 deletions(-)
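
Roughly, the invariant being introduced can be sketched like this (an
illustrative sketch only, not part of the diff; the helper names below are
made up). Both sides are serialized by ->sighand->siglock:

/* Entering the special state: record it in ->jobctl as well as ->__state. */
static void sketch_enter_stopped(void)
{
        /* caller holds current->sighand->siglock */
        current->jobctl |= JOBCTL_STOPPED;      /* or JOBCTL_TRACED */
        set_special_state(TASK_STOPPED);        /* or TASK_TRACED */
}

/* Leaving it: whoever issues the wakeup clears the ->jobctl bit again. */
static void sketch_resume(struct task_struct *t)
{
        /* caller holds t->sighand->siglock */
        t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
        wake_up_state(t, __TASK_STOPPED | __TASK_TRACED);
}

With that, task_is_stopped()/task_is_traced() can be answered from ->jobctl
alone, independent of ->__state (and of PREEMPT_RT's ->saved_state).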

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,11 +118,9 @@ struct task_group;

#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)

-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-
-#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
-
-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
+#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
* Special states are those that do not use the normal wait-loop pattern. See
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -20,6 +20,9 @@ struct task_struct;
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */

+#define JOBCTL_STOPPED_BIT 24 /* do_signal_stop() */
+#define JOBCTL_TRACED_BIT 25 /* ptrace_stop() */
+
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
@@ -29,6 +32,9 @@ struct task_struct;
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)

+#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
+#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
+
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)

--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(
static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
-       if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+       if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
+               current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
+       }
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
@@ -439,6 +441,7 @@ static inline void signal_wake_up(struct
{
        signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}
+
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(st
        return true;
}

-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
        bool ret = false;
@@ -218,9 +223,10 @@ static void ptrace_unfreeze_traced(struc
         */
        spin_lock_irq(&task->sighand->siglock);
        if (READ_ONCE(task->__state) == __TASK_TRACED) {
-               if (__fatal_signal_pending(task))
+               if (__fatal_signal_pending(task)) {
+                       task->jobctl &= ~JOBCTL_TRACED;
                        wake_up_state(task, __TASK_TRACED);
-               else
+               } else
                        WRITE_ONCE(task->__state, TASK_TRACED);
        }
        spin_unlock_irq(&task->sighand->siglock);
@@ -475,8 +481,10 @@ static int ptrace_attach(struct task_str
         * in and out of STOPPED are protected by siglock.
         */
        if (task_is_stopped(task) &&
-           task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+           task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+               task->jobctl &= ~JOBCTL_STOPPED;
                signal_wake_up_state(task, __TASK_STOPPED);
+       }

        spin_unlock(&task->sighand->siglock);

@@ -850,8 +858,6 @@ static long ptrace_get_rseq_configuratio
static int ptrace_resume(struct task_struct *child, long request,
                         unsigned long data)
{
-       bool need_siglock;
-
        if (!valid_signal(data))
                return -EIO;

@@ -892,13 +898,11 @@ static int ptrace_resume(struct task_str
         * status and clears the code too; this can't race with the tracee, it
         * takes siglock after resume.
         */
-       need_siglock = data && !thread_group_empty(current);
-       if (need_siglock)
-               spin_lock_irq(&child->sighand->siglock);
+       spin_lock_irq(&child->sighand->siglock);
        child->exit_code = data;
+       child->jobctl &= ~JOBCTL_TRACED;
        wake_up_state(child, __TASK_TRACED);
-       if (need_siglock)
-               spin_unlock_irq(&child->sighand->siglock);
+       spin_unlock_irq(&child->sighand->siglock);

        return 0;
}
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -762,7 +762,10 @@ static int dequeue_synchronous_signal(ke
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
+       lockdep_assert_held(&t->sighand->siglock);
+
        set_tsk_thread_flag(t, TIF_SIGPENDING);
+
        /*
         * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
         * case. We don't check t->state here because there is a race with it
@@ -770,7 +773,9 @@ void signal_wake_up_state(struct task_st
         * By using wake_up_state, we ensure the process will wake up and
         * handle its death signal.
         */
-       if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
+       if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
+               t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+       else
                kick_process(t);
}

@@ -884,7 +889,7 @@ static int check_kill_permission(int sig
static void ptrace_trap_notify(struct task_struct *t)
{
        WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
-       assert_spin_locked(&t->sighand->siglock);
+       lockdep_assert_held(&t->sighand->siglock);

        task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
        ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -930,9 +935,10 @@ static bool prepare_signal(int sig, stru
                for_each_thread(p, t) {
                        flush_sigqueue_mask(&flush, &t->pending);
                        task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
-                       if (likely(!(t->ptrace & PT_SEIZED)))
+                       if (likely(!(t->ptrace & PT_SEIZED))) {
+                               t->jobctl &= ~JOBCTL_STOPPED;
                                wake_up_state(t, __TASK_STOPPED);
-                       else
+                       } else
                                ptrace_trap_notify(t);
                }

@@ -2219,6 +2225,7 @@ static int ptrace_stop(int exit_code, in
         * schedule() will not sleep if there is a pending signal that
         * can awaken the task.
         */
+       current->jobctl |= JOBCTL_TRACED;
        set_special_state(TASK_TRACED);

        /*
@@ -2460,6 +2467,7 @@ static bool do_signal_stop(int signr)
                if (task_participate_group_stop(current))
                        notify = CLD_STOPPED;

+               current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
                spin_unlock_irq(&current->sighand->siglock);
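
For the PREEMPT_RT point in the changelog: with the state kept only in
->__state, a task sleeping on an rtmutex can have the special state stashed
in ->saved_state, so a robust check has to look in two places; with the
jobctl bits it is a single load. A rough sketch of the two forms follows
(hypothetical helper names; not the actual PREEMPT_RT code):

/* Hypothetical: the kind of double check a __state based helper would need
 * on PREEMPT_RT (sketch only, not the real implementation). */
static inline bool rt_task_is_traced(struct task_struct *task)
{
        return (READ_ONCE(task->__state) & __TASK_TRACED) ||
               (READ_ONCE(task->saved_state) & __TASK_TRACED);
}

/* With this patch the same question is answered from ->jobctl alone: */
static inline bool jobctl_task_is_traced(struct task_struct *task)
{
        return (READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0;
}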




2022-04-27 11:35:23

by Eric W. Biederman

Subject: Re: [PATCH v2 1/5] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Peter Zijlstra <[email protected]> writes:

> Currently ptrace_stop() / do_signal_stop() rely on the special states
> TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
> state exists only in task->__state and nowhere else.
>
> There's two spots of bother with this:
>
> - PREEMPT_RT has task->saved_state which complicates matters,
> meaning task_is_{traced,stopped}() needs to check an additional
> variable.
>
> - An alternative freezer implementation that itself relies on a
> special TASK state would lose TASK_TRACED/TASK_STOPPED and will
> result in misbehaviour.
>
> As such, add additional state to task->jobctl to track this state
> outside of task->__state.
>
> NOTE: this doesn't actually fix anything yet, just adds extra state.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -770,7 +773,9 @@ void signal_wake_up_state(struct task_st
>          * By using wake_up_state, we ensure the process will wake up and
>          * handle its death signal.
>          */
> -       if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
> +       if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
> +               t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
> +       else
>                 kick_process(t);
> }

This hunk is subtle and I don't think it is actually what we want if the
code is going to be robust against tsk->__state becoming TASK_FROZEN.

I think we want the clearing of JOBCTL_STOPPED and JOBCTL_TRACED
to be independent of what tsk->__state and tsk->saved_state are.

Something like:

static inline void signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;
        if (resume && !(t->jobctl & JOBCTL_DELAY_WAKEKILL)) {
                t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
                state = TASK_WAKEKILL;
        }
        signal_wake_up_state(t, state);
}

static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;
        if (resume) {
                t->jobctl &= ~JOBCTL_TRACED;
                state = __TASK_TRACED;
        }
        signal_wake_up_state(t, state);
}

That would allow __set_task_special in the final patch to look like:

/*
 * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
 * state in p->jobctl. If either of them got a wakeup that was missed because
 * TASK_FROZEN, then their canonical state reflects that and the below will
 * refuse to restore the special state and instead issue the wakeup.
 */
static int __set_task_special(struct task_struct *p, void *arg)
{
        unsigned int state = 0;

        if (p->jobctl & JOBCTL_TRACED)
                state = TASK_TRACED;

        else if (p->jobctl & JOBCTL_STOPPED)
                state = TASK_STOPPED;

        if (state)
                WRITE_ONCE(p->__state, state);

        return state;
}


With no need to figure out if a wake_up was dropped and reverse engineer
what the wakeup was.
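
Presumably the thaw path of the new freezer would then consume this via
task_call_func(), along the lines of the sketch below (a hypothetical caller
written only to illustrate the ->jobctl-based decision; it assumes the
series' TASK_FROZEN state and is not the actual freezer code):

/* Hypothetical thaw-side usage, for illustration only: either restore the
 * special state recorded in ->jobctl, or deliver the wakeup the task would
 * otherwise have received while it was frozen. */
static void sketch_thaw(struct task_struct *p)
{
        if (!task_call_func(p, __set_task_special, NULL))
                wake_up_state(p, TASK_FROZEN);
}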

Eric

2022-05-02 08:02:10

by Peter Zijlstra

Subject: Re: [PATCH v2 1/5] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

On Tue, Apr 26, 2022 at 06:34:09PM -0500, Eric W. Biederman wrote:
> Peter Zijlstra <[email protected]> writes:
>
> > Currently ptrace_stop() / do_signal_stop() rely on the special states
> > TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
> > state exists only in task->__state and nowhere else.
> >
> > There's two spots of bother with this:
> >
> > - PREEMPT_RT has task->saved_state which complicates matters,
> > meaning task_is_{traced,stopped}() needs to check an additional
> > variable.
> >
> > - An alternative freezer implementation that itself relies on a
> > special TASK state would lose TASK_TRACED/TASK_STOPPED and will
> > result in misbehaviour.
> >
> > As such, add additional state to task->jobctl to track this state
> > outside of task->__state.
> >
> > NOTE: this doesn't actually fix anything yet, just adds extra state.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> > --- a/kernel/signal.c
> > +++ b/kernel/signal.c
> > @@ -770,7 +773,9 @@ void signal_wake_up_state(struct task_st
> >          * By using wake_up_state, we ensure the process will wake up and
> >          * handle its death signal.
> >          */
> > -       if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
> > +       if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
> > +               t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
> > +       else
> >                 kick_process(t);
> > }
>
> This hunk is subtle and I don't think it is actually what we want if the
> code is going to be robust against tsk->__state becoming TASK_FROZEN.

Oooh, indeed. Yes, let me go back to that resume based thing as you
suggest.

But first, let me go read all your patches :-)