While thinking about the information leaks fixed in 77f6ab8b7768
("don't dump the threads that had been already exiting when zapped.")
I realized the problem is much more general than just coredumps and
exit_mm. We have io_uring threads, PTRACE_EVENT_FORK,
PTRACE_EVENT_VFORK, PTRACE_EVENT_CLONE, PTRACE_EVENT_EXEC and
PTRACE_EVENT_EXIT where ptrace is allowed to access userspace
registers, but on some architectures has not saved them so
they can be modified.
The function alpha_switch_to does something reasonable it saves the
floating point registers and the caller saved registers and switches
to a different thread. Any register the caller is not expected to
save it does not save.
Meanhile the system call entry point on alpha also does something
reasonable. The system call entry point saves all but the caller
saved integer registers and doesn't touch the floating point registers
as the kernel code does not touch them.
This is a nice happy fast path until the kernel wants to access the
user space's registers through ptrace or similar. As user spaces's
caller saved registers may be saved at an unpredictable point in the
kernel code's stack, the routine which may stop and make the userspace
registers available must be wrapped by code that will first save a
switch stack frame at the bottom of the call stack, call the code that
may access those registers and then pop the switch stack frame.
The practical problem with this code structure is that this results in
a game of whack-a-mole wrapping different kernel system calls. Loosing
the game of whack-a-mole results in a security hole where userspace can
write arbitrary data to the kernel stack.
In general it is not possible to prevent generic code introducing a
ptrace_stop or register access not knowing alpha's limitations, that
where alpha does not make all of the registers avaliable.
Prevent security holes by recording when all of the registers are
available so generic code changes do not result in security holes
on alpha.
Cc: [email protected]
Fixes: dbe1bdbb39db ("io_uring: handle signals for IO threads like a normal thread")
Fixes: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Fixes: a0691b116f6a ("Add new ptrace event tracing mechanism")
History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Signed-off-by: "Eric W. Biederman" <[email protected]>
---
arch/alpha/include/asm/thread_info.h | 2 ++
arch/alpha/kernel/entry.S | 38 ++++++++++++++++++++++------
arch/alpha/kernel/ptrace.c | 13 ++++++++--
3 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 2592356e3215..41e5986ed9c8 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -63,6 +63,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SYSCALL_AUDIT 4 /* syscall audit active */
#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */
+#define TIF_ALLREGS_SAVED 6 /* both pt_regs and switch_stack saved */
#define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */
#define TIF_MEMDIE 13 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */
@@ -73,6 +74,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
+#define _TIF_ALLREGS_SAVED (1<<TIF_ALLREGS_SAVED)
#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
/* Work to do on interrupt/exception return. */
diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S
index e227f3a29a43..c1edf54dc035 100644
--- a/arch/alpha/kernel/entry.S
+++ b/arch/alpha/kernel/entry.S
@@ -174,6 +174,28 @@
.cfi_adjust_cfa_offset -SWITCH_STACK_SIZE
.endm
+.macro SAVE_SWITCH_STACK
+ DO_SWITCH_STACK
+1: ldl_l $1, TI_FLAGS($8)
+ bis $1, _TIF_ALLREGS_SAVED, $1
+ stl_c $1, TI_FLAGS($8)
+ beq $1, 2f
+.subsection 2
+2: br 1b
+.previous
+.endm
+
+.macro RESTORE_SWITCH_STACK
+1: ldl_l $1, TI_FLAGS($8)
+ bic $1, _TIF_ALLREGS_SAVED, $1
+ stl_c $1, TI_FLAGS($8)
+ beq $1, 2f
+.subsection 2
+2: br 1b
+.previous
+ UNDO_SWITCH_STACK
+.endm
+
/*
* Non-syscall kernel entry points.
*/
@@ -559,9 +581,9 @@ $work_resched:
$work_notifysig:
mov $sp, $16
- DO_SWITCH_STACK
+ SAVE_SWITCH_STACK
jsr $26, do_work_pending
- UNDO_SWITCH_STACK
+ RESTORE_SWITCH_STACK
br restore_all
/*
@@ -572,9 +594,9 @@ $work_notifysig:
.type strace, @function
strace:
/* set up signal stack, call syscall_trace */
- DO_SWITCH_STACK
+ SAVE_SWITCH_STACK
jsr $26, syscall_trace_enter /* returns the syscall number */
- UNDO_SWITCH_STACK
+ RESTORE_SWITCH_STACK
/* get the arguments back.. */
ldq $16, SP_OFF+24($sp)
@@ -602,9 +624,9 @@ ret_from_straced:
$strace_success:
stq $0, 0($sp) /* save return value */
- DO_SWITCH_STACK
+ SAVE_SWITCH_STACK
jsr $26, syscall_trace_leave
- UNDO_SWITCH_STACK
+ RESTORE_SWITCH_STACK
br $31, ret_from_sys_call
.align 3
@@ -618,13 +640,13 @@ $strace_error:
stq $0, 0($sp)
stq $1, 72($sp) /* a3 for return */
- DO_SWITCH_STACK
+ SAVE_SWITCH_STACK
mov $18, $9 /* save old syscall number */
mov $19, $10 /* save old a3 */
jsr $26, syscall_trace_leave
mov $9, $18
mov $10, $19
- UNDO_SWITCH_STACK
+ RESTORE_SWITCH_STACK
mov $31, $26 /* tell "ret_from_sys_call" we can restart */
br ret_from_sys_call
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 8c43212ae38e..41fb994f36dc 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -117,7 +117,13 @@ get_reg_addr(struct task_struct * task, unsigned long regno)
zero = 0;
addr = &zero;
} else {
- addr = task_stack_page(task) + regoff[regno];
+ int off = regoff[regno];
+ if (WARN_ON_ONCE((off < PT_REG(r0)) &&
+ !test_ti_thread_flag(task_thread_info(task),
+ TIF_ALLREGS_SAVED)))
+ addr = &zero;
+ else
+ addr = task_stack_page(task) + off;
}
return addr;
}
@@ -145,13 +151,16 @@ get_reg(struct task_struct * task, unsigned long regno)
static int
put_reg(struct task_struct *task, unsigned long regno, unsigned long data)
{
+ unsigned long *addr;
if (regno == 63) {
task_thread_info(task)->ieee_state
= ((task_thread_info(task)->ieee_state & ~IEEE_SW_MASK)
| (data & IEEE_SW_MASK));
data = (data & FPCR_DYN_MASK) | ieee_swcr_to_fpcr(data);
}
- *get_reg_addr(task, regno) = data;
+ addr = get_reg_addr(task, regno);
+ if (addr != &zero)
+ *addr = data;
return 0;
}
--
2.20.1
Hi Eric,
instrumenting get_reg on m68k and using a similar patch to yours to warn
when unsaved registers are accessed on the switch stack, I get a hit
from getegid and getegid32, just by running a simple ptrace on ls.
Going to wack those two moles now ...
Cheers,
Michael
On 17/06/21 6:31 am, Eric W. Biederman wrote:
> While thinking about the information leaks fixed in 77f6ab8b7768
> ("don't dump the threads that had been already exiting when zapped.")
> I realized the problem is much more general than just coredumps and
> exit_mm. We have io_uring threads, PTRACE_EVENT_FORK,
> PTRACE_EVENT_VFORK, PTRACE_EVENT_CLONE, PTRACE_EVENT_EXEC and
> PTRACE_EVENT_EXIT where ptrace is allowed to access userspace
> registers, but on some architectures has not saved them so
> they can be modified.
>
> The function alpha_switch_to does something reasonable it saves the
> floating point registers and the caller saved registers and switches
> to a different thread. Any register the caller is not expected to
> save it does not save.
>
> Meanhile the system call entry point on alpha also does something
> reasonable. The system call entry point saves all but the caller
> saved integer registers and doesn't touch the floating point registers
> as the kernel code does not touch them.
>
> This is a nice happy fast path until the kernel wants to access the
> user space's registers through ptrace or similar. As user spaces's
> caller saved registers may be saved at an unpredictable point in the
> kernel code's stack, the routine which may stop and make the userspace
> registers available must be wrapped by code that will first save a
> switch stack frame at the bottom of the call stack, call the code that
> may access those registers and then pop the switch stack frame.
>
> The practical problem with this code structure is that this results in
> a game of whack-a-mole wrapping different kernel system calls. Loosing
> the game of whack-a-mole results in a security hole where userspace can
> write arbitrary data to the kernel stack.
>
> In general it is not possible to prevent generic code introducing a
> ptrace_stop or register access not knowing alpha's limitations, that
> where alpha does not make all of the registers avaliable.
>
> Prevent security holes by recording when all of the registers are
> available so generic code changes do not result in security holes
> on alpha.
>
> Cc: [email protected]
> Fixes: dbe1bdbb39db ("io_uring: handle signals for IO threads like a normal thread")
> Fixes: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
> Fixes: a0691b116f6a ("Add new ptrace event tracing mechanism")
> History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---
> arch/alpha/include/asm/thread_info.h | 2 ++
> arch/alpha/kernel/entry.S | 38 ++++++++++++++++++++++------
> arch/alpha/kernel/ptrace.c | 13 ++++++++--
> 3 files changed, 43 insertions(+), 10 deletions(-)
>
> diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
> index 2592356e3215..41e5986ed9c8 100644
> --- a/arch/alpha/include/asm/thread_info.h
> +++ b/arch/alpha/include/asm/thread_info.h
> @@ -63,6 +63,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
> #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
> #define TIF_SYSCALL_AUDIT 4 /* syscall audit active */
> #define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */
> +#define TIF_ALLREGS_SAVED 6 /* both pt_regs and switch_stack saved */
> #define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */
> #define TIF_MEMDIE 13 /* is terminating due to OOM killer */
> #define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */
> @@ -73,6 +74,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
> #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
> #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
> #define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
> +#define _TIF_ALLREGS_SAVED (1<<TIF_ALLREGS_SAVED)
> #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
>
> /* Work to do on interrupt/exception return. */
> diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S
> index e227f3a29a43..c1edf54dc035 100644
> --- a/arch/alpha/kernel/entry.S
> +++ b/arch/alpha/kernel/entry.S
> @@ -174,6 +174,28 @@
> .cfi_adjust_cfa_offset -SWITCH_STACK_SIZE
> .endm
>
> +.macro SAVE_SWITCH_STACK
> + DO_SWITCH_STACK
> +1: ldl_l $1, TI_FLAGS($8)
> + bis $1, _TIF_ALLREGS_SAVED, $1
> + stl_c $1, TI_FLAGS($8)
> + beq $1, 2f
> +.subsection 2
> +2: br 1b
> +.previous
> +.endm
> +
> +.macro RESTORE_SWITCH_STACK
> +1: ldl_l $1, TI_FLAGS($8)
> + bic $1, _TIF_ALLREGS_SAVED, $1
> + stl_c $1, TI_FLAGS($8)
> + beq $1, 2f
> +.subsection 2
> +2: br 1b
> +.previous
> + UNDO_SWITCH_STACK
> +.endm
> +
> /*
> * Non-syscall kernel entry points.
> */
> @@ -559,9 +581,9 @@ $work_resched:
>
> $work_notifysig:
> mov $sp, $16
> - DO_SWITCH_STACK
> + SAVE_SWITCH_STACK
> jsr $26, do_work_pending
> - UNDO_SWITCH_STACK
> + RESTORE_SWITCH_STACK
> br restore_all
>
> /*
> @@ -572,9 +594,9 @@ $work_notifysig:
> .type strace, @function
> strace:
> /* set up signal stack, call syscall_trace */
> - DO_SWITCH_STACK
> + SAVE_SWITCH_STACK
> jsr $26, syscall_trace_enter /* returns the syscall number */
> - UNDO_SWITCH_STACK
> + RESTORE_SWITCH_STACK
>
> /* get the arguments back.. */
> ldq $16, SP_OFF+24($sp)
> @@ -602,9 +624,9 @@ ret_from_straced:
> $strace_success:
> stq $0, 0($sp) /* save return value */
>
> - DO_SWITCH_STACK
> + SAVE_SWITCH_STACK
> jsr $26, syscall_trace_leave
> - UNDO_SWITCH_STACK
> + RESTORE_SWITCH_STACK
> br $31, ret_from_sys_call
>
> .align 3
> @@ -618,13 +640,13 @@ $strace_error:
> stq $0, 0($sp)
> stq $1, 72($sp) /* a3 for return */
>
> - DO_SWITCH_STACK
> + SAVE_SWITCH_STACK
> mov $18, $9 /* save old syscall number */
> mov $19, $10 /* save old a3 */
> jsr $26, syscall_trace_leave
> mov $9, $18
> mov $10, $19
> - UNDO_SWITCH_STACK
> + RESTORE_SWITCH_STACK
>
> mov $31, $26 /* tell "ret_from_sys_call" we can restart */
> br ret_from_sys_call
> diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
> index 8c43212ae38e..41fb994f36dc 100644
> --- a/arch/alpha/kernel/ptrace.c
> +++ b/arch/alpha/kernel/ptrace.c
> @@ -117,7 +117,13 @@ get_reg_addr(struct task_struct * task, unsigned long regno)
> zero = 0;
> addr = &zero;
> } else {
> - addr = task_stack_page(task) + regoff[regno];
> + int off = regoff[regno];
> + if (WARN_ON_ONCE((off < PT_REG(r0)) &&
> + !test_ti_thread_flag(task_thread_info(task),
> + TIF_ALLREGS_SAVED)))
> + addr = &zero;
> + else
> + addr = task_stack_page(task) + off;
> }
> return addr;
> }
> @@ -145,13 +151,16 @@ get_reg(struct task_struct * task, unsigned long regno)
> static int
> put_reg(struct task_struct *task, unsigned long regno, unsigned long data)
> {
> + unsigned long *addr;
> if (regno == 63) {
> task_thread_info(task)->ieee_state
> = ((task_thread_info(task)->ieee_state & ~IEEE_SW_MASK)
> | (data & IEEE_SW_MASK));
> data = (data & FPCR_DYN_MASK) | ieee_swcr_to_fpcr(data);
> }
> - *get_reg_addr(task, regno) = data;
> + addr = get_reg_addr(task, regno);
> + if (addr != &zero)
> + *addr = data;
> return 0;
> }
>
On Sun, Jun 20, 2021 at 7:01 PM Michael Schmitz <[email protected]> wrote:
>
> instrumenting get_reg on m68k and using a similar patch to yours to warn
> when unsaved registers are accessed on the switch stack, I get a hit
> from getegid and getegid32, just by running a simple ptrace on ls.
>
> Going to wack those two moles now ...
I don't see what's going on. Those system calls don't use the register
state, afaik. What's the call chain, exactly?
Linus
On Mon, Jun 21, 2021 at 02:01:18PM +1200, Michael Schmitz wrote:
> Hi Eric,
>
> instrumenting get_reg on m68k and using a similar patch to yours to warn
> when unsaved registers are accessed on the switch stack, I get a hit from
> getegid and getegid32, just by running a simple ptrace on ls.
>
> Going to wack those two moles now ...
Explain, please. get_reg() is called by tracer; whose state are you checking?
Because you are *not* accessing the switch stack of the caller of get_reg().
And tracee should be in something like syscall_trace() or do_notify_resume();
both have SAVE_SWITCH_STACK done by the glue...
Hi Linus,
Am 21.06.2021 um 14:17 schrieb Linus Torvalds:
> On Sun, Jun 20, 2021 at 7:01 PM Michael Schmitz <[email protected]> wrote:
>>
>> instrumenting get_reg on m68k and using a similar patch to yours to warn
>> when unsaved registers are accessed on the switch stack, I get a hit
>> from getegid and getegid32, just by running a simple ptrace on ls.
>>
>> Going to wack those two moles now ...
>
> I don't see what's going on. Those system calls don't use the register
> state, afaik. What's the call chain, exactly?
This is what I get from WARN_ONCE:
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1177 at arch/m68k/kernel/ptrace.c:91 get_reg+0x90/0xb8
Modules linked in:
CPU: 0 PID: 1177 Comm: strace Not tainted
5.13.0-rc1-atari-fpuemu-exitfix+ #1146
Stack from 014b7f04:
014b7f04 00336401 00336401 000278f0 0032c015 0000005b 00000005
0002795a
0032c015 0000005b 0000338c 00000009 00000000 00000000 ffffffe4
00000005
00000003 00000014 00000003 00000014 efc2b90c 0000338c 0032c015
0000005b
00000009 00000000 efc2b908 00912540 efc2b908 000034cc 00912540
00000005
00000000 efc2b908 00000003 00912540 8000110c c010b0a4 efc2b90c
0002d1d8
00912540 00000003 00000014 efc2b908 0000049a 00000014 efc2b908
800acaa8
Call Trace: [<000278f0>] __warn+0x9e/0xb4
[<0002795a>] warn_slowpath_fmt+0x54/0x62
[<0000338c>] get_reg+0x90/0xb8
[<0000338c>] get_reg+0x90/0xb8
[<000034cc>] arch_ptrace+0x7e/0x250
[<0002d1d8>] sys_ptrace+0x232/0x2f8
[<00002ab6>] syscall+0x8/0xc
[<0000c00b>] lower+0x7/0x20
---[ end trace ee4be53b94695793 ]---
Syscall numbers are actually 90 and 192 - sys_old_mmap and sys_mmap2 on
m68k. Used the calculator on my Ubuntu desktop, that appears to be a
little confused about hex to decimal conversions.
I hope that makes more sense?
Cheers,
Michael
>
> Linus
>
Hi Al,
Am 21.06.2021 um 14:27 schrieb Al Viro:
> On Mon, Jun 21, 2021 at 02:01:18PM +1200, Michael Schmitz wrote:
>> Hi Eric,
>>
>> instrumenting get_reg on m68k and using a similar patch to yours to warn
>> when unsaved registers are accessed on the switch stack, I get a hit from
>> getegid and getegid32, just by running a simple ptrace on ls.
>>
>> Going to wack those two moles now ...
>
> Explain, please. get_reg() is called by tracer; whose state are you checking?
The check is only triggered when syscall tracing (I set a flag on trace
entry, and clear that on trace exit)... From the WARN_ONCE stack dump,
it appears that I get the warning from inside the syscall, not
syscall_trace().
> Because you are *not* accessing the switch stack of the caller of get_reg().
> And tracee should be in something like syscall_trace() or do_notify_resume();
> both have SAVE_SWITCH_STACK done by the glue...
And that's where my problem may be - I stupidly forgot to set the 'all
registers saved' flag before calling syscall_trace() ...
I'll fix that and try again. Sorry for the noise!
Cheers,
Michael
>
On Sun, Jun 20, 2021 at 8:18 PM Michael Schmitz <[email protected]> wrote:
>
> I hope that makes more sense?
So the problem is in your debug patch: you don't set that
TIS_SWITCH_STACK in nearly enough places.
In this particular example, I think it's that you don't set it in
do_trace_exit, so when you strace the process, the system call exit -
which is where the return value will be picked up - gets that warning.
You did set TIS_SWITCH_STACK on trace_entry, but then it's cleared
again during the system call, and not set at the trace_exit path.
Oddly, your debug patch also _clears_ it on the exit path, but it
doesn't set it when do_trace_exit does the SAVE_SWITCH_STACK.
You oddly also set it for __sys_exit, but not all the other special
system calls that also do that SAVE_SWITCH_STACK.
Really, pretty much every single case of SAVE_SWITCH_STACK would need
to set it. Not just do_trace_enter/exit
It's why I didn't like Eric's debug patch either. It's quite expensive
to do, partly because you look up that curptr thing. All very nasty.
It would be *much* better to make the flag be part of the stack frame,
but sadly at least on alpha we had exported the format of that stack
frame to user space.
Anyway, I think these debug patches are not just expensive but the
m68k one most definitely is also very incomplete.
Linus
On Mon, Jun 21, 2021 at 03:18:35PM +1200, Michael Schmitz wrote:
> This is what I get from WARN_ONCE:
>
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 1177 at arch/m68k/kernel/ptrace.c:91 get_reg+0x90/0xb8
> Modules linked in:
> CPU: 0 PID: 1177 Comm: strace Not tainted 5.13.0-rc1-atari-fpuemu-exitfix+
> #1146
> Stack from 014b7f04:
> 014b7f04 00336401 00336401 000278f0 0032c015 0000005b 00000005
> 0002795a
> 0032c015 0000005b 0000338c 00000009 00000000 00000000 ffffffe4
> 00000005
> 00000003 00000014 00000003 00000014 efc2b90c 0000338c 0032c015
> 0000005b
> 00000009 00000000 efc2b908 00912540 efc2b908 000034cc 00912540
> 00000005
> 00000000 efc2b908 00000003 00912540 8000110c c010b0a4 efc2b90c
> 0002d1d8
> 00912540 00000003 00000014 efc2b908 0000049a 00000014 efc2b908
> 800acaa8
> Call Trace: [<000278f0>] __warn+0x9e/0xb4
> [<0002795a>] warn_slowpath_fmt+0x54/0x62
> [<0000338c>] get_reg+0x90/0xb8
> [<0000338c>] get_reg+0x90/0xb8
> [<000034cc>] arch_ptrace+0x7e/0x250
> [<0002d1d8>] sys_ptrace+0x232/0x2f8
> [<00002ab6>] syscall+0x8/0xc
> [<0000c00b>] lower+0x7/0x20
>
> ---[ end trace ee4be53b94695793 ]---
>
> Syscall numbers are actually 90 and 192 - sys_old_mmap and sys_mmap2 on
> m68k. Used the calculator on my Ubuntu desktop, that appears to be a little
> confused about hex to decimal conversions.
>
> I hope that makes more sense?
Not really; what is the condition you are checking? The interesting trace
is not that with get_reg() - it's that of the process being traced. You
are not accessing the stack of caller of ptrace(2) here, so you want to
know that SAVE_SWITCH_STACK had been done by the tracee, not tracer.
And if that had been strace ls, you have TIF_SYSCALL_TRACE set for ls, so
* ls hits system_call
* notices TIF_SYSCALL_TRACE and goes to do_trace_entry
* does SAVE_SWITCH_STACK there
* calls syscall_trace(), which calls ptrace_notify()
* ptrace_notify() calls ptrace_do_notify(), which calls ptrace_stop()
* ptrace_stop() arranges for tracer to be woken up and gives CPU up,
with TASK_TRACED as process state.
That's the callchain in ls, and switch_stack accessed by get_reg() from
strace is the one on ls(1) stack created by SAVE_SWITCH_STACK.
Hi Linus,
I realized that the patch is still incomplete when answering Al...
Am 21.06.2021 um 15:37 schrieb Linus Torvalds:
> On Sun, Jun 20, 2021 at 8:18 PM Michael Schmitz <[email protected]> wrote:
>>
>> I hope that makes more sense?
>
> So the problem is in your debug patch: you don't set that
> TIS_SWITCH_STACK in nearly enough places.
>
> In this particular example, I think it's that you don't set it in
> do_trace_exit, so when you strace the process, the system call exit -
> which is where the return value will be picked up - gets that warning.
>
> You did set TIS_SWITCH_STACK on trace_entry, but then it's cleared
> again during the system call, and not set at the trace_exit path.
> Oddly, your debug patch also _clears_ it on the exit path, but it
> doesn't set it when do_trace_exit does the SAVE_SWITCH_STACK.
>
> You oddly also set it for __sys_exit, but not all the other special
> system calls that also do that SAVE_SWITCH_STACK.
That's the one I used to test whether my debug patch had any ill side
effects (i.e. smashing the stack) late yesterday. Forgot to add that to
the other cases.
>
> Really, pretty much every single case of SAVE_SWITCH_STACK would need
> to set it. Not just do_trace_enter/exit
Yes - done that now and the warning is gone.
> It's why I didn't like Eric's debug patch either. It's quite expensive
> to do, partly because you look up that curptr thing. All very nasty.
I need to talk to Geert and Andreas to find where register a1 is
preserved, but if I have to reload a1 all the time, this won't be useful
except for debugging.
> It would be *much* better to make the flag be part of the stack frame,
> but sadly at least on alpha we had exported the format of that stack
> frame to user space.
Same on m68k, but can we push a flag _after_ the switch stack?
> Anyway, I think these debug patches are not just expensive but the
> m68k one most definitely is also very incomplete.
Yes, I've seen that in the meantime. Need to triple check my work next
time.
Sorry for the extra noise!
Cheers,
Michael
>
> Linus
>
Hi Al,
Am 21.06.2021 um 15:44 schrieb Al Viro:
> On Mon, Jun 21, 2021 at 03:18:35PM +1200, Michael Schmitz wrote:
>
>> This is what I get from WARN_ONCE:
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 1177 at arch/m68k/kernel/ptrace.c:91 get_reg+0x90/0xb8
>> Modules linked in:
>> CPU: 0 PID: 1177 Comm: strace Not tainted 5.13.0-rc1-atari-fpuemu-exitfix+
>> #1146
>> Stack from 014b7f04:
>> 014b7f04 00336401 00336401 000278f0 0032c015 0000005b 00000005
>> 0002795a
>> 0032c015 0000005b 0000338c 00000009 00000000 00000000 ffffffe4
>> 00000005
>> 00000003 00000014 00000003 00000014 efc2b90c 0000338c 0032c015
>> 0000005b
>> 00000009 00000000 efc2b908 00912540 efc2b908 000034cc 00912540
>> 00000005
>> 00000000 efc2b908 00000003 00912540 8000110c c010b0a4 efc2b90c
>> 0002d1d8
>> 00912540 00000003 00000014 efc2b908 0000049a 00000014 efc2b908
>> 800acaa8
>> Call Trace: [<000278f0>] __warn+0x9e/0xb4
>> [<0002795a>] warn_slowpath_fmt+0x54/0x62
>> [<0000338c>] get_reg+0x90/0xb8
>> [<0000338c>] get_reg+0x90/0xb8
>> [<000034cc>] arch_ptrace+0x7e/0x250
>> [<0002d1d8>] sys_ptrace+0x232/0x2f8
>> [<00002ab6>] syscall+0x8/0xc
>> [<0000c00b>] lower+0x7/0x20
>>
>> ---[ end trace ee4be53b94695793 ]---
>>
>> Syscall numbers are actually 90 and 192 - sys_old_mmap and sys_mmap2 on
>> m68k. Used the calculator on my Ubuntu desktop, that appears to be a little
>> confused about hex to decimal conversions.
>>
>> I hope that makes more sense?
>
> Not really; what is the condition you are checking? The interesting trace
The check in get_reg() is:
if (WARN_ON_ONCE((off < PT_REG(d1)) &&
test_ti_thread_status(task_thread_info(task),TIS_TRACING)
&& !test_ti_thread_status(task_thread_info(task),
TIS_ALLREGS_SAVED))) {
unsigned long *addr_d0;
addr_d0 = (unsigned long *)(task->thread.esp0 +
regoff[16]);
pr_err("get_reg with incomplete stack, regno %d offs
%d orig_d0 %lx\n", regno, off, *addr_d0);
return 0;
}
> is not that with get_reg() - it's that of the process being traced. You
> are not accessing the stack of caller of ptrace(2) here, so you want to
> know that SAVE_SWITCH_STACK had been done by the tracee, not tracer.
>
> And if that had been strace ls, you have TIF_SYSCALL_TRACE set for ls, so
> * ls hits system_call
> * notices TIF_SYSCALL_TRACE and goes to do_trace_entry
> * does SAVE_SWITCH_STACK there
... and sets both the new TIS_TRACING and TIS_ALLREGS_SAVED flags in the
thread_info->status field (now that I've corrected my patch).
> * calls syscall_trace(), which calls ptrace_notify()
> * ptrace_notify() calls ptrace_do_notify(), which calls ptrace_stop()
> * ptrace_stop() arranges for tracer to be woken up and gives CPU up,
> with TASK_TRACED as process state.
Thanks for explaining! So in order to get a trace for the process being
traced, I would have to check the TIS_ALLREGS_SAVED in ptrace_stop()?
> That's the callchain in ls, and switch_stack accessed by get_reg() from
> strace is the one on ls(1) stack created by SAVE_SWITCH_STACK.
So testing for TIS_ALLREGS_SAVED in get_reg() (called by the tracer, but
with the tracee's task struct passed to arch_ptrace()) does check that
SAVE_SWITCH_STACK was done before the syscall in the tracee, right?
Anyway, I'd missed setting the flags for some crucial SAVE_SWITCH_STACK
operations in my woefully incomplete patch. With that corrected, there's
no more warning from mmap. I'll try with a more recent version of strace
and gdb once I've updated my test image.
Cheers,
Michael