2021-02-01 19:45:30

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 0/3 v2] arm64/ptrace: allow to get all registers on syscall traps

Right now, ip/r12 for AArch32 and x7 for AArch64 is used to indicate
whether or not the stop has been signalled from syscall entry or syscall
exit. This means that:

- Any writes by the tracer to this register during the stop are
ignored/discarded.

- The actual value of the register is not available during the stop,
so the tracer cannot save it and restore it later.

For applications like the user-mode Linux or gVisor, it is critical to
have access to the full set of registers in any moment. For example,
they need to change values of all registers to emulate rt_sigreturn or
execve and they need to have the full set of registers to build a signal
frame.

This series introduces the PTRACE_O_ARM64_RAW_REGS option. If it is set,
PTRACE_GETREGSET returns values of all registers, and PTRACE_SETREGSET
allows to change any of them.

Cc: Catalin Marinas <[email protected]>
Cc: Dave Martin <[email protected]>
Cc: Keno Fischer <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Will Deacon <[email protected]>

Andrei Vagin (3):
arm64/ptrace: don't clobber task registers on syscall entry/exit traps
arm64/ptrace: introduce PTRACE_O_ARM64_RAW_REGS
selftest/arm64/ptrace: add tests for PTRACE_O_ARM64_RAW_REGS

v2: use the ptrace option instead of adding a new regset.

arch/arm64/include/asm/ptrace.h | 5 +
arch/arm64/kernel/ptrace.c | 130 +++++++++++-----
include/uapi/linux/elf.h | 1 +
tools/testing/selftests/arm64/Makefile | 2 +-
tools/testing/selftests/arm64/ptrace/Makefile | 6 +
.../arm64/ptrace/ptrace_syscall_regs_test.c | 142 ++++++++++++++++++
6 files changed, 246 insertions(+), 40 deletions(-)
create mode 100644 tools/testing/selftests/arm64/ptrace/Makefile
create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c

--
2.29.2


2021-02-01 19:45:44

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 1/3] arm64/ptrace: don't clobber task registers on syscall entry/exit traps

ip/r12 for AArch32 and x7 for AArch64 is used to indicate whether or not
the stop has been signalled from syscall entry or syscall exit. This
means that:

- Any writes by the tracer to this register during the stop are
ignored/discarded.

- The actual value of the register is not available during the stop,
so the tracer cannot save it and restore it later.

Right now, these registers are clobbered in tracehook_report_syscall.
This change moves the logic to gpr_get and compat_gpr_get where
registers are copied into a user-space buffer.

This will allow to change these registers and to introduce a new
ptrace option to get the full set of registers.

Signed-off-by: Andrei Vagin <[email protected]>
---
arch/arm64/include/asm/ptrace.h | 5 ++
arch/arm64/kernel/ptrace.c | 104 ++++++++++++++++++++------------
2 files changed, 69 insertions(+), 40 deletions(-)

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index e58bca832dff..0a9552b4f61e 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -170,6 +170,11 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
return psr;
}

+enum ptrace_syscall_dir {
+ PTRACE_SYSCALL_ENTER = 0,
+ PTRACE_SYSCALL_EXIT,
+};
+
/*
* This struct defines the way the registers are stored on the stack during an
* exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 8ac487c84e37..39da03104528 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -40,6 +40,7 @@
#include <asm/syscall.h>
#include <asm/traps.h>
#include <asm/system_misc.h>
+#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -561,7 +562,31 @@ static int gpr_get(struct task_struct *target,
struct membuf to)
{
struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs;
- return membuf_write(&to, uregs, sizeof(*uregs));
+ unsigned long saved_reg;
+ int ret;
+
+ /*
+ * We have some ABI weirdness here in the way that we handle syscall
+ * exit stops because we indicate whether or not the stop has been
+ * signalled from syscall entry or syscall exit by clobbering the general
+ * purpose register x7.
+ */
+ saved_reg = uregs->regs[7];
+
+ switch (target->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ uregs->regs[7] = PTRACE_SYSCALL_ENTER;
+ break;
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ uregs->regs[7] = PTRACE_SYSCALL_EXIT;
+ break;
+ }
+
+ ret = membuf_write(&to, uregs, sizeof(*uregs));
+
+ uregs->regs[7] = saved_reg;
+
+ return ret;
}

static int gpr_set(struct task_struct *target, const struct user_regset *regset,
@@ -575,6 +600,17 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;

+ /*
+ * Historically, x7 can't be changed if the stop has been signalled
+ * from syscall-enter of syscall-exit.
+ */
+ switch (target->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ newregs.regs[7] = task_pt_regs(target)->regs[7];
+ break;
+ }
+
if (!valid_user_regs(&newregs, target))
return -EINVAL;

@@ -1206,6 +1242,20 @@ static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int i
struct pt_regs *regs = task_pt_regs(task);

switch (idx) {
+ case 12:
+ /*
+ * We have some ABI weirdness here in the way that we handle
+ * syscall exit stops because we indicate whether or not the
+ * stop has been signalled from syscall entry or syscall exit
+ * by clobbering the general purpose register r12.
+ */
+ switch (task->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ return PTRACE_SYSCALL_ENTER;
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ return PTRACE_SYSCALL_EXIT;
+ }
+ return regs->regs[idx];
case 15:
return regs->pc;
case 16:
@@ -1282,6 +1332,17 @@ static int compat_gpr_set(struct task_struct *target,

}

+ /*
+ * Historically, x12 can't be changed if the stop has been signalled
+ * from syscall-enter of syscall-exit.
+ */
+ switch (target->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ newregs.regs[12] = task_pt_regs(target)->regs[12];
+ break;
+ }
+
if (valid_user_regs(&newregs.user_regs, target))
*task_pt_regs(target) = newregs;
else
@@ -1740,53 +1801,16 @@ long arch_ptrace(struct task_struct *child, long request,
return ptrace_request(child, request, addr, data);
}

-enum ptrace_syscall_dir {
- PTRACE_SYSCALL_ENTER = 0,
- PTRACE_SYSCALL_EXIT,
-};
-
static void tracehook_report_syscall(struct pt_regs *regs,
enum ptrace_syscall_dir dir)
{
- int regno;
- unsigned long saved_reg;
-
- /*
- * We have some ABI weirdness here in the way that we handle syscall
- * exit stops because we indicate whether or not the stop has been
- * signalled from syscall entry or syscall exit by clobbering a general
- * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee
- * and restoring its old value after the stop. This means that:
- *
- * - Any writes by the tracer to this register during the stop are
- * ignored/discarded.
- *
- * - The actual value of the register is not available during the stop,
- * so the tracer cannot save it and restore it later.
- *
- * - Syscall stops behave differently to seccomp and pseudo-step traps
- * (the latter do not nobble any registers).
- */
- regno = (is_compat_task() ? 12 : 7);
- saved_reg = regs->regs[regno];
- regs->regs[regno] = dir;
-
if (dir == PTRACE_SYSCALL_ENTER) {
if (tracehook_report_syscall_entry(regs))
forget_syscall(regs);
- regs->regs[regno] = saved_reg;
- } else if (!test_thread_flag(TIF_SINGLESTEP)) {
- tracehook_report_syscall_exit(regs, 0);
- regs->regs[regno] = saved_reg;
} else {
- regs->regs[regno] = saved_reg;
+ int singlestep = test_thread_flag(TIF_SINGLESTEP);

- /*
- * Signal a pseudo-step exception since we are stepping but
- * tracer modifications to the registers may have rewound the
- * state machine.
- */
- tracehook_report_syscall_exit(regs, 1);
+ tracehook_report_syscall_exit(regs, singlestep);
}
}

--
2.29.2

2021-02-01 19:46:03

by Andrei Vagin

[permalink] [raw]
Subject: [PATCH 3/3] selftest/arm64/ptrace: add tests for PTRACE_O_ARM64_RAW_REGS

Test output:
TAP version 13
1..2
# selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
# 1..2
# ok 1 x7: 686920776f726c64
# ok 2 The child exited with code 0.
# # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
ok 1 selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
# selftests: arm64/ptrace: ptrace_syscall_regs_test
# 1..3
# ok 1 x7: 0
# ok 2 x7: 1
# ok 3 The child exited with code 0.
# # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
ok 2 selftests: arm64/ptrace: ptrace_syscall_regs_test

Signed-off-by: Andrei Vagin <[email protected]>
---
tools/testing/selftests/arm64/Makefile | 2 +-
tools/testing/selftests/arm64/ptrace/Makefile | 6 +
.../ptrace/ptrace_syscall_raw_regs_test.c | 142 +++++++++++++++++
.../arm64/ptrace/ptrace_syscall_regs_test.c | 150 ++++++++++++++++++
4 files changed, 299 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/arm64/ptrace/Makefile
create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c
create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c

diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index 2c9d012797a7..704770a60ece 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
ARCH ?= $(shell uname -m 2>/dev/null || echo not)

ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte
+ARM64_SUBTARGETS ?= tags signal pauth fp mte ptrace
else
ARM64_SUBTARGETS :=
endif
diff --git a/tools/testing/selftests/arm64/ptrace/Makefile b/tools/testing/selftests/arm64/ptrace/Makefile
new file mode 100644
index 000000000000..84b27449f3d1
--- /dev/null
+++ b/tools/testing/selftests/arm64/ptrace/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -g -I../../../../../usr/include/
+TEST_GEN_PROGS := ptrace_syscall_raw_regs_test ptrace_syscall_regs_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c b/tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c
new file mode 100644
index 000000000000..78f913303a99
--- /dev/null
+++ b/tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <linux/elf.h>
+#include <linux/unistd.h>
+
+#include "../../kselftest.h"
+
+#define TEST_VAL 0x686920776f726c64UL
+
+#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) \
+ ({ \
+ ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_fail(fmt, ...) \
+ ({ \
+ ksft_test_result_fail(fmt "\n", ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__)
+
+static long loop(void *val)
+{
+ register long x0 __asm__("x0");
+ register void *x1 __asm__("x1") = val;
+ register long x8 __asm__("x8") = 555;
+
+ __asm__ (
+ "again:\n"
+ "ldr x7, [x1, 0]\n"
+ "svc 0\n"
+ "str x7, [x1, 0]\n"
+ : "=r"(x0)
+ : "r"(x1), "r"(x8)
+ :
+ );
+ return 0;
+}
+
+static int child(void)
+{
+ long val = TEST_VAL;
+
+ loop(&val);
+ if (val != ~TEST_VAL) {
+ ksft_print_msg("Unexpected x7: %lx\n", val);
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifndef PTRACE_SYSEMU
+#define PTRACE_SYSEMU 31
+#endif
+
+#ifndef PTRACE_O_ARM64_RAW_REGS
+#define PTRACE_O_ARM64_RAW_REGS (1 << 28)
+#endif
+
+int main(int argc, void **argv)
+{
+ struct user_regs_struct regs = {};
+ struct iovec iov = {
+ .iov_base = &regs,
+ .iov_len = sizeof(struct user_regs_struct),
+ };
+ int status;
+ pid_t pid;
+
+ ksft_set_plan(2);
+
+ pid = fork();
+ if (pid == 0) {
+ kill(getpid(), SIGSTOP);
+ child();
+ _exit(0);
+ }
+ if (pid < 0)
+ return 1;
+
+ if (ptrace(PTRACE_ATTACH, pid, 0, 0))
+ return pr_perror("Can't attach to the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+ if (ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_ARM64_RAW_REGS))
+ return pr_perror("Can't set PTRACE_O_ARM64_RAW_REGS");
+ /* skip SIGSTOP */
+ if (ptrace(PTRACE_CONT, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+
+ /* Resume the child to the next system call. */
+ if (ptrace(PTRACE_SYSEMU, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+ if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP)
+ return pr_err("Unexpected status: %d", status);
+
+ /* Check that x7 isnt't clobbered if PTRACE_O_ARM64_RAW_REGS is set. */
+ if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))
+ return pr_perror("Can't get child registers");
+ if (regs.regs[7] != TEST_VAL)
+ return pr_fail("unexpected x7: %lx", regs.regs[7]);
+ ksft_test_result_pass("x7: %llx\n", regs.regs[7]);
+
+ /* Check that the child will see a new value of x7. */
+ regs.regs[0] = 0;
+ regs.regs[7] = ~TEST_VAL;
+ if (ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov))
+ return pr_perror("Can't set child registers");
+
+ if (ptrace(PTRACE_CONT, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+
+ if (status != 0)
+ return pr_fail("Child exited with code %d.", status);
+
+ ksft_test_result_pass("The child exited with code 0.\n");
+ ksft_exit_pass();
+ return 0;
+}
+
diff --git a/tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c b/tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c
new file mode 100644
index 000000000000..d1534525ef26
--- /dev/null
+++ b/tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <linux/elf.h>
+#include <linux/unistd.h>
+
+#include "../../kselftest.h"
+
+#define TEST_VAL 0x686920776f726c64UL
+
+#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) \
+ ({ \
+ ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_fail(fmt, ...) \
+ ({ \
+ ksft_test_result_fail(fmt "\n", ##__VA_ARGS__); \
+ -1; \
+ })
+
+#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__)
+
+static long loop(void *val)
+{
+ register long x0 __asm__("x0");
+ register void *x1 __asm__("x1") = val;
+ register long x8 __asm__("x8") = 555;
+
+ __asm__ (
+ "again:\n"
+ "ldr x7, [x1, 0]\n"
+ "svc 0\n"
+ "str x7, [x1, 0]\n"
+ : "=r"(x0)
+ : "r"(x1), "r"(x8)
+ :
+ );
+ return 0;
+}
+
+static int child(void)
+{
+ long val = TEST_VAL;
+
+ loop(&val);
+ if (val != TEST_VAL) {
+ ksft_print_msg("Unexpected x7: %lx\n", val);
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifndef PTRACE_O_ARM64_RAW_REGS
+#define PTRACE_O_ARM64_RAW_REGS (1 << 28)
+#endif
+
+int main(int argc, void **argv)
+{
+ struct user_regs_struct regs = {};
+ struct iovec iov = {
+ .iov_base = &regs,
+ .iov_len = sizeof(struct user_regs_struct),
+ };
+ int status;
+ pid_t pid;
+
+ ksft_set_plan(3);
+
+ pid = fork();
+ if (pid == 0) {
+ kill(getpid(), SIGSTOP);
+ child();
+ _exit(0);
+ }
+ if (pid < 0)
+ return 1;
+
+ if (ptrace(PTRACE_ATTACH, pid, 0, 0))
+ return pr_perror("Can't attach to the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+ /* skip SIGSTOP */
+ if (ptrace(PTRACE_CONT, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+
+ /* Resume the child to the next system call. */
+ if (ptrace(PTRACE_SYSCALL, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+ if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP)
+ return pr_err("Unexpected status: %d", status);
+
+ /* Check that x7 is 0 on syscall-enter. */
+ if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))
+ return pr_perror("Can't get child registers");
+ if (regs.regs[7] != 0)
+ return pr_fail("Unexpected x7: %lx", regs.regs[7]);
+ ksft_test_result_pass("x7: %llx\n", regs.regs[7]);
+
+ if (ptrace(PTRACE_SYSCALL, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+ if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP)
+ return pr_err("Unexpected status: %d", status);
+
+ /* Check that x7 is 1 on syscall-exit. */
+ if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))
+ return pr_perror("Can't get child registers");
+ if (regs.regs[7] != 1)
+ return pr_fail("Unexpected x7: %lx", regs.regs[7]);
+ ksft_test_result_pass("x7: %llx\n", regs.regs[7]);
+
+ /* Check that the child will not a new value of x7. */
+ regs.regs[0] = 0;
+ regs.regs[7] = ~TEST_VAL;
+ if (ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov))
+ return pr_perror("Can't set child registers");
+
+ if (ptrace(PTRACE_CONT, pid, 0, 0))
+ return pr_perror("Can't resume the child %d", pid);
+ if (waitpid(pid, &status, 0) != pid)
+ return pr_perror("Can't wait for the child %d", pid);
+
+ if (status != 0)
+ return pr_fail("Child exited with code %d.", status);
+
+ ksft_test_result_pass("The child exited with code 0.\n");
+ ksft_exit_pass();
+ return 0;
+}
+
--
2.29.2

2021-02-02 00:15:47

by Keno Fischer

[permalink] [raw]
Subject: Re: [PATCH 0/3 v2] arm64/ptrace: allow to get all registers on syscall traps

Hi Andrei,

> This series introduces the PTRACE_O_ARM64_RAW_REGS option. If it is set,
> PTRACE_GETREGSET returns values of all registers, and PTRACE_SETREGSET
> allows to change any of them.

thanks for picking this up. I meant to work on this, but unfortunately ran out
of time to be able to push it through, so I'm glad you're working on
it, since it
does absolutely need to get fixed. Besides this issue, the other problem we
ran into when trying to port our ptracer to aarch64 is that orig_x0 is not
accessible through the ptrace interface on aarch64, which can cause tricky
behavior around restarts. We managed to work around that in the end,
but it's painful. If we're fixing the kernel here anyway, I'm wondering if
we might want to address that as well while we're at it.

Keno

2021-02-04 14:59:20

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH 0/3 v2] arm64/ptrace: allow to get all registers on syscall traps

Hi Andrei,

On Mon, Feb 01, 2021 at 11:40:09AM -0800, Andrei Vagin wrote:
> Right now, ip/r12 for AArch32 and x7 for AArch64 is used to indicate
> whether or not the stop has been signalled from syscall entry or syscall
> exit. This means that:
>
> - Any writes by the tracer to this register during the stop are
> ignored/discarded.
>
> - The actual value of the register is not available during the stop,
> so the tracer cannot save it and restore it later.
>
> For applications like the user-mode Linux or gVisor, it is critical to
> have access to the full set of registers in any moment. For example,
> they need to change values of all registers to emulate rt_sigreturn or
> execve and they need to have the full set of registers to build a signal
> frame.
>
> This series introduces the PTRACE_O_ARM64_RAW_REGS option. If it is set,
> PTRACE_GETREGSET returns values of all registers, and PTRACE_SETREGSET
> allows to change any of them.

I haven't had a chance to go through this properly yet, but I spotted a
couple of things worth mentioning off the bat:

- Please drop all of the compat changes here. The compat layer is intended
to be compatible with arch/arm/, so if you want to introduce new ptrace
behaviours for 32-bit applications, you need to make the changes there
and then update our compat layer accordingly.

- When Keno mentioned this before [1,2], he also talked about making
orig_x0 available. Since extending the ABI is a giant pain, I think
this should be seriously considered.

[1] https://lore.kernel.org/r/CABV8kRzkLiVuqxT3+8c1o8m_OuROtXgfowQcrMVnrxu=CiGB=w@mail.gmail.com
[2] https://lore.kernel.org/r/CABV8kRzg1BaKdAhqXU3hONhfPAHj6Nbw0wLBC1Lo7PN1UA0CoA@mail.gmail.com

Will

2021-02-04 15:31:10

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH 1/3] arm64/ptrace: don't clobber task registers on syscall entry/exit traps

On Mon, Feb 01, 2021 at 11:40:10AM -0800, Andrei Vagin wrote:
> ip/r12 for AArch32 and x7 for AArch64 is used to indicate whether or not
> the stop has been signalled from syscall entry or syscall exit. This
> means that:
>
> - Any writes by the tracer to this register during the stop are
> ignored/discarded.
>
> - The actual value of the register is not available during the stop,
> so the tracer cannot save it and restore it later.
>
> Right now, these registers are clobbered in tracehook_report_syscall.
> This change moves the logic to gpr_get and compat_gpr_get where
> registers are copied into a user-space buffer.
>
> This will allow to change these registers and to introduce a new
> ptrace option to get the full set of registers.
>
> Signed-off-by: Andrei Vagin <[email protected]>
> ---
> arch/arm64/include/asm/ptrace.h | 5 ++
> arch/arm64/kernel/ptrace.c | 104 ++++++++++++++++++++------------
> 2 files changed, 69 insertions(+), 40 deletions(-)
>
> diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
> index e58bca832dff..0a9552b4f61e 100644
> --- a/arch/arm64/include/asm/ptrace.h
> +++ b/arch/arm64/include/asm/ptrace.h
> @@ -170,6 +170,11 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
> return psr;
> }
>
> +enum ptrace_syscall_dir {
> + PTRACE_SYSCALL_ENTER = 0,
> + PTRACE_SYSCALL_EXIT,
> +};
> +
> /*
> * This struct defines the way the registers are stored on the stack during an
> * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
> diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
> index 8ac487c84e37..39da03104528 100644
> --- a/arch/arm64/kernel/ptrace.c
> +++ b/arch/arm64/kernel/ptrace.c
> @@ -40,6 +40,7 @@
> #include <asm/syscall.h>
> #include <asm/traps.h>
> #include <asm/system_misc.h>
> +#include <asm/ptrace.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/syscalls.h>
> @@ -561,7 +562,31 @@ static int gpr_get(struct task_struct *target,
> struct membuf to)
> {
> struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs;
> - return membuf_write(&to, uregs, sizeof(*uregs));
> + unsigned long saved_reg;
> + int ret;
> +
> + /*
> + * We have some ABI weirdness here in the way that we handle syscall
> + * exit stops because we indicate whether or not the stop has been
> + * signalled from syscall entry or syscall exit by clobbering the general
> + * purpose register x7.
> + */

When you move a comment, please don't truncate it!

> + saved_reg = uregs->regs[7];
> +
> + switch (target->ptrace_message) {
> + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> + uregs->regs[7] = PTRACE_SYSCALL_ENTER;
> + break;
> + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> + uregs->regs[7] = PTRACE_SYSCALL_EXIT;
> + break;
> + }

I'm wary of checking target->ptrace_message here, as I seem to recall the
regset code also being used for coredumps. What guarantees we don't break
things there?

> +
> + ret = membuf_write(&to, uregs, sizeof(*uregs));
> +
> + uregs->regs[7] = saved_reg;
> +
> + return ret;
> }
>
> static int gpr_set(struct task_struct *target, const struct user_regset *regset,
> @@ -575,6 +600,17 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
> if (ret)
> return ret;
>
> + /*
> + * Historically, x7 can't be changed if the stop has been signalled
> + * from syscall-enter of syscall-exit.
> + */
> + switch (target->ptrace_message) {
> + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> + newregs.regs[7] = task_pt_regs(target)->regs[7];
> + break;
> + }
> +
> if (!valid_user_regs(&newregs, target))
> return -EINVAL;
>
> @@ -1206,6 +1242,20 @@ static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int i
> struct pt_regs *regs = task_pt_regs(task);
>
> switch (idx) {
> + case 12:
> + /*
> + * We have some ABI weirdness here in the way that we handle
> + * syscall exit stops because we indicate whether or not the
> + * stop has been signalled from syscall entry or syscall exit
> + * by clobbering the general purpose register r12.
> + */
> + switch (task->ptrace_message) {
> + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> + return PTRACE_SYSCALL_ENTER;
> + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> + return PTRACE_SYSCALL_EXIT;
> + }
> + return regs->regs[idx];
> case 15:
> return regs->pc;
> case 16:
> @@ -1282,6 +1332,17 @@ static int compat_gpr_set(struct task_struct *target,
>
> }
>
> + /*
> + * Historically, x12 can't be changed if the stop has been signalled
> + * from syscall-enter of syscall-exit.
> + */
> + switch (target->ptrace_message) {
> + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> + newregs.regs[12] = task_pt_regs(target)->regs[12];
> + break;
> + }
> +
> if (valid_user_regs(&newregs.user_regs, target))
> *task_pt_regs(target) = newregs;
> else
> @@ -1740,53 +1801,16 @@ long arch_ptrace(struct task_struct *child, long request,
> return ptrace_request(child, request, addr, data);
> }
>
> -enum ptrace_syscall_dir {
> - PTRACE_SYSCALL_ENTER = 0,
> - PTRACE_SYSCALL_EXIT,
> -};
> -
> static void tracehook_report_syscall(struct pt_regs *regs,
> enum ptrace_syscall_dir dir)
> {
> - int regno;
> - unsigned long saved_reg;
> -
> - /*
> - * We have some ABI weirdness here in the way that we handle syscall
> - * exit stops because we indicate whether or not the stop has been
> - * signalled from syscall entry or syscall exit by clobbering a general
> - * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee
> - * and restoring its old value after the stop. This means that:
> - *
> - * - Any writes by the tracer to this register during the stop are
> - * ignored/discarded.
> - *
> - * - The actual value of the register is not available during the stop,
> - * so the tracer cannot save it and restore it later.
> - *
> - * - Syscall stops behave differently to seccomp and pseudo-step traps
> - * (the latter do not nobble any registers).
> - */
> - regno = (is_compat_task() ? 12 : 7);
> - saved_reg = regs->regs[regno];
> - regs->regs[regno] = dir;
> -
> if (dir == PTRACE_SYSCALL_ENTER) {
> if (tracehook_report_syscall_entry(regs))
> forget_syscall(regs);
> - regs->regs[regno] = saved_reg;
> - } else if (!test_thread_flag(TIF_SINGLESTEP)) {
> - tracehook_report_syscall_exit(regs, 0);
> - regs->regs[regno] = saved_reg;
> } else {
> - regs->regs[regno] = saved_reg;
> + int singlestep = test_thread_flag(TIF_SINGLESTEP);
>
> - /*
> - * Signal a pseudo-step exception since we are stepping but
> - * tracer modifications to the registers may have rewound the
> - * state machine.
> - */
> - tracehook_report_syscall_exit(regs, 1);
> + tracehook_report_syscall_exit(regs, singlestep);

Again, please preserve the comment in some form (maybe "... if we are
stepping since tracer ...").

That said, doesn't your change above break the pseudo-step trap? Currently,
we report the real x7 for those.

Will

2021-02-04 17:02:16

by Dave Martin

[permalink] [raw]
Subject: Re: [PATCH 1/3] arm64/ptrace: don't clobber task registers on syscall entry/exit traps

On Thu, Feb 04, 2021 at 03:23:34PM +0000, Will Deacon wrote:
> On Mon, Feb 01, 2021 at 11:40:10AM -0800, Andrei Vagin wrote:
> > ip/r12 for AArch32 and x7 for AArch64 is used to indicate whether or not
> > the stop has been signalled from syscall entry or syscall exit. This
> > means that:
> >
> > - Any writes by the tracer to this register during the stop are
> > ignored/discarded.
> >
> > - The actual value of the register is not available during the stop,
> > so the tracer cannot save it and restore it later.
> >
> > Right now, these registers are clobbered in tracehook_report_syscall.
> > This change moves the logic to gpr_get and compat_gpr_get where
> > registers are copied into a user-space buffer.
> >
> > This will allow to change these registers and to introduce a new
> > ptrace option to get the full set of registers.
> >
> > Signed-off-by: Andrei Vagin <[email protected]>
> > ---
> > arch/arm64/include/asm/ptrace.h | 5 ++
> > arch/arm64/kernel/ptrace.c | 104 ++++++++++++++++++++------------
> > 2 files changed, 69 insertions(+), 40 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
> > index e58bca832dff..0a9552b4f61e 100644
> > --- a/arch/arm64/include/asm/ptrace.h
> > +++ b/arch/arm64/include/asm/ptrace.h
> > @@ -170,6 +170,11 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
> > return psr;
> > }
> >
> > +enum ptrace_syscall_dir {
> > + PTRACE_SYSCALL_ENTER = 0,
> > + PTRACE_SYSCALL_EXIT,
> > +};
> > +
> > /*
> > * This struct defines the way the registers are stored on the stack during an
> > * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
> > diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
> > index 8ac487c84e37..39da03104528 100644
> > --- a/arch/arm64/kernel/ptrace.c
> > +++ b/arch/arm64/kernel/ptrace.c
> > @@ -40,6 +40,7 @@
> > #include <asm/syscall.h>
> > #include <asm/traps.h>
> > #include <asm/system_misc.h>
> > +#include <asm/ptrace.h>
> >
> > #define CREATE_TRACE_POINTS
> > #include <trace/events/syscalls.h>
> > @@ -561,7 +562,31 @@ static int gpr_get(struct task_struct *target,
> > struct membuf to)
> > {
> > struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs;
> > - return membuf_write(&to, uregs, sizeof(*uregs));
> > + unsigned long saved_reg;
> > + int ret;
> > +
> > + /*
> > + * We have some ABI weirdness here in the way that we handle syscall
> > + * exit stops because we indicate whether or not the stop has been
> > + * signalled from syscall entry or syscall exit by clobbering the general
> > + * purpose register x7.
> > + */
>
> When you move a comment, please don't truncate it!
>
> > + saved_reg = uregs->regs[7];
> > +
> > + switch (target->ptrace_message) {
> > + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> > + uregs->regs[7] = PTRACE_SYSCALL_ENTER;
> > + break;
> > + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> > + uregs->regs[7] = PTRACE_SYSCALL_EXIT;
> > + break;
> > + }
>
> I'm wary of checking target->ptrace_message here, as I seem to recall the
> regset code also being used for coredumps. What guarantees we don't break
> things there?

For a coredump, is there any way to know whether a given thread was
inside a traced syscall when the coredump was generated? If so, x7 in
the dump may already unreliable and we only need to make best efforts to
get it "right".

Since triggering of the coredump and death of other threads all require
dequeueing of some signal, I think all threads must always outside the
syscall-enter...syscall-exit path before any of the coredump runs anyway,
in which case the above should never matter... Though somone else ought
to eyeball the coredump code before we agree on that.

ptrace_message doesn't seem absolutely the wrong thing to check, but
we'd need to be sure that it can't be stale (say, left over from some
previous trap).


Out of interest, where did this arm64 ptrace feature come from? Was it
just pasted from 32-bit and thinly adapted? It looks like an
arch-specific attempt to do what PTRACE_O_TRACESYSGOOD does, in which
case it may have been obsolete even before it was upstreamed. I wonder
whether anyone is actually relying on it at all...

Doesn't mean we can definitely fix it safely, but it's annoying.

[...]

Cheers
---Dave

2021-02-05 00:51:54

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH 3/3] selftest/arm64/ptrace: add tests for PTRACE_O_ARM64_RAW_REGS

[+Kees]

On Mon, Feb 01, 2021 at 11:40:12AM -0800, Andrei Vagin wrote:
> Test output:
> TAP version 13
> 1..2
> # selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
> # 1..2
> # ok 1 x7: 686920776f726c64
> # ok 2 The child exited with code 0.
> # # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
> ok 1 selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
> # selftests: arm64/ptrace: ptrace_syscall_regs_test
> # 1..3
> # ok 1 x7: 0
> # ok 2 x7: 1
> # ok 3 The child exited with code 0.
> # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
> ok 2 selftests: arm64/ptrace: ptrace_syscall_regs_test
>
> Signed-off-by: Andrei Vagin <[email protected]>
> ---
> tools/testing/selftests/arm64/Makefile | 2 +-
> tools/testing/selftests/arm64/ptrace/Makefile | 6 +
> .../ptrace/ptrace_syscall_raw_regs_test.c | 142 +++++++++++++++++
> .../arm64/ptrace/ptrace_syscall_regs_test.c | 150 ++++++++++++++++++
> 4 files changed, 299 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/arm64/ptrace/Makefile
> create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c
> create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c

Thanks for the tests!

We already have a pretty extensive set of syscall entry tests in
tools/testing/selftests/seccomp, so perhaps this would be better off as part
of that? Maybe worth a look.

Will

2021-02-08 20:14:07

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 0/3 v2] arm64/ptrace: allow to get all registers on syscall traps

On Mon, Feb 01, 2021 at 07:11:12PM -0500, Keno Fischer wrote:
> Hi Andrei,
>
> > This series introduces the PTRACE_O_ARM64_RAW_REGS option. If it is set,
> > PTRACE_GETREGSET returns values of all registers, and PTRACE_SETREGSET
> > allows to change any of them.
>
> thanks for picking this up. I meant to work on this, but unfortunately ran out
> of time to be able to push it through, so I'm glad you're working on
> it, since it
> does absolutely need to get fixed. Besides this issue, the other problem we
> ran into when trying to port our ptracer to aarch64 is that orig_x0 is not
> accessible through the ptrace interface on aarch64, which can cause tricky
> behavior around restarts.

Could you describe the problem in more details? I wonder whether we have
the same thing in CRIU...

> We managed to work around that in the end,
> but it's painful. If we're fixing the kernel here anyway, I'm wondering if
> we might want to address that as well while we're at it.

Sure let think how to do this properly.

In this case, I think the ptrace option isn't a good choise. I don't
think that it is a good idea to change the layout of regset depending on
options...

Thanks,
Andrei

>
> Keno

2021-02-08 20:41:29

by Keno Fischer

[permalink] [raw]
Subject: Re: [PATCH 0/3 v2] arm64/ptrace: allow to get all registers on syscall traps

>
> Could you describe the problem in more details? I wonder whether we have
> the same thing in CRIU...
>

Sure, basically the issue is that orig_x0 gets recorded at the start of the
syscall entry, but is not otherwise part of the ptrace state. It used
to be primarily used for resetting the argument back to its original
value during syscall restarts, but I see it's been expanded slightly
with the user dispatch mechanism (though as far as I can tell
not in a way that interacts with ptrace). Basically the problem
is that if you change the value of x0 during either the ptrace
entry stop or a seccomp stop and then incur a syscall restart,
the syscall will restart with the original x0 rather than with
the modified x0, which may be unexpected. Of course,
relatedly, if you're doing CRIU-like things you can end up
in situations where the future behavior will depend on the
orig_x0 value, which isn't restore-able at the moment. It's
possible to work around all of this by keeping a local copy
of orig_x0 and being very careful with the ptrace traps around
restarts, but getting the logic right is extremely tricky. My
suggestion for what I thought would be reasonable
behavior was:

1. Expose orig_x0 to ptrace
2. Set orig_x0 to x0 and set x0 to -ENOSYS at the start of the syscall
dispatcher
3. Use orig_x0 for syscall arguments/seccomp/restarts

That's basically how rax works on x86_64 and it doesn't
seem to cause major problems (though of course I may
be biased by having x86_64 already work when I started the
aarch64 port). Just the first item would be sufficient of course
for getting rid of most of the bookkeeping. I should also say
that, for us, the ptrace getregs call can be the throughput
limiting operation, so it would be nice if getting the entire
basic register set would only require one syscall. I won't
insist on it, since we do have a solution in place that kinda
works (and only requires the one syscall),
but I thought I'd mention it.

While we're on this topic, and in case it's helpful to anybody,
I should also point out that the order of the ptrace-signal-stop,
vs setup for the syscall restart differs between x86 and
aarch64 (aarch64 sets up the restart first then delivers the
ptrace trap/signal - x86 the other way around). I actually
think the aarch64 behavior is saner here, but I figured I'd
leave this breadcrumb for anybody who's writing a ptracer
and stumbles across this.

Keno

2021-02-10 20:59:21

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH 3/3] selftest/arm64/ptrace: add tests for PTRACE_O_ARM64_RAW_REGS

On Thu, Feb 04, 2021 at 03:40:39PM +0000, Will Deacon wrote:
> [+Kees]
>
> On Mon, Feb 01, 2021 at 11:40:12AM -0800, Andrei Vagin wrote:
> > Test output:
> > TAP version 13
> > 1..2
> > # selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
> > # 1..2
> > # ok 1 x7: 686920776f726c64
> > # ok 2 The child exited with code 0.
> > # # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
> > ok 1 selftests: arm64/ptrace: ptrace_syscall_raw_regs_test
> > # selftests: arm64/ptrace: ptrace_syscall_regs_test
> > # 1..3
> > # ok 1 x7: 0
> > # ok 2 x7: 1
> > # ok 3 The child exited with code 0.
> > # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
> > ok 2 selftests: arm64/ptrace: ptrace_syscall_regs_test
> >
> > Signed-off-by: Andrei Vagin <[email protected]>
> > ---
> > tools/testing/selftests/arm64/Makefile | 2 +-
> > tools/testing/selftests/arm64/ptrace/Makefile | 6 +
> > .../ptrace/ptrace_syscall_raw_regs_test.c | 142 +++++++++++++++++
> > .../arm64/ptrace/ptrace_syscall_regs_test.c | 150 ++++++++++++++++++
> > 4 files changed, 299 insertions(+), 1 deletion(-)
> > create mode 100644 tools/testing/selftests/arm64/ptrace/Makefile
> > create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_raw_regs_test.c
> > create mode 100644 tools/testing/selftests/arm64/ptrace/ptrace_syscall_regs_test.c
>
> Thanks for the tests!
>
> We already have a pretty extensive set of syscall entry tests in
> tools/testing/selftests/seccomp, so perhaps this would be better off as part
> of that? Maybe worth a look.

I'm happy with this living in either place -- I can make an argument
either way. If it's arm64-specific, maybe better to live outside of
seccomp?

--
Kees Cook

2021-02-25 16:07:30

by Andrei Vagin

[permalink] [raw]
Subject: Re: [PATCH 1/3] arm64/ptrace: don't clobber task registers on syscall entry/exit traps

On Thu, Feb 04, 2021 at 03:23:34PM +0000, Will Deacon wrote:
> On Mon, Feb 01, 2021 at 11:40:10AM -0800, Andrei Vagin wrote:
> > ip/r12 for AArch32 and x7 for AArch64 is used to indicate whether or not
> > the stop has been signalled from syscall entry or syscall exit. This
> > means that:
> >
> > - Any writes by the tracer to this register during the stop are
> > ignored/discarded.
> >
> > - The actual value of the register is not available during the stop,
> > so the tracer cannot save it and restore it later.
> >
> > Right now, these registers are clobbered in tracehook_report_syscall.
> > This change moves the logic to gpr_get and compat_gpr_get where
> > registers are copied into a user-space buffer.
> >
> > This will allow to change these registers and to introduce a new
> > ptrace option to get the full set of registers.
> >
> > Signed-off-by: Andrei Vagin <[email protected]>
> > ---
> > arch/arm64/include/asm/ptrace.h | 5 ++
> > arch/arm64/kernel/ptrace.c | 104 ++++++++++++++++++++------------
> > 2 files changed, 69 insertions(+), 40 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
> > index e58bca832dff..0a9552b4f61e 100644
> > --- a/arch/arm64/include/asm/ptrace.h
> > +++ b/arch/arm64/include/asm/ptrace.h
> > @@ -170,6 +170,11 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
> > return psr;
> > }
> >
> > +enum ptrace_syscall_dir {
> > + PTRACE_SYSCALL_ENTER = 0,
> > + PTRACE_SYSCALL_EXIT,
> > +};
> > +
> > /*
> > * This struct defines the way the registers are stored on the stack during an
> > * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
> > diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
> > index 8ac487c84e37..39da03104528 100644
> > --- a/arch/arm64/kernel/ptrace.c
> > +++ b/arch/arm64/kernel/ptrace.c
> > @@ -40,6 +40,7 @@
> > #include <asm/syscall.h>
> > #include <asm/traps.h>
> > #include <asm/system_misc.h>
> > +#include <asm/ptrace.h>
> >
> > #define CREATE_TRACE_POINTS
> > #include <trace/events/syscalls.h>
> > @@ -561,7 +562,31 @@ static int gpr_get(struct task_struct *target,
> > struct membuf to)
> > {
> > struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs;
> > - return membuf_write(&to, uregs, sizeof(*uregs));
> > + unsigned long saved_reg;
> > + int ret;
> > +
> > + /*
> > + * We have some ABI weirdness here in the way that we handle syscall
> > + * exit stops because we indicate whether or not the stop has been
> > + * signalled from syscall entry or syscall exit by clobbering the general
> > + * purpose register x7.
> > + */
>
> When you move a comment, please don't truncate it!

This is my fault. In the previous version, the other part of this
comment was irelevant, because I always allowed to change clobbered
registers, but then I realized that we can't do that.

>
> > + saved_reg = uregs->regs[7];
> > +
> > + switch (target->ptrace_message) {
> > + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> > + uregs->regs[7] = PTRACE_SYSCALL_ENTER;
> > + break;
> > + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> > + uregs->regs[7] = PTRACE_SYSCALL_EXIT;
> > + break;
> > + }
>
> I'm wary of checking target->ptrace_message here, as I seem to recall the
> regset code also being used for coredumps. What guarantees we don't break
> things there?

Registers were clobbered in tracehook_report_syscall,
task->ptrace_message is set in ptrace_report_syscall.

do_coredump() is called from get_signal and secure_computing, so we
always see actuall registers in core dumps with and without these
changes.

>
> > +
> > + ret = membuf_write(&to, uregs, sizeof(*uregs));
> > +
> > + uregs->regs[7] = saved_reg;
> > +
> > + return ret;
> > }
> >
> > static int gpr_set(struct task_struct *target, const struct user_regset *regset,
> > @@ -575,6 +600,17 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
> > if (ret)
> > return ret;
> >
> > + /*
> > + * Historically, x7 can't be changed if the stop has been signalled
> > + * from syscall-enter of syscall-exit.
> > + */
> > + switch (target->ptrace_message) {
> > + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> > + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> > + newregs.regs[7] = task_pt_regs(target)->regs[7];
> > + break;
> > + }
> > +
> > if (!valid_user_regs(&newregs, target))
> > return -EINVAL;
> >
> > @@ -1206,6 +1242,20 @@ static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int i
> > struct pt_regs *regs = task_pt_regs(task);
> >
> > switch (idx) {
> > + case 12:
> > + /*
> > + * We have some ABI weirdness here in the way that we handle
> > + * syscall exit stops because we indicate whether or not the
> > + * stop has been signalled from syscall entry or syscall exit
> > + * by clobbering the general purpose register r12.
> > + */
> > + switch (task->ptrace_message) {
> > + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> > + return PTRACE_SYSCALL_ENTER;
> > + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> > + return PTRACE_SYSCALL_EXIT;
> > + }
> > + return regs->regs[idx];
> > case 15:
> > return regs->pc;
> > case 16:
> > @@ -1282,6 +1332,17 @@ static int compat_gpr_set(struct task_struct *target,
> >
> > }
> >
> > + /*
> > + * Historically, x12 can't be changed if the stop has been signalled
> > + * from syscall-enter of syscall-exit.
> > + */
> > + switch (target->ptrace_message) {
> > + case PTRACE_EVENTMSG_SYSCALL_ENTRY:
> > + case PTRACE_EVENTMSG_SYSCALL_EXIT:
> > + newregs.regs[12] = task_pt_regs(target)->regs[12];
> > + break;
> > + }
> > +
> > if (valid_user_regs(&newregs.user_regs, target))
> > *task_pt_regs(target) = newregs;
> > else
> > @@ -1740,53 +1801,16 @@ long arch_ptrace(struct task_struct *child, long request,
> > return ptrace_request(child, request, addr, data);
> > }
> >
> > -enum ptrace_syscall_dir {
> > - PTRACE_SYSCALL_ENTER = 0,
> > - PTRACE_SYSCALL_EXIT,
> > -};
> > -
> > static void tracehook_report_syscall(struct pt_regs *regs,
> > enum ptrace_syscall_dir dir)
> > {
> > - int regno;
> > - unsigned long saved_reg;
> > -
> > - /*
> > - * We have some ABI weirdness here in the way that we handle syscall
> > - * exit stops because we indicate whether or not the stop has been
> > - * signalled from syscall entry or syscall exit by clobbering a general
> > - * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee
> > - * and restoring its old value after the stop. This means that:
> > - *
> > - * - Any writes by the tracer to this register during the stop are
> > - * ignored/discarded.
> > - *
> > - * - The actual value of the register is not available during the stop,
> > - * so the tracer cannot save it and restore it later.
> > - *
> > - * - Syscall stops behave differently to seccomp and pseudo-step traps
> > - * (the latter do not nobble any registers).
> > - */
> > - regno = (is_compat_task() ? 12 : 7);
> > - saved_reg = regs->regs[regno];
> > - regs->regs[regno] = dir;
> > -
> > if (dir == PTRACE_SYSCALL_ENTER) {
> > if (tracehook_report_syscall_entry(regs))
> > forget_syscall(regs);
> > - regs->regs[regno] = saved_reg;
> > - } else if (!test_thread_flag(TIF_SINGLESTEP)) {
> > - tracehook_report_syscall_exit(regs, 0);
> > - regs->regs[regno] = saved_reg;
> > } else {
> > - regs->regs[regno] = saved_reg;
> > + int singlestep = test_thread_flag(TIF_SINGLESTEP);
> >
> > - /*
> > - * Signal a pseudo-step exception since we are stepping but
> > - * tracer modifications to the registers may have rewound the
> > - * state machine.
> > - */
> > - tracehook_report_syscall_exit(regs, 1);
> > + tracehook_report_syscall_exit(regs, singlestep);
>
> Again, please preserve the comment in some form (maybe "... if we are
> stepping since tracer ...").

ok

>
> That said, doesn't your change above break the pseudo-step trap? Currently,
> we report the real x7 for those.

No, it doesn't.

In case of singlestep, tracehook_report_syscall_exit calls
user_single_step_report instead of ptrace_report_syscall, so
current->ptace_message will not be set PTRACE_EVENTMSG_SYSCALL_EXIT.


>
> Will