Allow rseq critical section abort handlers to optionally figure out at
which instruction pointer the rseq critical section was aborted.
This allows implementing rseq critical sections with loops containing
the commit instruction, for which having the commit as last instruction
of the sequence is not possible. This is useful to implement adaptative
mutexes aware of preemption in user-space. (see [1])
This also allows implementing rseq critical sections with multiple
commit steps, and use the abort-at-ip information to figure out what
needs to be undone in the abort handler.
Introduce the RSEQ_FLAG_QUERY_ABORT_AT_IP rseq system call flag. This
lets userspace query whether the kernel and architecture supports the
abort-at-ip rseq extension.
Only provide this information for rseq critical sections for which the
rseq_cs descriptor has the RSEQ_CS_FLAG_ABORT_AT_IP flag set. Abort
handlers for critical sections with this flag set need to readjust the
stack pointer. The abort-at-ip pointer is populated by the kernel on
the top of stack on abort. For x86-32, the abort handler of an
abort-at-ip critical section needs to add 4 bytes to the stack pointer.
For x86-64, the abort hanler needs to add 136 bytes to the stack
pointer: 8 bytes to skip the abort-at-ip value, and 128 bytes to skip
the x86-64 redzone for leaf functions.
[1] https://github.com/compudj/rseq-test/blob/adapt-lock-abort-at-ip/test-rseq-adaptative-lock.c#L80
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
Changes since v1:
- Use top of stack to place abort-at-ip value rather than ecx/rcx
register,
- Skip redzone on x86-64.
---
arch/x86/include/asm/ptrace.h | 5 +++++
arch/x86/kernel/ptrace.c | 12 ++++++++++++
include/uapi/linux/rseq.h | 4 ++++
kernel/rseq.c | 28 ++++++++++++++++++++++++++++
4 files changed, 49 insertions(+)
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 703663175a5a..c96eb2448110 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -387,5 +387,10 @@ extern int do_set_thread_area(struct task_struct *p, int idx,
# define do_set_thread_area_64(p, s, t) (0)
#endif
+#ifdef CONFIG_RSEQ
+# define RSEQ_ARCH_HAS_ABORT_AT_IP
+int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip);
+#endif
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 6d2244c94799..561ed98d12ba 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1368,3 +1368,15 @@ void user_single_step_report(struct pt_regs *regs)
{
send_sigtrap(regs, 0, TRAP_BRKPT);
}
+
+int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
+{
+ if (user_64bit_mode(regs)) {
+ /* Need to skip redzone for leaf functions. */
+ regs->sp -= sizeof(u64) + 128;
+ return put_user(ip, (u64 __user *)regs->sp);
+ } else {
+ regs->sp -= sizeof(u32);
+ return put_user(ip, (u32 __user *)regs->sp);
+ }
+}
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 9a402fdb60e9..3130232e6d0c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -20,12 +20,14 @@ enum rseq_cpu_id_state {
enum rseq_flags {
RSEQ_FLAG_UNREGISTER = (1 << 0),
+ RSEQ_FLAG_QUERY_ABORT_AT_IP = (1 << 1),
};
enum rseq_cs_flags_bit {
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+ RSEQ_CS_FLAG_ABORT_AT_IP_BIT = 3,
};
enum rseq_cs_flags {
@@ -35,6 +37,8 @@ enum rseq_cs_flags {
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+ RSEQ_CS_FLAG_ABORT_AT_IP =
+ (1U << RSEQ_CS_FLAG_ABORT_AT_IP_BIT),
};
/*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 6d45ac3dae7f..fb52f2d11b56 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -21,6 +21,13 @@
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+#ifdef RSEQ_ARCH_HAS_ABORT_AT_IP
+static bool rseq_has_abort_at_ip(void) { return true; }
+#else
+static bool rseq_has_abort_at_ip(void) { return false; }
+static int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip) { return 0; }
+#endif
+
/*
*
* Restartable sequences are a lightweight interface that allows
@@ -79,6 +86,16 @@
*
* [abort_ip]
* F1. <failure>
+ *
+ * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
+ * have the following behavior on abort: when the stack grows down: the
+ * stack pointer is decremented to skip the redzone, and decremented of
+ * the pointer size. The aborted address (abort-at-ip) is stored at
+ * this stack pointer location. The user-space abort handler needs to
+ * pop the abort-at-ip address from the stack, and add the redzone size
+ * to the stack pointer.
+ *
+ * TODO: describe stack grows up.
*/
static int rseq_update_cpu_id(struct task_struct *t)
@@ -261,6 +278,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
rseq_cs.abort_ip);
instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
+ if (rseq_cs.flags & RSEQ_CS_FLAG_ABORT_AT_IP) {
+ ret = rseq_abort_at_ip(regs, ip);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -330,6 +352,12 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
{
int ret;
+ if (flags & RSEQ_FLAG_QUERY_ABORT_AT_IP) {
+ if (flags & ~RSEQ_FLAG_QUERY_ABORT_AT_IP)
+ return -EINVAL;
+ return rseq_has_abort_at_ip() ? 0 : -EINVAL;
+ }
+
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
--
2.17.1
Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: Shuah Khan <[email protected]>
Cc: [email protected]
---
tools/testing/selftests/rseq/.gitignore | 1 +
tools/testing/selftests/rseq/Makefile | 3 +-
tools/testing/selftests/rseq/rseq.c | 21 ++
tools/testing/selftests/rseq/rseq.h | 12 +
.../selftests/rseq/rseq_ext_abort_at_ip.c | 285 ++++++++++++++++++
5 files changed, 321 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/rseq/rseq_ext_abort_at_ip.c
diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 5910888ebfe1..472152afab77 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -5,3 +5,4 @@ basic_rseq_op_test
param_test
param_test_benchmark
param_test_compare_twice
+rseq_ext_abort_at_ip
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 2af9d39a9716..dae0dba7552b 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -13,7 +13,8 @@ LDLIBS += -lpthread
OVERRIDE_TARGETS = 1
TEST_GEN_PROGS = basic_test basic_percpu_ops_test param_test \
- param_test_benchmark param_test_compare_twice
+ param_test_benchmark param_test_compare_twice \
+ rseq_ext_abort_at_ip
TEST_GEN_PROGS_EXTENDED = librseq.so
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 7159eb777fd3..3e012f581ddb 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -73,6 +73,27 @@ static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len,
return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
}
+/*
+ * Return 0 if extension is available, negative error otherwise.
+ */
+int rseq_query_extension(enum rseq_extension ext)
+{
+ int ret;
+ unsigned int flags;
+
+ switch (ext) {
+ case RSEQ_EXT_ABORT_AT_IP:
+ flags = RSEQ_FLAG_QUERY_ABORT_AT_IP;
+ break;
+ default:
+ return -EINVAL;
+ }
+ ret = sys_rseq(NULL, 0, flags, 0);
+ if (!ret)
+ return 0;
+ return -errno;
+}
+
int rseq_register_current_thread(void)
{
int rc, ret = 0;
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index 3f63eb362b92..34c80402c078 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -67,6 +67,12 @@ extern int __rseq_handled;
abort(); \
} while (0)
+enum rseq_extension {
+ RSEQ_EXT_ABORT_AT_IP = 0,
+};
+
+#define ASM_RSEQ_CS_FLAG_ABORT_AT_IP (1U << 3)
+
#if defined(__x86_64__) || defined(__i386__)
#include <rseq-x86.h>
#elif defined(__ARMEL__)
@@ -162,4 +168,10 @@ static inline void rseq_prepare_unload(void)
rseq_clear_rseq_cs();
}
+/*
+ * Query whether rseq extension is available. Return 0 if available, negative
+ * error value otherwise.
+ */
+int rseq_query_extension(enum rseq_extension ext);
+
#endif /* RSEQ_H_ */
diff --git a/tools/testing/selftests/rseq/rseq_ext_abort_at_ip.c b/tools/testing/selftests/rseq/rseq_ext_abort_at_ip.c
new file mode 100644
index 000000000000..4cddf7433cc3
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq_ext_abort_at_ip.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * RSEQ abort-at-ip extension test.
+ *
+ * The test test_abort_at_ip_loop() implements an infinite loop which only exits when
+ * aborted. This rseq critical section is defined with the abort-at-ip
+ * extension, which requires the userspace abort handler to reajust the stack pointer.
+ * This test validates that the abort-at-ip value is within the address range of the
+ * rseq critical section.
+ *
+ * The test test_abort_at_ip_undo() validates that when aborted between two
+ * consecutive increments of two distinct variables, those variables are indeed one
+ * value apart. This validates that abort undo operations based on the abort-at-ip
+ * work as expected.
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+const int nr_iter = 10;
+
+#ifdef __x86_64__
+static void test_abort_at_ip_loop(void)
+{
+ void *abort_ip_addr, *abort_ip_start, *abort_ip_end;
+
+ printf("Testing abort_at_ip infinite loop\n");
+
+ __asm__ __volatile__ goto (
+ __RSEQ_ASM_DEFINE_TABLE(3, 0x0, ASM_RSEQ_CS_FLAG_ABORT_AT_IP, 1f, (2f - 1f), 4f) /* start, post_commit_offset, abort */
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+ "rep; nop\n\t" /* cpu_relax for busy loop. */
+ "jmp 1b\n\t" /* infinite loop. */
+ "2:\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ /* abort-at-ip must be pop from the stack. */
+ "popq %%rcx\n\t"
+ "addq $128, %%rsp\n\t" /* x86-64 redzone */
+ "movq %%rcx, %[abort_ip_addr]\n\t"
+ "lea 1b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[abort_ip_start]\n\t"
+ "lea 2b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[abort_ip_end]\n\t",
+ abort)
+ : /* gcc asm goto does not allow outputs */
+ : [rseq_abi] "r" (&__rseq_abi),
+ [abort_ip_addr] "m" (abort_ip_addr),
+ [abort_ip_start] "m" (abort_ip_start),
+ [abort_ip_end] "m" (abort_ip_end)
+ : "memory", "cc", "rcx"
+ : abort
+ );
+ fprintf(stderr, "Error: infinite loop should never exit gracefully.\n");
+ abort();
+
+abort:
+ printf("Critical section aborted (as expected) at ip %p, within range [%p,%p[\n",
+ abort_ip_addr, abort_ip_start, abort_ip_end);
+ if (abort_ip_addr < abort_ip_start || abort_ip_addr >= abort_ip_end) {
+ fprintf(stderr, "Error: abort-ip is outside of expected range\n");
+ abort();
+ }
+}
+
+static void test_abort_at_ip_undo(void)
+{
+ void *abort_ip_addr, *abort_ip_start, *abort_ip_end, *ip_after_first_inc, *ip_after_second_inc;
+ unsigned long v[2] = { 0, 0 };
+
+ printf("Testing abort_at_ip undo\n");
+
+ __asm__ __volatile__ goto (
+ __RSEQ_ASM_DEFINE_TABLE(3, 0x0, ASM_RSEQ_CS_FLAG_ABORT_AT_IP, 1f, (2f - 1f), 4f) /* start, post_commit_offset, abort */
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+ "incq %[v0]\n\t"
+ "10:\n\t"
+ "rep; nop\n\t"
+ "incq %[v1]\n\t"
+ "20:\n\t"
+ "rep; nop\n\t"
+ "jmp 1b\n\t" /* infinite loop. */
+ "2:\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ /* abort-at-ip must be pop from the stack. */
+ "popq %%rcx\n\t"
+ "addq $128, %%rsp\n\t" /* x86-64 redzone */
+ "movq %%rcx, %[abort_ip_addr]\n\t"
+ "lea 1b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[abort_ip_start]\n\t"
+ "lea 2b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[abort_ip_end]\n\t"
+ "lea 10b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[ip_after_first_inc]\n\t"
+ "lea 20b(%%rip), %%rcx\n\t"
+ "movq %%rcx, %[ip_after_second_inc]\n\t",
+ abort)
+ : /* gcc asm goto does not allow outputs */
+ : [rseq_abi] "r" (&__rseq_abi),
+ [abort_ip_addr] "m" (abort_ip_addr),
+ [abort_ip_start] "m" (abort_ip_start),
+ [abort_ip_end] "m" (abort_ip_end),
+ [ip_after_first_inc] "m" (ip_after_first_inc),
+ [ip_after_second_inc] "m" (ip_after_second_inc),
+ [v0] "m" (v[0]),
+ [v1] "m" (v[1])
+ : "memory", "cc", "rcx"
+ : abort
+ );
+ fprintf(stderr, "Error: infinite loop should never exit gracefully.\n");
+ abort();
+
+abort:
+ printf("Critical section aborted (as expected) at ip %p, within range [%p,%p[\n",
+ abort_ip_addr, abort_ip_start, abort_ip_end);
+ if (abort_ip_addr < abort_ip_start || abort_ip_addr >= abort_ip_end) {
+ fprintf(stderr, "Error: abort-ip is outside of expected range\n");
+ abort();
+ }
+ printf("ip after first inc: %p, ip after second inc: %p\n",
+ ip_after_first_inc, ip_after_second_inc);
+ printf("Counter values: v0: %lu v1: %lu\n", v[0], v[1]);
+ if (abort_ip_addr < ip_after_first_inc || abort_ip_addr >= ip_after_second_inc) {
+ if (v[0] != v[1])
+ abort();
+ } else {
+ if (v[0] != v[1] + 1)
+ abort();
+ }
+}
+#elif defined (__i386__)
+static void test_abort_at_ip_loop(void)
+{
+ void *abort_ip_addr, *abort_ip_start, *abort_ip_end;
+
+ printf("Testing abort_at_ip infinite loop\n");
+
+ __asm__ __volatile__ goto (
+ __RSEQ_ASM_DEFINE_TABLE(3, 0x0, ASM_RSEQ_CS_FLAG_ABORT_AT_IP, 1f, (2f - 1f), 4f) /* start, post_commit_offset, abort */
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+ "rep; nop\n\t" /* cpu_relax for busy loop. */
+ "jmp 1b\n\t" /* infinite loop. */
+ "2:\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ /* abort-at-ip must be pop from the stack. */
+ "popl %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_addr]\n\t"
+ "movl $1b, %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_start]\n\t"
+ "movl $2b, %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_end]\n\t",
+ abort)
+ : /* gcc asm goto does not allow outputs */
+ : [rseq_abi] "r" (&__rseq_abi),
+ [abort_ip_addr] "m" (abort_ip_addr),
+ [abort_ip_start] "m" (abort_ip_start),
+ [abort_ip_end] "m" (abort_ip_end)
+ : "memory", "cc", "ecx"
+ : abort
+ );
+ fprintf(stderr, "Error: infinite loop should never exit gracefully.\n");
+ abort();
+
+abort:
+ printf("Critical section aborted (as expected) at ip %p, within range [%p,%p[\n",
+ abort_ip_addr, abort_ip_start, abort_ip_end);
+ if (abort_ip_addr < abort_ip_start || abort_ip_addr >= abort_ip_end) {
+ fprintf(stderr, "Error: abort-ip is outside of expected range\n");
+ abort();
+ }
+}
+
+static void test_abort_at_ip_undo(void)
+{
+ void *abort_ip_addr, *abort_ip_start, *abort_ip_end, *ip_after_first_inc, *ip_after_second_inc;
+ unsigned long v[2] = { 0, 0 };
+
+ printf("Testing abort_at_ip undo\n");
+
+ __asm__ __volatile__ goto (
+ __RSEQ_ASM_DEFINE_TABLE(3, 0x0, ASM_RSEQ_CS_FLAG_ABORT_AT_IP, 1f, (2f - 1f), 4f) /* start, post_commit_offset, abort */
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+ "incl %[v0]\n\t"
+ "10:\n\t"
+ "rep; nop\n\t"
+ "incl %[v1]\n\t"
+ "20:\n\t"
+ "rep; nop\n\t"
+ "jmp 1b\n\t" /* infinite loop. */
+ "2:\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ /* abort-at-ip must be pop from the stack. */
+ "popl %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_addr]\n\t"
+ "movl $1b, %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_start]\n\t"
+ "movl $2b, %%ecx\n\t"
+ "movl %%ecx, %[abort_ip_end]\n\t"
+ "movl $10b, %%ecx\n\t"
+ "movl %%ecx, %[ip_after_first_inc]\n\t"
+ "movl $20b, %%ecx\n\t"
+ "movl %%ecx, %[ip_after_second_inc]\n\t",
+ abort)
+ : /* gcc asm goto does not allow outputs */
+ : [rseq_abi] "r" (&__rseq_abi),
+ [abort_ip_addr] "m" (abort_ip_addr),
+ [abort_ip_start] "m" (abort_ip_start),
+ [abort_ip_end] "m" (abort_ip_end),
+ [ip_after_first_inc] "m" (ip_after_first_inc),
+ [ip_after_second_inc] "m" (ip_after_second_inc),
+ [v0] "m" (v[0]),
+ [v1] "m" (v[1])
+ : "memory", "cc", "rcx"
+ : abort
+ );
+ fprintf(stderr, "Error: infinite loop should never exit gracefully.\n");
+ abort();
+
+abort:
+ printf("Critical section aborted (as expected) at ip %p, within range [%p,%p[\n",
+ abort_ip_addr, abort_ip_start, abort_ip_end);
+ if (abort_ip_addr < abort_ip_start || abort_ip_addr >= abort_ip_end) {
+ fprintf(stderr, "Error: abort-ip is outside of expected range\n");
+ abort();
+ }
+ printf("ip after first inc: %p, ip after second inc: %p\n",
+ ip_after_first_inc, ip_after_second_inc);
+ printf("Counter values: v0: %lu v1: %lu\n", v[0], v[1]);
+ if (abort_ip_addr < ip_after_first_inc || abort_ip_addr >= ip_after_second_inc) {
+ if (v[0] != v[1])
+ abort();
+ } else {
+ if (v[0] != v[1] + 1)
+ abort();
+ }
+}
+#else
+static void test_abort_at_ip_loop(void)
+{
+ abort();
+}
+static void test_abort_at_ip_undo(void)
+{
+ abort();
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ int i;
+
+ if (rseq_register_current_thread()) {
+ fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ goto init_thread_error;
+ }
+ printf("testing abort-at-ip extension\n");
+ if (rseq_query_extension(RSEQ_EXT_ABORT_AT_IP) != 0) {
+ fprintf(stderr, "RSEQ abort-at-ip extension is not supported, skipping test.\n");
+ return 0;
+ }
+ for (i = 0; i < nr_iter; i++)
+ test_abort_at_ip_loop();
+ for (i = 0; i < nr_iter; i++)
+ test_abort_at_ip_undo();
+ if (rseq_unregister_current_thread()) {
+ fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ goto init_thread_error;
+ }
+ return 0;
+
+init_thread_error:
+ return -1;
+}
--
2.17.1
On Mon, Jan 10, 2022 at 12:16:10PM -0500, Mathieu Desnoyers wrote:
> Allow rseq critical section abort handlers to optionally figure out at
> which instruction pointer the rseq critical section was aborted.
>
> This allows implementing rseq critical sections with loops containing
> the commit instruction, for which having the commit as last instruction
> of the sequence is not possible. This is useful to implement adaptative
> mutexes aware of preemption in user-space. (see [1])
>
> This also allows implementing rseq critical sections with multiple
> commit steps, and use the abort-at-ip information to figure out what
> needs to be undone in the abort handler.
>
> Introduce the RSEQ_FLAG_QUERY_ABORT_AT_IP rseq system call flag. This
> lets userspace query whether the kernel and architecture supports the
> abort-at-ip rseq extension.
>
> Only provide this information for rseq critical sections for which the
> rseq_cs descriptor has the RSEQ_CS_FLAG_ABORT_AT_IP flag set. Abort
> handlers for critical sections with this flag set need to readjust the
> stack pointer. The abort-at-ip pointer is populated by the kernel on
> the top of stack on abort. For x86-32, the abort handler of an
> abort-at-ip critical section needs to add 4 bytes to the stack pointer.
> For x86-64, the abort hanler needs to add 136 bytes to the stack
> pointer: 8 bytes to skip the abort-at-ip value, and 128 bytes to skip
> the x86-64 redzone for leaf functions.
>
> [1] https://github.com/compudj/rseq-test/blob/adapt-lock-abort-at-ip/test-rseq-adaptative-lock.c#L80
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
> ---
> Changes since v1:
> - Use top of stack to place abort-at-ip value rather than ecx/rcx
> register,
> - Skip redzone on x86-64.
> ---
> arch/x86/include/asm/ptrace.h | 5 +++++
> arch/x86/kernel/ptrace.c | 12 ++++++++++++
> include/uapi/linux/rseq.h | 4 ++++
> kernel/rseq.c | 28 ++++++++++++++++++++++++++++
> 4 files changed, 49 insertions(+)
>
> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> index 703663175a5a..c96eb2448110 100644
> --- a/arch/x86/include/asm/ptrace.h
> +++ b/arch/x86/include/asm/ptrace.h
> @@ -387,5 +387,10 @@ extern int do_set_thread_area(struct task_struct *p, int idx,
> # define do_set_thread_area_64(p, s, t) (0)
> #endif
>
> +#ifdef CONFIG_RSEQ
> +# define RSEQ_ARCH_HAS_ABORT_AT_IP
> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip);
> +#endif
> +
> #endif /* !__ASSEMBLY__ */
> #endif /* _ASM_X86_PTRACE_H */
> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
> index 6d2244c94799..561ed98d12ba 100644
> --- a/arch/x86/kernel/ptrace.c
> +++ b/arch/x86/kernel/ptrace.c
> @@ -1368,3 +1368,15 @@ void user_single_step_report(struct pt_regs *regs)
> {
> send_sigtrap(regs, 0, TRAP_BRKPT);
> }
> +
> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
> +{
> + if (user_64bit_mode(regs)) {
> + /* Need to skip redzone for leaf functions. */
> + regs->sp -= sizeof(u64) + 128;
> + return put_user(ip, (u64 __user *)regs->sp);
> + } else {
> + regs->sp -= sizeof(u32);
> + return put_user(ip, (u32 __user *)regs->sp);
> + }
I think it would be really helpful if you added the full explanation for
sizeof(u64) + 128 or -sizeof(u32) into this codepath or provide
constants. For folks not familiar with stuff like this it'll look like
magic numbers. :)
> +}
> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> index 9a402fdb60e9..3130232e6d0c 100644
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -20,12 +20,14 @@ enum rseq_cpu_id_state {
>
> enum rseq_flags {
> RSEQ_FLAG_UNREGISTER = (1 << 0),
> + RSEQ_FLAG_QUERY_ABORT_AT_IP = (1 << 1),
> };
>
> enum rseq_cs_flags_bit {
> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
> + RSEQ_CS_FLAG_ABORT_AT_IP_BIT = 3,
> };
>
> enum rseq_cs_flags {
> @@ -35,6 +37,8 @@ enum rseq_cs_flags {
> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
> + RSEQ_CS_FLAG_ABORT_AT_IP =
> + (1U << RSEQ_CS_FLAG_ABORT_AT_IP_BIT),
> };
>
> /*
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 6d45ac3dae7f..fb52f2d11b56 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -21,6 +21,13 @@
> #define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
>
> +#ifdef RSEQ_ARCH_HAS_ABORT_AT_IP
> +static bool rseq_has_abort_at_ip(void) { return true; }
> +#else
> +static bool rseq_has_abort_at_ip(void) { return false; }
> +static int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip) { return 0; }
> +#endif
> +
> /*
> *
> * Restartable sequences are a lightweight interface that allows
> @@ -79,6 +86,16 @@
> *
> * [abort_ip]
> * F1. <failure>
> + *
> + * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
> + * have the following behavior on abort: when the stack grows down: the
> + * stack pointer is decremented to skip the redzone, and decremented of
> + * the pointer size. The aborted address (abort-at-ip) is stored at
> + * this stack pointer location. The user-space abort handler needs to
> + * pop the abort-at-ip address from the stack, and add the redzone size
> + * to the stack pointer.
> + *
> + * TODO: describe stack grows up.
Is this intentional or did you forget? :)
----- On Jan 11, 2022, at 6:05 AM, Christian Brauner [email protected] wrote:
> On Mon, Jan 10, 2022 at 12:16:10PM -0500, Mathieu Desnoyers wrote:
>> Allow rseq critical section abort handlers to optionally figure out at
>> which instruction pointer the rseq critical section was aborted.
>>
>> This allows implementing rseq critical sections with loops containing
>> the commit instruction, for which having the commit as last instruction
>> of the sequence is not possible. This is useful to implement adaptative
>> mutexes aware of preemption in user-space. (see [1])
>>
>> This also allows implementing rseq critical sections with multiple
>> commit steps, and use the abort-at-ip information to figure out what
>> needs to be undone in the abort handler.
>>
>> Introduce the RSEQ_FLAG_QUERY_ABORT_AT_IP rseq system call flag. This
>> lets userspace query whether the kernel and architecture supports the
>> abort-at-ip rseq extension.
>>
>> Only provide this information for rseq critical sections for which the
>> rseq_cs descriptor has the RSEQ_CS_FLAG_ABORT_AT_IP flag set. Abort
>> handlers for critical sections with this flag set need to readjust the
>> stack pointer. The abort-at-ip pointer is populated by the kernel on
>> the top of stack on abort. For x86-32, the abort handler of an
>> abort-at-ip critical section needs to add 4 bytes to the stack pointer.
>> For x86-64, the abort hanler needs to add 136 bytes to the stack
>> pointer: 8 bytes to skip the abort-at-ip value, and 128 bytes to skip
>> the x86-64 redzone for leaf functions.
>>
>> [1]
>> https://github.com/compudj/rseq-test/blob/adapt-lock-abort-at-ip/test-rseq-adaptative-lock.c#L80
>>
>> Signed-off-by: Mathieu Desnoyers <[email protected]>
>> ---
>> Changes since v1:
>> - Use top of stack to place abort-at-ip value rather than ecx/rcx
>> register,
>> - Skip redzone on x86-64.
>> ---
>> arch/x86/include/asm/ptrace.h | 5 +++++
>> arch/x86/kernel/ptrace.c | 12 ++++++++++++
>> include/uapi/linux/rseq.h | 4 ++++
>> kernel/rseq.c | 28 ++++++++++++++++++++++++++++
>> 4 files changed, 49 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
>> index 703663175a5a..c96eb2448110 100644
>> --- a/arch/x86/include/asm/ptrace.h
>> +++ b/arch/x86/include/asm/ptrace.h
>> @@ -387,5 +387,10 @@ extern int do_set_thread_area(struct task_struct *p, int
>> idx,
>> # define do_set_thread_area_64(p, s, t) (0)
>> #endif
>>
>> +#ifdef CONFIG_RSEQ
>> +# define RSEQ_ARCH_HAS_ABORT_AT_IP
>> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip);
>> +#endif
>> +
>> #endif /* !__ASSEMBLY__ */
>> #endif /* _ASM_X86_PTRACE_H */
>> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
>> index 6d2244c94799..561ed98d12ba 100644
>> --- a/arch/x86/kernel/ptrace.c
>> +++ b/arch/x86/kernel/ptrace.c
>> @@ -1368,3 +1368,15 @@ void user_single_step_report(struct pt_regs *regs)
>> {
>> send_sigtrap(regs, 0, TRAP_BRKPT);
>> }
>> +
>> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
>> +{
>> + if (user_64bit_mode(regs)) {
>> + /* Need to skip redzone for leaf functions. */
>> + regs->sp -= sizeof(u64) + 128;
>> + return put_user(ip, (u64 __user *)regs->sp);
>> + } else {
>> + regs->sp -= sizeof(u32);
>> + return put_user(ip, (u32 __user *)regs->sp);
>> + }
>
> I think it would be really helpful if you added the full explanation for
> sizeof(u64) + 128 or -sizeof(u32) into this codepath or provide
> constants. For folks not familiar with stuff like this it'll look like
> magic numbers. :)
Good point, here is the planned update:
int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
{
if (user_64bit_mode(regs)) {
/*
* rseq abort-at-ip x86-64 ABI: stack pointer is decremented to
* skip the redzone (128 bytes on x86-64), and decremented of
* the pointer size (8 bytes). The aborted address (abort-at-ip)
* is stored at this updated stack pointer location (top of stack).
*
* Skipping the redzone is needed to make sure not to corrupt
* stack data when the rseq critical section is within a leaf
* function.
*/
regs->sp -= sizeof(u64) + 128;
return put_user(ip, (u64 __user *)regs->sp);
} else {
/*
* rseq abort-at-ip x86-32 ABI: stack pointer is decremented of
* the pointer size (4 bytes). The aborted address (abort-at-ip)
* is stored at this updated stack pointer location (top of stack).
*/
regs->sp -= sizeof(u32);
return put_user(ip, (u32 __user *)regs->sp);
}
}
>
>> +}
>> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
>> index 9a402fdb60e9..3130232e6d0c 100644
>> --- a/include/uapi/linux/rseq.h
>> +++ b/include/uapi/linux/rseq.h
>> @@ -20,12 +20,14 @@ enum rseq_cpu_id_state {
>>
>> enum rseq_flags {
>> RSEQ_FLAG_UNREGISTER = (1 << 0),
>> + RSEQ_FLAG_QUERY_ABORT_AT_IP = (1 << 1),
>> };
>>
>> enum rseq_cs_flags_bit {
>> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
>> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
>> + RSEQ_CS_FLAG_ABORT_AT_IP_BIT = 3,
>> };
>>
>> enum rseq_cs_flags {
>> @@ -35,6 +37,8 @@ enum rseq_cs_flags {
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
>> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
>> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
>> + RSEQ_CS_FLAG_ABORT_AT_IP =
>> + (1U << RSEQ_CS_FLAG_ABORT_AT_IP_BIT),
>> };
>>
>> /*
>> diff --git a/kernel/rseq.c b/kernel/rseq.c
>> index 6d45ac3dae7f..fb52f2d11b56 100644
>> --- a/kernel/rseq.c
>> +++ b/kernel/rseq.c
>> @@ -21,6 +21,13 @@
>> #define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
>> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
>>
>> +#ifdef RSEQ_ARCH_HAS_ABORT_AT_IP
>> +static bool rseq_has_abort_at_ip(void) { return true; }
>> +#else
>> +static bool rseq_has_abort_at_ip(void) { return false; }
>> +static int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip) { return 0;
>> }
>> +#endif
>> +
>> /*
>> *
>> * Restartable sequences are a lightweight interface that allows
>> @@ -79,6 +86,16 @@
>> *
>> * [abort_ip]
>> * F1. <failure>
>> + *
>> + * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
>> + * have the following behavior on abort: when the stack grows down: the
>> + * stack pointer is decremented to skip the redzone, and decremented of
>> + * the pointer size. The aborted address (abort-at-ip) is stored at
>> + * this stack pointer location. The user-space abort handler needs to
>> + * pop the abort-at-ip address from the stack, and add the redzone size
>> + * to the stack pointer.
>> + *
>> + * TODO: describe stack grows up.
>
> Is this intentional or did you forget? :)
Since I did not implement abort-at-ip on stack-grows-up architectures, I felt
it would be too early to describe the algorithm. I can simply remove the TODO
altogether and we'll take care of it when we get there ? If I had to try to
wordsmith it, it would look like e.g.:
* [...] When the stack grows up: the
* stack pointer is incremented to skip the redzone, and incremented of
* the pointer size. The aborted address (abort-at-ip) is stored immediately
* under this stack pointer location. The user-space abort handler needs to
* pop the abort-at-ip address from the stack, and subtract the redzone size
* from the stack pointer.
[ Please let me know if I got somehow confused in my understanding of stack grows
up architectures. ]
I'm also unsure whether any of the stack grows up architecture have redzones ?
From a quick grep for redzone in Linux arch/, only openrisc, powerpc64 and
x86-64 appear to have redzones.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Tue, Jan 11, 2022 at 12:43:05PM -0500, Mathieu Desnoyers wrote:
> ----- On Jan 11, 2022, at 6:05 AM, Christian Brauner [email protected] wrote:
>
> > On Mon, Jan 10, 2022 at 12:16:10PM -0500, Mathieu Desnoyers wrote:
> >> Allow rseq critical section abort handlers to optionally figure out at
> >> which instruction pointer the rseq critical section was aborted.
> >>
> >> This allows implementing rseq critical sections with loops containing
> >> the commit instruction, for which having the commit as last instruction
> >> of the sequence is not possible. This is useful to implement adaptative
> >> mutexes aware of preemption in user-space. (see [1])
> >>
> >> This also allows implementing rseq critical sections with multiple
> >> commit steps, and use the abort-at-ip information to figure out what
> >> needs to be undone in the abort handler.
> >>
> >> Introduce the RSEQ_FLAG_QUERY_ABORT_AT_IP rseq system call flag. This
> >> lets userspace query whether the kernel and architecture supports the
> >> abort-at-ip rseq extension.
> >>
> >> Only provide this information for rseq critical sections for which the
> >> rseq_cs descriptor has the RSEQ_CS_FLAG_ABORT_AT_IP flag set. Abort
> >> handlers for critical sections with this flag set need to readjust the
> >> stack pointer. The abort-at-ip pointer is populated by the kernel on
> >> the top of stack on abort. For x86-32, the abort handler of an
> >> abort-at-ip critical section needs to add 4 bytes to the stack pointer.
> >> For x86-64, the abort hanler needs to add 136 bytes to the stack
> >> pointer: 8 bytes to skip the abort-at-ip value, and 128 bytes to skip
> >> the x86-64 redzone for leaf functions.
> >>
> >> [1]
> >> https://github.com/compudj/rseq-test/blob/adapt-lock-abort-at-ip/test-rseq-adaptative-lock.c#L80
> >>
> >> Signed-off-by: Mathieu Desnoyers <[email protected]>
> >> ---
> >> Changes since v1:
> >> - Use top of stack to place abort-at-ip value rather than ecx/rcx
> >> register,
> >> - Skip redzone on x86-64.
> >> ---
> >> arch/x86/include/asm/ptrace.h | 5 +++++
> >> arch/x86/kernel/ptrace.c | 12 ++++++++++++
> >> include/uapi/linux/rseq.h | 4 ++++
> >> kernel/rseq.c | 28 ++++++++++++++++++++++++++++
> >> 4 files changed, 49 insertions(+)
> >>
> >> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> >> index 703663175a5a..c96eb2448110 100644
> >> --- a/arch/x86/include/asm/ptrace.h
> >> +++ b/arch/x86/include/asm/ptrace.h
> >> @@ -387,5 +387,10 @@ extern int do_set_thread_area(struct task_struct *p, int
> >> idx,
> >> # define do_set_thread_area_64(p, s, t) (0)
> >> #endif
> >>
> >> +#ifdef CONFIG_RSEQ
> >> +# define RSEQ_ARCH_HAS_ABORT_AT_IP
> >> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip);
> >> +#endif
> >> +
> >> #endif /* !__ASSEMBLY__ */
> >> #endif /* _ASM_X86_PTRACE_H */
> >> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
> >> index 6d2244c94799..561ed98d12ba 100644
> >> --- a/arch/x86/kernel/ptrace.c
> >> +++ b/arch/x86/kernel/ptrace.c
> >> @@ -1368,3 +1368,15 @@ void user_single_step_report(struct pt_regs *regs)
> >> {
> >> send_sigtrap(regs, 0, TRAP_BRKPT);
> >> }
> >> +
> >> +int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
> >> +{
> >> + if (user_64bit_mode(regs)) {
> >> + /* Need to skip redzone for leaf functions. */
> >> + regs->sp -= sizeof(u64) + 128;
> >> + return put_user(ip, (u64 __user *)regs->sp);
> >> + } else {
> >> + regs->sp -= sizeof(u32);
> >> + return put_user(ip, (u32 __user *)regs->sp);
> >> + }
> >
> > I think it would be really helpful if you added the full explanation for
> > sizeof(u64) + 128 or -sizeof(u32) into this codepath or provide
> > constants. For folks not familiar with stuff like this it'll look like
> > magic numbers. :)
>
> Good point, here is the planned update:
That's great, thanks!
>
> int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip)
> {
> if (user_64bit_mode(regs)) {
> /*
> * rseq abort-at-ip x86-64 ABI: stack pointer is decremented to
> * skip the redzone (128 bytes on x86-64), and decremented of
> * the pointer size (8 bytes). The aborted address (abort-at-ip)
> * is stored at this updated stack pointer location (top of stack).
> *
> * Skipping the redzone is needed to make sure not to corrupt
> * stack data when the rseq critical section is within a leaf
> * function.
> */
> regs->sp -= sizeof(u64) + 128;
> return put_user(ip, (u64 __user *)regs->sp);
> } else {
> /*
> * rseq abort-at-ip x86-32 ABI: stack pointer is decremented of
> * the pointer size (4 bytes). The aborted address (abort-at-ip)
> * is stored at this updated stack pointer location (top of stack).
> */
> regs->sp -= sizeof(u32);
> return put_user(ip, (u32 __user *)regs->sp);
> }
> }
>
>
>
> >
> >> +}
> >> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> >> index 9a402fdb60e9..3130232e6d0c 100644
> >> --- a/include/uapi/linux/rseq.h
> >> +++ b/include/uapi/linux/rseq.h
> >> @@ -20,12 +20,14 @@ enum rseq_cpu_id_state {
> >>
> >> enum rseq_flags {
> >> RSEQ_FLAG_UNREGISTER = (1 << 0),
> >> + RSEQ_FLAG_QUERY_ABORT_AT_IP = (1 << 1),
> >> };
> >>
> >> enum rseq_cs_flags_bit {
> >> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
> >> RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
> >> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
> >> + RSEQ_CS_FLAG_ABORT_AT_IP_BIT = 3,
> >> };
> >>
> >> enum rseq_cs_flags {
> >> @@ -35,6 +37,8 @@ enum rseq_cs_flags {
> >> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
> >> RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
> >> (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
> >> + RSEQ_CS_FLAG_ABORT_AT_IP =
> >> + (1U << RSEQ_CS_FLAG_ABORT_AT_IP_BIT),
> >> };
> >>
> >> /*
> >> diff --git a/kernel/rseq.c b/kernel/rseq.c
> >> index 6d45ac3dae7f..fb52f2d11b56 100644
> >> --- a/kernel/rseq.c
> >> +++ b/kernel/rseq.c
> >> @@ -21,6 +21,13 @@
> >> #define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
> >> RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
> >>
> >> +#ifdef RSEQ_ARCH_HAS_ABORT_AT_IP
> >> +static bool rseq_has_abort_at_ip(void) { return true; }
> >> +#else
> >> +static bool rseq_has_abort_at_ip(void) { return false; }
> >> +static int rseq_abort_at_ip(struct pt_regs *regs, unsigned long ip) { return 0;
> >> }
> >> +#endif
> >> +
> >> /*
> >> *
> >> * Restartable sequences are a lightweight interface that allows
> >> @@ -79,6 +86,16 @@
> >> *
> >> * [abort_ip]
> >> * F1. <failure>
> >> + *
> >> + * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
> >> + * have the following behavior on abort: when the stack grows down: the
> >> + * stack pointer is decremented to skip the redzone, and decremented of
> >> + * the pointer size. The aborted address (abort-at-ip) is stored at
> >> + * this stack pointer location. The user-space abort handler needs to
> >> + * pop the abort-at-ip address from the stack, and add the redzone size
> >> + * to the stack pointer.
> >> + *
> >> + * TODO: describe stack grows up.
> >
> > Is this intentional or did you forget? :)
>
> Since I did not implement abort-at-ip on stack-grows-up architectures, I felt
> it would be too early to describe the algorithm. I can simply remove the TODO
> altogether and we'll take care of it when we get there ? If I had to try to
> wordsmith it, it would look like e.g.:
>
> * [...] When the stack grows up: the
> * stack pointer is incremented to skip the redzone, and incremented of
> * the pointer size. The aborted address (abort-at-ip) is stored immediately
> * under this stack pointer location. The user-space abort handler needs to
> * pop the abort-at-ip address from the stack, and subtract the redzone size
> * from the stack pointer.
>
> [ Please let me know if I got somehow confused in my understanding of stack grows
> up architectures. ]
>
> I'm also unsure whether any of the stack grows up architecture have redzones ?
I don't think so? From when I last touched that piece of arch code when
massaging copy_thread() I only remember parisc as having an upwards
growing stack.
> From a quick grep for redzone in Linux arch/, only openrisc, powerpc64 and
> x86-64 appear to have redzones.
----- On Jan 12, 2022, at 3:46 AM, Christian Brauner [email protected] wrote:
> On Tue, Jan 11, 2022 at 12:43:05PM -0500, Mathieu Desnoyers wrote:
[...]
>> >> + *
>> >> + * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
>> >> + * have the following behavior on abort: when the stack grows down: the
>> >> + * stack pointer is decremented to skip the redzone, and decremented of
>> >> + * the pointer size. The aborted address (abort-at-ip) is stored at
>> >> + * this stack pointer location. The user-space abort handler needs to
>> >> + * pop the abort-at-ip address from the stack, and add the redzone size
>> >> + * to the stack pointer.
>> >> + *
>> >> + * TODO: describe stack grows up.
>> >
>> > Is this intentional or did you forget? :)
>>
>> Since I did not implement abort-at-ip on stack-grows-up architectures, I felt
>> it would be too early to describe the algorithm. I can simply remove the TODO
>> altogether and we'll take care of it when we get there ? If I had to try to
>> wordsmith it, it would look like e.g.:
>>
>> * [...] When the stack grows up: the
>> * stack pointer is incremented to skip the redzone, and incremented of
>> * the pointer size. The aborted address (abort-at-ip) is stored immediately
>> * under this stack pointer location. The user-space abort handler needs to
>> * pop the abort-at-ip address from the stack, and subtract the redzone size
>> * from the stack pointer.
>>
>> [ Please let me know if I got somehow confused in my understanding of stack
>> grows
>> up architectures. ]
>>
>> I'm also unsure whether any of the stack grows up architecture have redzones ?
>
> I don't think so? From when I last touched that piece of arch code when
> massaging copy_thread() I only remember parisc as having an upwards
> growing stack.
>
>> From a quick grep for redzone in Linux arch/, only openrisc, powerpc64 and
> > x86-64 appear to have redzones.
I figured it was kind of silly to special-case arch-agnostic comments for stack
grows up/down, how about the following instead ?
* rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
* have the following behavior on abort: the stack pointer is adjusted to
* skip over the redzone [*], and the aborted address (abort-at-ip) is pushed
* at this stack pointer location. The user-space abort handler needs to pop
* the abort-at-ip address from the stack, and adjust the stack pointer to skip
* back over the redzone.
*
* [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
* stack area beyond the stack pointer which can be used by the compiler
* to store local variables in leaf functions.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Wed, Jan 12, 2022 at 09:47:29AM -0500, Mathieu Desnoyers wrote:
> ----- On Jan 12, 2022, at 3:46 AM, Christian Brauner [email protected] wrote:
>
> > On Tue, Jan 11, 2022 at 12:43:05PM -0500, Mathieu Desnoyers wrote:
> [...]
> >> >> + *
> >> >> + * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
> >> >> + * have the following behavior on abort: when the stack grows down: the
> >> >> + * stack pointer is decremented to skip the redzone, and decremented of
> >> >> + * the pointer size. The aborted address (abort-at-ip) is stored at
> >> >> + * this stack pointer location. The user-space abort handler needs to
> >> >> + * pop the abort-at-ip address from the stack, and add the redzone size
> >> >> + * to the stack pointer.
> >> >> + *
> >> >> + * TODO: describe stack grows up.
> >> >
> >> > Is this intentional or did you forget? :)
> >>
> >> Since I did not implement abort-at-ip on stack-grows-up architectures, I felt
> >> it would be too early to describe the algorithm. I can simply remove the TODO
> >> altogether and we'll take care of it when we get there ? If I had to try to
> >> wordsmith it, it would look like e.g.:
> >>
> >> * [...] When the stack grows up: the
> >> * stack pointer is incremented to skip the redzone, and incremented of
> >> * the pointer size. The aborted address (abort-at-ip) is stored immediately
> >> * under this stack pointer location. The user-space abort handler needs to
> >> * pop the abort-at-ip address from the stack, and subtract the redzone size
> >> * from the stack pointer.
> >>
> >> [ Please let me know if I got somehow confused in my understanding of stack
> >> grows
> >> up architectures. ]
> >>
> >> I'm also unsure whether any of the stack grows up architecture have redzones ?
> >
> > I don't think so? From when I last touched that piece of arch code when
> > massaging copy_thread() I only remember parisc as having an upwards
> > growing stack.
> >
> >> From a quick grep for redzone in Linux arch/, only openrisc, powerpc64 and
> > > x86-64 appear to have redzones.
>
> I figured it was kind of silly to special-case arch-agnostic comments for stack
> grows up/down, how about the following instead ?
>
> * rseq critical sections defined with the RSEQ_CS_FLAG_ABORT_AT_IP flag
> * have the following behavior on abort: the stack pointer is adjusted to
> * skip over the redzone [*], and the aborted address (abort-at-ip) is pushed
> * at this stack pointer location. The user-space abort handler needs to pop
> * the abort-at-ip address from the stack, and adjust the stack pointer to skip
> * back over the redzone.
> *
> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
> * stack area beyond the stack pointer which can be used by the compiler
> * to store local variables in leaf functions.
Sounds good to me.
> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
> * stack area beyond the stack pointer which can be used by the compiler
> * to store local variables in leaf functions.
I wonder if that is really worth the trouble it causes!
By the time a function is spilling values to stack the cost
of a %sp update is almost certainly noise.
Someone clearly thought it was a 'good idea (tm)'.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
----- On Jan 12, 2022, at 9:58 AM, David Laight [email protected] wrote:
>> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
>> * stack area beyond the stack pointer which can be used by the compiler
>> * to store local variables in leaf functions.
>
> I wonder if that is really worth the trouble it causes!
> By the time a function is spilling values to stack the cost
> of a %sp update is almost certainly noise.
>
> Someone clearly thought it was a 'good idea (tm)'.
I must admit that I've been surprised to learn about these redzones. Thanks for
pointing them out to me, it was clearly a blind spot. I suspect it would be useful
to introduce per-architecture KERNEL_REDZONE, USER_REDZONE and COMPAT_USER_REDZONE
with a asm-generic version defining them to 0, with proper documentation. It would
make it clearer to kernel developers working on stuff similar to signal handler
delivery that they need to consider these carefully.
Mathieu
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT,
> UK
> Registration No: 1397386 (Wales)
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
From: Mathieu Desnoyers
> Sent: 12 January 2022 15:06
>
> ----- On Jan 12, 2022, at 9:58 AM, David Laight [email protected] wrote:
>
> >> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
> >> * stack area beyond the stack pointer which can be used by the compiler
> >> * to store local variables in leaf functions.
> >
> > I wonder if that is really worth the trouble it causes!
> > By the time a function is spilling values to stack the cost
> > of a %sp update is almost certainly noise.
> >
> > Someone clearly thought it was a 'good idea (tm)'.
>
> I must admit that I've been surprised to learn about these redzones. Thanks for
> pointing them out to me, it was clearly a blind spot. I suspect it would be useful
> to introduce per-architecture KERNEL_REDZONE, USER_REDZONE and COMPAT_USER_REDZONE
> with a asm-generic version defining them to 0, with proper documentation. It would
> make it clearer to kernel developers working on stuff similar to signal handler
> delivery that they need to consider these carefully.
They can never be used in kernel - any ISR would overwrite them.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
----- On Jan 12, 2022, at 10:15 AM, David Laight [email protected] wrote:
> From: Mathieu Desnoyers
>> Sent: 12 January 2022 15:06
>>
>> ----- On Jan 12, 2022, at 9:58 AM, David Laight [email protected] wrote:
>>
>> >> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
>> >> * stack area beyond the stack pointer which can be used by the compiler
>> >> * to store local variables in leaf functions.
>> >
>> > I wonder if that is really worth the trouble it causes!
>> > By the time a function is spilling values to stack the cost
>> > of a %sp update is almost certainly noise.
>> >
>> > Someone clearly thought it was a 'good idea (tm)'.
>>
>> I must admit that I've been surprised to learn about these redzones. Thanks for
>> pointing them out to me, it was clearly a blind spot. I suspect it would be
>> useful
>> to introduce per-architecture KERNEL_REDZONE, USER_REDZONE and
>> COMPAT_USER_REDZONE
>> with a asm-generic version defining them to 0, with proper documentation. It
>> would
>> make it clearer to kernel developers working on stuff similar to signal handler
>> delivery that they need to consider these carefully.
>
> They can never be used in kernel - any ISR would overwrite them.
arch/powerpc/include/asm/ptrace.h define those for ppc64:
113:#define USER_REDZONE_SIZE 512
114:#define KERNEL_REDZONE_SIZE 288
and then uses the kernel redzone size for:
#define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + \
STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE)
which AFAIU should ensure that ISR don't overwrite the redzone within the kernel.
Thanks,
Mathieu
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT,
> UK
> Registration No: 1397386 (Wales)
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Wed, Jan 12, 2022 at 03:15:27PM +0000, David Laight wrote:
> From: Mathieu Desnoyers
> > Sent: 12 January 2022 15:06
> >
> > ----- On Jan 12, 2022, at 9:58 AM, David Laight [email protected] wrote:
> >
> > >> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
> > >> * stack area beyond the stack pointer which can be used by the compiler
> > >> * to store local variables in leaf functions.
> > >
> > > I wonder if that is really worth the trouble it causes!
> > > By the time a function is spilling values to stack the cost
> > > of a %sp update is almost certainly noise.
> > >
> > > Someone clearly thought it was a 'good idea (tm)'.
> >
> > I must admit that I've been surprised to learn about these redzones. Thanks for
> > pointing them out to me, it was clearly a blind spot. I suspect it would be useful
> > to introduce per-architecture KERNEL_REDZONE, USER_REDZONE and COMPAT_USER_REDZONE
> > with a asm-generic version defining them to 0, with proper documentation. It would
> > make it clearer to kernel developers working on stuff similar to signal handler
> > delivery that they need to consider these carefully.
>
> They can never be used in kernel - any ISR would overwrite them.
That depends on how the architecture does exceptions; also consider:
https://www.intel.com/content/www/us/en/develop/download/flexible-return-and-event-delivery-specification.html
From: Peter Zijlstra
> Sent: 12 January 2022 15:34
>
> On Wed, Jan 12, 2022 at 03:15:27PM +0000, David Laight wrote:
> > From: Mathieu Desnoyers
> > > Sent: 12 January 2022 15:06
> > >
> > > ----- On Jan 12, 2022, at 9:58 AM, David Laight [email protected] wrote:
> > >
> > > >> * [*] The openrisc, powerpc64 and x86-64 architectures define a "redzone" as a
> > > >> * stack area beyond the stack pointer which can be used by the compiler
> > > >> * to store local variables in leaf functions.
> > > >
> > > > I wonder if that is really worth the trouble it causes!
> > > > By the time a function is spilling values to stack the cost
> > > > of a %sp update is almost certainly noise.
> > > >
> > > > Someone clearly thought it was a 'good idea (tm)'.
> > >
> > > I must admit that I've been surprised to learn about these redzones. Thanks for
> > > pointing them out to me, it was clearly a blind spot. I suspect it would be useful
> > > to introduce per-architecture KERNEL_REDZONE, USER_REDZONE and COMPAT_USER_REDZONE
> > > with a asm-generic version defining them to 0, with proper documentation. It would
> > > make it clearer to kernel developers working on stuff similar to signal handler
> > > delivery that they need to consider these carefully.
> >
> > They can never be used in kernel - any ISR would overwrite them.
>
> That depends on how the architecture does exceptions;
True, many newer ones don't actually write anything to the stack.
Makes the cpu simpler.
> also consider:
>
> https://www.intel.com/content/www/us/en/develop/download/flexible-return-and-event-delivery-
> specification.html
That contains the snippet:
The SWAPGS instruction supports efficient updates of the GS base address.
Which is just so horribly not true...
Even FRED is always doing a GS swap - so you can easily lose the kernel GS value.
I remember fixing all the 'in kernel' faults in the netbsd x86-64 return to user path.
Entirely horrid...
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)