2016-12-19 19:12:50

by Andy Lutomirski

[permalink] [raw]
Subject: [PATCH] selftests/x86: Add a selftest for SYSRET to noncanonical addresses

SYSRET to a noncanonical address will blow up on Intel CPUs. Linux
needs to prevent this from happening in two major cases, and the
criteria will become more complicated when support for larger virtual
address spaces is added.

A fast-path SYSCALL will fallthrough to the following instruction
using SYSRET without any particular checking. To prevent fallthrough
to a noncanonical address, Linux prevents the highest canonical page
from being mapped. This test case checks a variety of possible maximum
addresses to make sure that either we can't map code there or that
SYSCALL fallthrough works.

A slow-path system call can return anywhere. Linux needs to make sure
that, if the return address is non-canonical, it won't use SYSRET.
This test cases causes sigreturn() to return to a variety of addresses
(with RCX == RIP) and makes sure that nothing explodes.

Some of this code comes from Kirill Shutemov.

Cc: "Kirill A. Shutemov" <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
tools/testing/selftests/x86/Makefile | 2 +-
tools/testing/selftests/x86/sysret_rip.c | 196 +++++++++++++++++++++++++++++++
2 files changed, 197 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/x86/sysret_rip.c

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 8c1cb423cfe6..25d4067c11e4 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -10,7 +10,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip

TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
new file mode 100644
index 000000000000..3aa7d27c2d42
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_rip.c
@@ -0,0 +1,196 @@
+/*
+ * sigreturn.c - tests that x86 avoids Intel SYSRET pitfalls
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <sys/syscall.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <setjmp.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <assert.h>
+
+
+asm (
+ ".pushsection \".text\", \"ax\"\n\t"
+ ".balign 4096\n\t"
+ "test_page: .globl test_page\n\t"
+ ".fill 4094,1,0xcc\n\t"
+ "test_syscall_insn:\n\t"
+ "syscall\n\t"
+ ".ifne . - test_page - 4096\n\t"
+ ".error \"test page is not one page long\"\n\t"
+ ".endif\n\t"
+ ".popsection"
+ );
+
+extern const char test_page[];
+static void const *current_test_page_addr = test_page;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs;
+
+static volatile unsigned long rip;
+
+static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+ printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n",
+ rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+ fflush(stdout);
+ _exit(1);
+ }
+
+ memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+ printf("[OK]\tGot SIGSEGV at RIP=0x%lx\n", rip);
+}
+
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+ /* Set IP and CX to match so that SYSRET can happen. */
+ ctx->uc_mcontext.gregs[REG_RIP] = rip;
+ ctx->uc_mcontext.gregs[REG_RCX] = rip;
+
+ /* R11 and EFLAGS should already match. */
+ assert(ctx->uc_mcontext.gregs[REG_EFL] ==
+ ctx->uc_mcontext.gregs[REG_R11]);
+
+ sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
+
+ return;
+}
+
+static void test_sigreturn_to(unsigned long ip)
+{
+ rip = ip;
+ printf("[RUN]\tsigreturn to 0x%lx\n", ip);
+ raise(SIGUSR1);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+ printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n",
+ rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+ fflush(stdout);
+ _exit(1);
+ }
+
+ siglongjmp(jmpbuf, 1);
+}
+
+static void test_syscall_fallthrough_to(unsigned long ip)
+{
+ void *new_address = (void *)(ip - 4096);
+ void *ret;
+
+ printf("[RUN]\tTrying a SYSCALL that falls through to 0x%lx\n", ip);
+
+ ret = mremap((void *)current_test_page_addr, 4096, 4096,
+ MREMAP_MAYMOVE | MREMAP_FIXED, new_address);
+ if (ret == MAP_FAILED) {
+ if (ip <= (1UL << 47) - PAGE_SIZE) {
+ err(1, "mremap to 0x%p", new_address);
+ } else {
+ printf("[OK]\tmremap to 0x%p failed\n", new_address);
+ return;
+ }
+ }
+
+ if (ret != new_address)
+ errx(1, "mremap malfunctioned: asked for %p but got %p\n",
+ new_address, ret);
+
+ current_test_page_addr = new_address;
+ rip = ip;
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ asm volatile ("call *%[syscall_insn]" :: "a" (SYS_getpid),
+ [syscall_insn] "rm" (ip - 2));
+ errx(1, "[FAIL]\tSyscall trampoline returned");
+ }
+
+ printf("[OK]\tWe survived\n");
+}
+
+int main()
+{
+ /*
+ * When the kernel returns from a slow-path syscall, it will
+ * detect whether SYSRET is appropriate. If it incorrectly
+ * thinks that SYSRET is appropriate when RIP is noncanonical,
+ * it'll crash on Intel CPUs.
+ */
+ sethandler(SIGUSR1, sigusr1, 0);
+ for (int i = 47; i < 64; i++)
+ test_sigreturn_to((1UL<<i));
+
+ clearhandler(SIGUSR1);
+
+ sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
+
+ /* This should execute on all kernels. */
+ test_syscall_fallthrough_to((1UL << 47) - PAGE_SIZE);
+
+ /* Make sure that we didn't screw up the mremap logic. */
+ test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
+
+ /* These are the interesting cases. */
+ for (int i = 47; i < 64; i++)
+ test_syscall_fallthrough_to((1UL<<i));
+
+ return 0;
+}
--
2.9.3


2016-12-19 23:30:43

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH] selftests/x86: Add a selftest for SYSRET to noncanonical addresses

On Mon, Dec 19, 2016 at 11:12:42AM -0800, Andy Lutomirski wrote:
> SYSRET to a noncanonical address will blow up on Intel CPUs. Linux
> needs to prevent this from happening in two major cases, and the
> criteria will become more complicated when support for larger virtual
> address spaces is added.
>
> A fast-path SYSCALL will fallthrough to the following instruction
> using SYSRET without any particular checking. To prevent fallthrough
> to a noncanonical address, Linux prevents the highest canonical page
> from being mapped. This test case checks a variety of possible maximum
> addresses to make sure that either we can't map code there or that
> SYSCALL fallthrough works.
>
> A slow-path system call can return anywhere. Linux needs to make sure
> that, if the return address is non-canonical, it won't use SYSRET.
> This test cases causes sigreturn() to return to a variety of addresses
> (with RCX == RIP) and makes sure that nothing explodes.
>
> Some of this code comes from Kirill Shutemov.
>
> Cc: "Kirill A. Shutemov" <[email protected]>
> Signed-off-by: Andy Lutomirski <[email protected]>

Thanks a lot.

That's what I've got with 5-level paging:

[RUN] sigreturn to 0x800000000000
[OK] Got SIGSEGV at RIP=0x800000000000
[RUN] sigreturn to 0x1000000000000
[OK] Got SIGSEGV at RIP=0x1000000000000
[RUN] sigreturn to 0x2000000000000
[OK] Got SIGSEGV at RIP=0x2000000000000
[RUN] sigreturn to 0x4000000000000
[OK] Got SIGSEGV at RIP=0x4000000000000
[RUN] sigreturn to 0x8000000000000
[OK] Got SIGSEGV at RIP=0x8000000000000
[RUN] sigreturn to 0x10000000000000
[OK] Got SIGSEGV at RIP=0x10000000000000
[RUN] sigreturn to 0x20000000000000
[OK] Got SIGSEGV at RIP=0x20000000000000
[RUN] sigreturn to 0x40000000000000
[OK] Got SIGSEGV at RIP=0x40000000000000
[RUN] sigreturn to 0x80000000000000
[OK] Got SIGSEGV at RIP=0x80000000000000
[RUN] sigreturn to 0x100000000000000
[OK] Got SIGSEGV at RIP=0x100000000000000
[RUN] sigreturn to 0x200000000000000
[OK] Got SIGSEGV at RIP=0x200000000000000
[RUN] sigreturn to 0x400000000000000
[OK] Got SIGSEGV at RIP=0x400000000000000
[RUN] sigreturn to 0x800000000000000
[OK] Got SIGSEGV at RIP=0x800000000000000
[RUN] sigreturn to 0x1000000000000000
[OK] Got SIGSEGV at RIP=0x1000000000000000
[RUN] sigreturn to 0x2000000000000000
[OK] Got SIGSEGV at RIP=0x2000000000000000
[RUN] sigreturn to 0x4000000000000000
[OK] Got SIGSEGV at RIP=0x4000000000000000
[RUN] sigreturn to 0x8000000000000000
[OK] Got SIGSEGV at RIP=0x8000000000000000
[RUN] Trying a SYSCALL that falls through to 0x7ffffffff000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x7fffffffe000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x800000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x1000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x2000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x4000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x8000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x10000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x20000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x40000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x80000000000000
[OK] We survived
[RUN] Trying a SYSCALL that falls through to 0x100000000000000
[OK] mremap to 0x0xfffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x200000000000000
[OK] mremap to 0x0x1fffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x400000000000000
[OK] mremap to 0x0x3fffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x800000000000000
[OK] mremap to 0x0x7fffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x1000000000000000
[OK] mremap to 0x0xffffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x2000000000000000
[OK] mremap to 0x0x1ffffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x4000000000000000
[OK] mremap to 0x0x3ffffffffffff000 failed
[RUN] Trying a SYSCALL that falls through to 0x8000000000000000
[OK] mremap to 0x0x7ffffffffffff000 failed

> ---
> tools/testing/selftests/x86/Makefile | 2 +-
> tools/testing/selftests/x86/sysret_rip.c | 196 +++++++++++++++++++++++++++++++
> 2 files changed, 197 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/x86/sysret_rip.c
>
> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
> index 8c1cb423cfe6..25d4067c11e4 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -10,7 +10,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc
> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
> test_FCMOV test_FCOMI test_FISTTP \
> vdso_restorer
> -TARGETS_C_64BIT_ONLY := fsgsbase
> +TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
>
> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
> TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
> diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
> new file mode 100644
> index 000000000000..3aa7d27c2d42
> --- /dev/null
> +++ b/tools/testing/selftests/x86/sysret_rip.c
> @@ -0,0 +1,196 @@
> +/*
> + * sigreturn.c - tests that x86 avoids Intel SYSRET pitfalls
> + * Copyright (c) 2014-2016 Andrew Lutomirski
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + */
> +
> +#define _GNU_SOURCE
> +
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/signal.h>
> +#include <sys/ucontext.h>
> +#include <sys/syscall.h>
> +#include <err.h>
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <setjmp.h>
> +#include <sys/user.h>
> +#include <sys/mman.h>
> +#include <assert.h>
> +
> +
> +asm (
> + ".pushsection \".text\", \"ax\"\n\t"
> + ".balign 4096\n\t"
> + "test_page: .globl test_page\n\t"
> + ".fill 4094,1,0xcc\n\t"
> + "test_syscall_insn:\n\t"
> + "syscall\n\t"
> + ".ifne . - test_page - 4096\n\t"
> + ".error \"test page is not one page long\"\n\t"
> + ".endif\n\t"
> + ".popsection"
> + );
> +
> +extern const char test_page[];
> +static void const *current_test_page_addr = test_page;
> +
> +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
> + int flags)
> +{
> + struct sigaction sa;
> + memset(&sa, 0, sizeof(sa));
> + sa.sa_sigaction = handler;
> + sa.sa_flags = SA_SIGINFO | flags;
> + sigemptyset(&sa.sa_mask);
> + if (sigaction(sig, &sa, 0))
> + err(1, "sigaction");
> +}
> +
> +static void clearhandler(int sig)
> +{
> + struct sigaction sa;
> + memset(&sa, 0, sizeof(sa));
> + sa.sa_handler = SIG_DFL;
> + sigemptyset(&sa.sa_mask);
> + if (sigaction(sig, &sa, 0))
> + err(1, "sigaction");
> +}
> +
> +/* State used by our signal handlers. */
> +static gregset_t initial_regs;
> +
> +static volatile unsigned long rip;
> +
> +static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
> +{
> + ucontext_t *ctx = (ucontext_t*)ctx_void;
> +
> + if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
> + printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n",
> + rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
> + fflush(stdout);
> + _exit(1);
> + }
> +
> + memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
> +
> + printf("[OK]\tGot SIGSEGV at RIP=0x%lx\n", rip);
> +}
> +
> +static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
> +{
> + ucontext_t *ctx = (ucontext_t*)ctx_void;
> +
> + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
> +
> + /* Set IP and CX to match so that SYSRET can happen. */
> + ctx->uc_mcontext.gregs[REG_RIP] = rip;
> + ctx->uc_mcontext.gregs[REG_RCX] = rip;
> +
> + /* R11 and EFLAGS should already match. */
> + assert(ctx->uc_mcontext.gregs[REG_EFL] ==
> + ctx->uc_mcontext.gregs[REG_R11]);
> +
> + sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
> +
> + return;
> +}
> +
> +static void test_sigreturn_to(unsigned long ip)
> +{
> + rip = ip;
> + printf("[RUN]\tsigreturn to 0x%lx\n", ip);
> + raise(SIGUSR1);
> +}
> +
> +static jmp_buf jmpbuf;
> +
> +static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void)
> +{
> + ucontext_t *ctx = (ucontext_t*)ctx_void;
> +
> + if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
> + printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n",
> + rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
> + fflush(stdout);
> + _exit(1);
> + }
> +
> + siglongjmp(jmpbuf, 1);
> +}
> +
> +static void test_syscall_fallthrough_to(unsigned long ip)
> +{
> + void *new_address = (void *)(ip - 4096);
> + void *ret;
> +
> + printf("[RUN]\tTrying a SYSCALL that falls through to 0x%lx\n", ip);
> +
> + ret = mremap((void *)current_test_page_addr, 4096, 4096,
> + MREMAP_MAYMOVE | MREMAP_FIXED, new_address);
> + if (ret == MAP_FAILED) {
> + if (ip <= (1UL << 47) - PAGE_SIZE) {
> + err(1, "mremap to 0x%p", new_address);

"0x" is redundant for %p.

> + } else {
> + printf("[OK]\tmremap to 0x%p failed\n", new_address);

Ditto.

> + return;
> + }
> + }
> +
> + if (ret != new_address)
> + errx(1, "mremap malfunctioned: asked for %p but got %p\n",
> + new_address, ret);
> +
> + current_test_page_addr = new_address;
> + rip = ip;
> +
> + if (sigsetjmp(jmpbuf, 1) == 0) {
> + asm volatile ("call *%[syscall_insn]" :: "a" (SYS_getpid),
> + [syscall_insn] "rm" (ip - 2));
> + errx(1, "[FAIL]\tSyscall trampoline returned");
> + }
> +
> + printf("[OK]\tWe survived\n");
> +}
> +
> +int main()
> +{
> + /*
> + * When the kernel returns from a slow-path syscall, it will
> + * detect whether SYSRET is appropriate. If it incorrectly
> + * thinks that SYSRET is appropriate when RIP is noncanonical,
> + * it'll crash on Intel CPUs.
> + */
> + sethandler(SIGUSR1, sigusr1, 0);
> + for (int i = 47; i < 64; i++)
> + test_sigreturn_to((1UL<<i));

Redundant parenthesis?

> +
> + clearhandler(SIGUSR1);
> +
> + sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
> +
> + /* This should execute on all kernels. */
> + test_syscall_fallthrough_to((1UL << 47) - PAGE_SIZE);
> +
> + /* Make sure that we didn't screw up the mremap logic. */
> + test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
> +
> + /* These are the interesting cases. */
> + for (int i = 47; i < 64; i++)
> + test_syscall_fallthrough_to((1UL<<i));

Ditto.

Also, "(1UL << i) - PAGE_SIZE" is interesting too. TASK_SIZE for 5-level
paging would be (1UL << 56) - PAGE_SIZE. I would be better to catch both
corner cases.

> +
> + return 0;
> +}
> --
> 2.9.3
>

--
Kirill A. Shutemov

2016-12-19 23:38:08

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] selftests/x86: Add a selftest for SYSRET to noncanonical addresses

On Mon, Dec 19, 2016 at 3:30 PM, Kirill A. Shutemov
<[email protected]> wrote:
> On Mon, Dec 19, 2016 at 11:12:42AM -0800, Andy Lutomirski wrote:
>> SYSRET to a noncanonical address will blow up on Intel CPUs. Linux
>> needs to prevent this from happening in two major cases, and the
>> criteria will become more complicated when support for larger virtual
>> address spaces is added.
>>
>> A fast-path SYSCALL will fallthrough to the following instruction
>> using SYSRET without any particular checking. To prevent fallthrough
>> to a noncanonical address, Linux prevents the highest canonical page
>> from being mapped. This test case checks a variety of possible maximum
>> addresses to make sure that either we can't map code there or that
>> SYSCALL fallthrough works.
>>
>> A slow-path system call can return anywhere. Linux needs to make sure
>> that, if the return address is non-canonical, it won't use SYSRET.
>> This test cases causes sigreturn() to return to a variety of addresses
>> (with RCX == RIP) and makes sure that nothing explodes.
>>
>> Some of this code comes from Kirill Shutemov.
>>
>> Cc: "Kirill A. Shutemov" <[email protected]>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>
> Thanks a lot.
>
> That's what I've got with 5-level paging:

[...]

Looks good.

>> + err(1, "mremap to 0x%p", new_address);
>
> "0x" is redundant for %p.
>
>> + } else {
>> + printf("[OK]\tmremap to 0x%p failed\n", new_address);
>
> Ditto.

Will fix.

>> + test_sigreturn_to((1UL<<i));
>
> Redundant parenthesis?

Indeed.

>
>> +
>> + clearhandler(SIGUSR1);
>> +
>> + sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
>> +
>> + /* This should execute on all kernels. */
>> + test_syscall_fallthrough_to((1UL << 47) - PAGE_SIZE);
>> +
>> + /* Make sure that we didn't screw up the mremap logic. */
>> + test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
>> +
>> + /* These are the interesting cases. */
>> + for (int i = 47; i < 64; i++)
>> + test_syscall_fallthrough_to((1UL<<i));
>
> Ditto.
>
> Also, "(1UL << i) - PAGE_SIZE" is interesting too. TASK_SIZE for 5-level
> paging would be (1UL << 56) - PAGE_SIZE. I would be better to catch both
> corner cases.

There's not much scope for error, though -- (1UL << 56) - PAGE_SIZE
isn't really any different from any other address. I all add it, but
I'm not sure I see any way that the kernel could plausible get it
wrong. I guess it's comforting to see the boundary, though.

--Andy